From 916100baf16f3438acc000aaa70bf8de40cfb56e Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 4 Jan 2023 15:17:42 -0800 Subject: [PATCH 001/442] Initial fix --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 246 ++++++++++++++++-- .../KokkosSparse_spiluk_symbolic_impl.hpp | 33 ++- sparse/src/KokkosSparse_spiluk_handle.hpp | 12 + 3 files changed, 273 insertions(+), 18 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 4af8606dfb..61fbc47ec1 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -59,6 +59,9 @@ namespace Impl { namespace Experimental { // struct UnsortedTag {}; +struct TP1PopulateTag {}; +struct TP1EliminateTag {}; +struct TP1ResetTag {}; template + //class LevelViewType, class WorkViewType, class nnz_lno_t> + class LevelViewType, class LevelListViewType, class WorkViewType, class nnz_lno_t> struct ILUKLvlSchedTP1NumericFunctor { using execution_space = typename ARowMapType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -217,16 +221,20 @@ struct ILUKLvlSchedTP1NumericFunctor { UEntriesType U_entries; UValuesType U_values; LevelViewType level_idx; + LevelListViewType level_list; WorkViewType iw; nnz_lno_t lev_start; + size_type curr_lvl; ILUKLvlSchedTP1NumericFunctor( const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const nnz_lno_t &lev_start_) + //UValuesType &U_values_, const LevelViewType &level_idx_, + //WorkViewType &iw_, const nnz_lno_t &lev_start_) + UValuesType &U_values_, const LevelViewType &level_idx_, const LevelListViewType &level_list_, + WorkViewType &iw_, const nnz_lno_t &lev_start_, const size_type 
&curr_lvl_) : A_row_map(A_row_map_), A_entries(A_entries_), A_values(A_values_), @@ -237,8 +245,9 @@ struct ILUKLvlSchedTP1NumericFunctor { U_entries(U_entries_), U_values(U_values_), level_idx(level_idx_), + level_list(level_list_), iw(iw_), - lev_start(lev_start_) {} + lev_start(lev_start_), curr_lvl(curr_lvl_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -380,6 +389,177 @@ struct ILUKLvlSchedTP1NumericFunctor { iw(my_team, col) = -1; }); } + + KOKKOS_INLINE_FUNCTION + void operator()(const TP1PopulateTag&, const member_type &team) const { + nnz_lno_t my_team = static_cast(team.league_rank()); + nnz_lno_t rowid = + static_cast(level_idx(my_team + lev_start)); // map to rowid + + size_type k1 = static_cast(L_row_map(rowid)); + size_type k2 = static_cast(L_row_map(rowid + 1)); +#ifdef KEEP_DIAG + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = static_cast(k); + }); +#else + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = static_cast(k); + }); +#endif + +#ifdef KEEP_DIAG + // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { L_values(k2 - 1) = scalar_t(1.0); }); +#endif + + team.team_barrier(); + + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(U_entries(k)); + U_values(k) = 0.0; + iw(my_team, col) = static_cast(k); + }); + + team.team_barrier(); + + // Unpack the ith row of A + k1 = static_cast(A_row_map(rowid)); + k2 = static_cast(A_row_map(rowid + 1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = 
static_cast(A_entries(k)); + nnz_lno_t ipos = iw(my_team, col); + if (col < rowid) + L_values(ipos) = A_values(k); + else + U_values(ipos) = A_values(k); + }); + } + + KOKKOS_INLINE_FUNCTION + //__device__ + void operator()(const TP1EliminateTag&, const member_type &team) const { + nnz_lno_t my_team = static_cast(team.league_rank()); + nnz_lno_t rowid = + static_cast(level_idx(my_team + lev_start)); // map to rowid + + //if (curr_lvl == (level_list(rowid)-1)) + // Kokkos::single(Kokkos::PerTeam(team), [&]() { printf("row %d (level %d), row %d (same level from level_list %d)\n", rowid, static_cast(curr_lvl), rowid, static_cast(level_list(rowid)-1)); }); + + // Eliminate prev rows + size_type k1 = static_cast(L_row_map(rowid)); + size_type k2 = static_cast(L_row_map(rowid + 1)); +#ifdef KEEP_DIAG + for (size_type k = k1; k < k2 - 1; k++) +#else + for (size_type k = k1; k < k2; k++) +#endif + { + nnz_lno_t prev_row = L_entries(k); + if (curr_lvl <= (level_list(prev_row)-1)) + Kokkos::single(Kokkos::PerTeam(team), [&]() { printf("row %d (level %d), prev_row %d (level %d)\n", rowid, static_cast(curr_lvl), prev_row, static_cast(level_list(prev_row)-1)); }); + +#ifdef KEEP_DIAG + scalar_t fact = L_values(k) / U_values(U_row_map(prev_row)); +#else + scalar_t fact = L_values(k) * U_values(U_row_map(prev_row)); +#endif + // if (my_thread == 0) L_values(k) = fact; + Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); + //L_values(k) = fact;//same results as above + + team.team_barrier(); + //__syncthreads(); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, + U_row_map(prev_row + 1)), + [&](const size_type kk) { + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t ipos = iw(my_team, col); + auto lxu = -U_values(kk) * fact; + if (ipos != -1) { + if (col < rowid) + Kokkos::atomic_add(&L_values(ipos), lxu); + //L_values(ipos) += lxu; + else + Kokkos::atomic_add(&U_values(ipos), lxu); + //U_values(ipos) += lxu; + } + 
////More nonconsistency + //if (col < rowid) { + // if (ipos != -1) Kokkos::atomic_add(&L_values(ipos), lxu); + //} + //else { + // if (ipos != -1) Kokkos::atomic_add(&U_values(ipos), lxu); + //} + }); // end for kk + + team.team_barrier(); + //__syncthreads(); + } // end for k + +// Temporarily comment out below: +// // if (my_thread == 0) { +// Kokkos::single(Kokkos::PerTeam(team), [&]() { +// nnz_lno_t ipos = iw(my_team, rowid); +//#ifdef KEEP_DIAG +// if (U_values(ipos) == 0.0) { +// U_values(ipos) = 1e6; +// } +//#else +// if (U_values(ipos) == 0.0) { +// U_values(ipos) = 1e6; +// } else { +// U_values(ipos) = 1.0 / U_values(ipos); +// } +//#endif +// }); +// //} + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TP1ResetTag&, const member_type &team) const { + nnz_lno_t my_team = static_cast(team.league_rank()); + nnz_lno_t rowid = + static_cast(level_idx(my_team + lev_start)); // map to rowid + + // Reset + size_type k1 = static_cast(L_row_map(rowid)); + size_type k2 = static_cast(L_row_map(rowid + 1)); +#ifdef KEEP_DIAG + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); +#else + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); +#endif + + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(U_entries(k)); + iw(my_team, col) = -1; + }); + } }; template 1024) + team_size = 1024; + else if (power_maxnnzperrow >= 128) + team_size = 768; + else + team_size = power_maxnnzperrow; + printf("power_maxnnzperrow %lld --> SEQLVLSCHD_TP1 uses team_size %d\n", power_maxnnzperrow, team_size); + } + } // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType 
level_ptr = thandle.get_level_ptr(); HandleDeviceEntriesType level_idx = thandle.get_level_idx(); + HandleDeviceRowMapType level_list= thandle.get_level_list(); // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM @@ -445,7 +644,9 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; - int team_size = thandle.get_team_size(); + using policy_type1= Kokkos::TeamPolicy; + using policy_type2= Kokkos::TeamPolicy; + using policy_type3= Kokkos::TeamPolicy; nnz_lno_t lvl_rowid_start = 0; nnz_lno_t lvl_nrows_chunk; @@ -456,21 +657,34 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, else lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + //ILUKLvlSchedTP1NumericFunctor< + // ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + // LValuesType, URowMapType, UEntriesType, UValuesType, + // HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + // tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, + // L_values, U_row_map, U_entries, U_values, level_idx, iw, + // lev_start + lvl_rowid_start); ILUKLvlSchedTP1NumericFunctor< ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + HandleDeviceEntriesType, HandleDeviceRowMapType, WorkViewType, nnz_lno_t> tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, level_idx, iw, - lev_start + lvl_rowid_start); - - if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", - policy_type(lvl_nrows_chunk, Kokkos::AUTO), - tstf); - else - Kokkos::parallel_for("parfor_l_team", - policy_type(lvl_nrows_chunk, team_size), tstf); + L_values, U_row_map, U_entries, U_values, level_idx, 
level_list, iw, + lev_start + lvl_rowid_start, lvl); + + Kokkos::parallel_for("parfor_tp1", + policy_type(lvl_nrows_chunk, team_size), tstf); + //Kokkos::parallel_for("tp1populate", + // policy_type1(lvl_nrows_chunk, Kokkos::AUTO), + // tstf); + //Kokkos::fence(); + //Kokkos::parallel_for("tp1eliminate", + // policy_type2(lvl_nrows_chunk, team_size), + // tstf); + //Kokkos::fence(); + //Kokkos::parallel_for("tp1reset", + // policy_type3(lvl_nrows_chunk, Kokkos::AUTO), + // tstf); Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; } diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 99d0ab1fe8..0594903b67 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -123,7 +123,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, template void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, LevelType1& level_list, + const EntriesType entries, const RowMapType U_row_map, LevelType1& level_list, LevelType2& level_ptr, LevelType2& level_idx, size_type& nlevels) { // Scheduling currently compute on host @@ -186,6 +186,7 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, size_type maxrows = 0; size_type maxrowsperchunk = 0; + size_type maxnnzperrow = 0; for (size_type i = 0; i < nlevels; ++i) { size_type lnrows = level_ptr(i + 1) - level_ptr(i); if (maxrows < lnrows) { @@ -199,11 +200,38 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) ? 
(lnrows / lnchunks(i)) : (lnrows / lnchunks(i) + 1); + nnz_lno_t lvl_rowid_start = 0; + nnz_lno_t lvl_nrows_chunk; + for (nnz_lno_t chunkid = 0; chunkid < lnchunks(i); chunkid++) { + if ((lvl_rowid_start + lnrowsperchunk(i)) > static_cast(lnrows)) + lvl_nrows_chunk = static_cast(lnrows) - lvl_rowid_start; + else + lvl_nrows_chunk = lnrowsperchunk(i); + // Determine the number of non-zeros in each level + for (nnz_lno_t r = 0; r < lvl_nrows_chunk; r++) { // Look at each row in the chunk + auto rid = level_idx(r + level_ptr(i) + lvl_rowid_start);// get actual rowid + nnz_lno_t rnnzU = U_row_map(rid + 1) - U_row_map(rid); // count the number of non-zeros in the current row of U + //nnz_lno_t rnnzL = row_map(rid + 1) - row_map(rid); // count the number of non-zeros in the current row of L + if (maxnnzperrow < static_cast(rnnzU)) { + maxnnzperrow = static_cast(rnnzU); + } + } + lvl_rowid_start += lvl_nrows_chunk; + } } else #endif { lnchunks(i) = 1; lnrowsperchunk(i) = lnrows; + // Determine the number of non-zeros in each level + for (nnz_lno_t r = 0; r < lnrows; r++) { // Look at each row in the chunk + auto rid = level_idx(r + level_ptr(i));// get actual rowid + nnz_lno_t rnnzU = U_row_map(rid + 1) - U_row_map(rid); // count the number of non-zeros in the current row of U + //nnz_lno_t rnnzL = row_map(rid + 1) - row_map(rid); // count the number of non-zeros in the current row of L + if (maxnnzperrow < static_cast(rnnzU)) { + maxnnzperrow = static_cast(rnnzU); + } + } } if (maxrowsperchunk < static_cast(lnrowsperchunk(i))) { maxrowsperchunk = lnrowsperchunk(i); @@ -213,6 +241,7 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); + thandle.set_level_maxnnzperrow(maxnnzperrow); } // Linear Search for the smallest row index @@ -461,7 +490,7 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if 
(thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, + level_sched_tp(thandle, L_row_map, L_entries, U_row_map, level_list, level_ptr, level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { diff --git a/sparse/src/KokkosSparse_spiluk_handle.hpp b/sparse/src/KokkosSparse_spiluk_handle.hpp index 54cc124474..139e66f641 100644 --- a/sparse/src/KokkosSparse_spiluk_handle.hpp +++ b/sparse/src/KokkosSparse_spiluk_handle.hpp @@ -123,6 +123,8 @@ class SPILUKHandle { size_type level_maxrows; // max. number of rows among levels size_type level_maxrowsperchunk; // max.number of rows among chunks among levels + size_type + level_maxnnzperrow; // max.number of nnz per row among levels bool symbolic_complete; @@ -147,6 +149,7 @@ class SPILUKHandle { nnzU(nnzU_), level_maxrows(0), level_maxrowsperchunk(0), + level_maxnnzperrow(0), symbolic_complete(symbolic_complete_), algm(choice), team_size(-1), @@ -160,6 +163,7 @@ class SPILUKHandle { set_nnzU(nnzU_); set_level_maxrows(0); set_level_maxrowsperchunk(0); + set_level_maxnnzperrow(0); level_list = nnz_row_view_t("level_list", nrows_), level_idx = nnz_lno_view_t("level_idx", nrows_), level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), @@ -242,6 +246,14 @@ class SPILUKHandle { this->level_maxrowsperchunk = level_maxrowsperchunk_; } + KOKKOS_INLINE_FUNCTION + size_type get_level_maxnnzperrow() const { return level_maxnnzperrow; } + + KOKKOS_INLINE_FUNCTION + void set_level_maxnnzperrow(const size_type level_maxnnzperrow_) { + this->level_maxnnzperrow = level_maxnnzperrow_; + } + bool is_symbolic_complete() const { return symbolic_complete; } size_type get_num_levels() const { return nlevels; } From 3adaa70ba9d85444191f3b2431b874c8b2ce3638 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Wed, 4 Jan 2023 23:08:48 -0700 Subject: [PATCH 002/442] Not use atomic_add --- 
sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 61fbc47ec1..e2ddb932af 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -336,9 +336,11 @@ struct ILUKLvlSchedTP1NumericFunctor { auto lxu = -U_values(kk) * fact; if (ipos != -1) { if (col < rowid) - Kokkos::atomic_add(&L_values(ipos), lxu); + //Kokkos::atomic_add(&L_values(ipos), lxu); + L_values(ipos) += lxu; else - Kokkos::atomic_add(&U_values(ipos), lxu); + //Kokkos::atomic_add(&U_values(ipos), lxu); + U_values(ipos) += lxu; } }); // end for kk From 0b4b667f116f8c0fec5943dec7621487abbf519d Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 5 Jan 2023 11:07:40 -0700 Subject: [PATCH 003/442] Use atomic_add again --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index e2ddb932af..e3dcd71740 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -336,11 +336,11 @@ struct ILUKLvlSchedTP1NumericFunctor { auto lxu = -U_values(kk) * fact; if (ipos != -1) { if (col < rowid) - //Kokkos::atomic_add(&L_values(ipos), lxu); - L_values(ipos) += lxu; + Kokkos::atomic_add(&L_values(ipos), lxu); + //L_values(ipos) += lxu; else - //Kokkos::atomic_add(&U_values(ipos), lxu); - U_values(ipos) += lxu; + Kokkos::atomic_add(&U_values(ipos), lxu); + //U_values(ipos) += lxu; } }); // end for kk From 81f77d0fb669e6f0564372dace753f72a657d6fe Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 5 Jan 2023 11:31:11 -0700 Subject: [PATCH 004/442] Prefer team size 32 --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 
deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index e3dcd71740..d3e813660b 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -337,10 +337,8 @@ struct ILUKLvlSchedTP1NumericFunctor { if (ipos != -1) { if (col < rowid) Kokkos::atomic_add(&L_values(ipos), lxu); - //L_values(ipos) += lxu; else Kokkos::atomic_add(&U_values(ipos), lxu); - //U_values(ipos) += lxu; } }); // end for kk @@ -595,7 +593,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, else if (power_maxnnzperrow >= 128) team_size = 768; else - team_size = power_maxnnzperrow; + team_size = 32; printf("power_maxnnzperrow %lld --> SEQLVLSCHD_TP1 uses team_size %d\n", power_maxnnzperrow, team_size); } } From 432c9541c5ddc0d029a74979fbbdb73ab53185d0 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 5 Jan 2023 16:03:02 -0700 Subject: [PATCH 005/442] Fix for VOLTA --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index d3e813660b..652b2e3a7b 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -582,6 +582,8 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, size_type nlevels = thandle.get_num_levels(); size_type maxnnzperrow = thandle.get_level_maxnnzperrow(); int team_size = thandle.get_team_size(); + +#ifdef KOKKOS_ARCH_VOLTA if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { if (team_size == -1) { @@ -597,6 +599,9 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, printf("power_maxnnzperrow %lld --> SEQLVLSCHD_TP1 uses team_size %d\n", power_maxnnzperrow, team_size); } } +#else + printf("SEQLVLSCHD_TP1 uses team_size %d (not 
KOKKOS_ARCH_VOLTA)\n", team_size); +#endif // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); @@ -672,8 +677,18 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, L_values, U_row_map, U_entries, U_values, level_idx, level_list, iw, lev_start + lvl_rowid_start, lvl); +#ifdef KOKKOS_ARCH_VOLTA Kokkos::parallel_for("parfor_tp1", policy_type(lvl_nrows_chunk, team_size), tstf); +#else + if (team_size == -1) + Kokkos::parallel_for("parfor_tp1", + policy_type(lvl_nrows_chunk, Kokkos::AUTO), + tstf); + else + Kokkos::parallel_for("parfor_tp1", + policy_type(lvl_nrows_chunk, team_size), tstf); +#endif //Kokkos::parallel_for("tp1populate", // policy_type1(lvl_nrows_chunk, Kokkos::AUTO), // tstf); From 547a6608a3bb271a8de87d3f1afcf565db2e3dff Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 5 Jan 2023 16:27:51 -0700 Subject: [PATCH 006/442] Clean up spiluk numeric --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 219 +----------------- 1 file changed, 8 insertions(+), 211 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 652b2e3a7b..cc689461c5 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -59,9 +59,6 @@ namespace Impl { namespace Experimental { // struct UnsortedTag {}; -struct TP1PopulateTag {}; -struct TP1EliminateTag {}; -struct TP1ResetTag {}; template - class LevelViewType, class LevelListViewType, class WorkViewType, class nnz_lno_t> + class LevelViewType, class WorkViewType, class nnz_lno_t> struct ILUKLvlSchedTP1NumericFunctor { using execution_space = typename ARowMapType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -221,20 +217,16 @@ struct ILUKLvlSchedTP1NumericFunctor { UEntriesType U_entries; UValuesType U_values; LevelViewType level_idx; - LevelListViewType level_list; WorkViewType iw; 
nnz_lno_t lev_start; - size_type curr_lvl; ILUKLvlSchedTP1NumericFunctor( const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, const URowMapType &U_row_map_, const UEntriesType &U_entries_, - //UValuesType &U_values_, const LevelViewType &level_idx_, - //WorkViewType &iw_, const nnz_lno_t &lev_start_) - UValuesType &U_values_, const LevelViewType &level_idx_, const LevelListViewType &level_list_, - WorkViewType &iw_, const nnz_lno_t &lev_start_, const size_type &curr_lvl_) + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const nnz_lno_t &lev_start_) : A_row_map(A_row_map_), A_entries(A_entries_), A_values(A_values_), @@ -245,9 +237,8 @@ struct ILUKLvlSchedTP1NumericFunctor { U_entries(U_entries_), U_values(U_values_), level_idx(level_idx_), - level_list(level_list_), iw(iw_), - lev_start(lev_start_), curr_lvl(curr_lvl_) {} + lev_start(lev_start_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -389,177 +380,6 @@ struct ILUKLvlSchedTP1NumericFunctor { iw(my_team, col) = -1; }); } - - KOKKOS_INLINE_FUNCTION - void operator()(const TP1PopulateTag&, const member_type &team) const { - nnz_lno_t my_team = static_cast(team.league_rank()); - nnz_lno_t rowid = - static_cast(level_idx(my_team + lev_start)); // map to rowid - - size_type k1 = static_cast(L_row_map(rowid)); - size_type k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); -#else - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); -#endif - -#ifdef KEEP_DIAG - // if (my_thread == 
0) L_values(k2 - 1) = scalar_t(1.0); - Kokkos::single(Kokkos::PerTeam(team), - [&]() { L_values(k2 - 1) = scalar_t(1.0); }); -#endif - - team.team_barrier(); - - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); - - team.team_barrier(); - - // Unpack the ith row of A - k1 = static_cast(A_row_map(rowid)); - k2 = static_cast(A_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t ipos = iw(my_team, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - }); - } - - KOKKOS_INLINE_FUNCTION - //__device__ - void operator()(const TP1EliminateTag&, const member_type &team) const { - nnz_lno_t my_team = static_cast(team.league_rank()); - nnz_lno_t rowid = - static_cast(level_idx(my_team + lev_start)); // map to rowid - - //if (curr_lvl == (level_list(rowid)-1)) - // Kokkos::single(Kokkos::PerTeam(team), [&]() { printf("row %d (level %d), row %d (same level from level_list %d)\n", rowid, static_cast(curr_lvl), rowid, static_cast(level_list(rowid)-1)); }); - - // Eliminate prev rows - size_type k1 = static_cast(L_row_map(rowid)); - size_type k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - for (size_type k = k1; k < k2 - 1; k++) -#else - for (size_type k = k1; k < k2; k++) -#endif - { - nnz_lno_t prev_row = L_entries(k); - if (curr_lvl <= (level_list(prev_row)-1)) - Kokkos::single(Kokkos::PerTeam(team), [&]() { printf("row %d (level %d), prev_row %d (level %d)\n", rowid, static_cast(curr_lvl), prev_row, static_cast(level_list(prev_row)-1)); }); - -#ifdef KEEP_DIAG - scalar_t fact = L_values(k) / U_values(U_row_map(prev_row)); -#else - scalar_t fact = L_values(k) * 
U_values(U_row_map(prev_row)); -#endif - // if (my_thread == 0) L_values(k) = fact; - Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); - //L_values(k) = fact;//same results as above - - team.team_barrier(); - //__syncthreads(); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, - U_row_map(prev_row + 1)), - [&](const size_type kk) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t ipos = iw(my_team, col); - auto lxu = -U_values(kk) * fact; - if (ipos != -1) { - if (col < rowid) - Kokkos::atomic_add(&L_values(ipos), lxu); - //L_values(ipos) += lxu; - else - Kokkos::atomic_add(&U_values(ipos), lxu); - //U_values(ipos) += lxu; - } - ////More nonconsistency - //if (col < rowid) { - // if (ipos != -1) Kokkos::atomic_add(&L_values(ipos), lxu); - //} - //else { - // if (ipos != -1) Kokkos::atomic_add(&U_values(ipos), lxu); - //} - }); // end for kk - - team.team_barrier(); - //__syncthreads(); - } // end for k - -// Temporarily comment out below: -// // if (my_thread == 0) { -// Kokkos::single(Kokkos::PerTeam(team), [&]() { -// nnz_lno_t ipos = iw(my_team, rowid); -//#ifdef KEEP_DIAG -// if (U_values(ipos) == 0.0) { -// U_values(ipos) = 1e6; -// } -//#else -// if (U_values(ipos) == 0.0) { -// U_values(ipos) = 1e6; -// } else { -// U_values(ipos) = 1.0 / U_values(ipos); -// } -//#endif -// }); -// //} - } - - KOKKOS_INLINE_FUNCTION - void operator()(const TP1ResetTag&, const member_type &team) const { - nnz_lno_t my_team = static_cast(team.league_rank()); - nnz_lno_t rowid = - static_cast(level_idx(my_team + lev_start)); // map to rowid - - // Reset - size_type k1 = static_cast(L_row_map(rowid)); - size_type k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); -#else - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const 
size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); -#endif - - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(U_entries(k)); - iw(my_team, col) = -1; - }); - } }; template SEQLVLSCHD_TP1 uses team_size %d\n", power_maxnnzperrow, team_size); } } -#else - printf("SEQLVLSCHD_TP1 uses team_size %d (not KOKKOS_ARCH_VOLTA)\n", team_size); #endif // Keep these as host View, create device version and copy back to host @@ -649,9 +467,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; - using policy_type1= Kokkos::TeamPolicy; - using policy_type2= Kokkos::TeamPolicy; - using policy_type3= Kokkos::TeamPolicy; nnz_lno_t lvl_rowid_start = 0; nnz_lno_t lvl_nrows_chunk; @@ -662,20 +477,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, else lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - //ILUKLvlSchedTP1NumericFunctor< - // ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - // LValuesType, URowMapType, UEntriesType, UValuesType, - // HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - // tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, - // L_values, U_row_map, U_entries, U_values, level_idx, iw, - // lev_start + lvl_rowid_start); ILUKLvlSchedTP1NumericFunctor< ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, HandleDeviceRowMapType, WorkViewType, nnz_lno_t> + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, level_idx, level_list, iw, - lev_start + lvl_rowid_start, lvl); + L_values, 
U_row_map, U_entries, U_values, level_idx, iw, + lev_start + lvl_rowid_start); #ifdef KOKKOS_ARCH_VOLTA Kokkos::parallel_for("parfor_tp1", @@ -689,17 +497,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, Kokkos::parallel_for("parfor_tp1", policy_type(lvl_nrows_chunk, team_size), tstf); #endif - //Kokkos::parallel_for("tp1populate", - // policy_type1(lvl_nrows_chunk, Kokkos::AUTO), - // tstf); - //Kokkos::fence(); - //Kokkos::parallel_for("tp1eliminate", - // policy_type2(lvl_nrows_chunk, team_size), - // tstf); - //Kokkos::fence(); - //Kokkos::parallel_for("tp1reset", - // policy_type3(lvl_nrows_chunk, Kokkos::AUTO), - // tstf); Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; } From f87b7d566a8db55eaefd1fae29bbfcaa0c99676f Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 6 Jan 2023 00:31:58 -0700 Subject: [PATCH 007/442] Clean up numeric and symbolic --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 3 --- .../KokkosSparse_spiluk_symbolic_impl.hpp | 21 ++++++++++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index cc689461c5..a72858b6e2 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -395,7 +395,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; using WorkViewType = typename IlukHandle::work_view_t; using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; @@ -416,7 +415,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, team_size = 768; else team_size = 32; - printf("power_maxnnzperrow %lld --> SEQLVLSCHD_TP1 uses team_size %d\n", 
power_maxnnzperrow, team_size); } } #endif @@ -424,7 +422,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - HandleDeviceRowMapType level_list= thandle.get_level_list(); // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 0594903b67..431d4d3900 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -123,7 +123,11 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, template void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, const RowMapType U_row_map, LevelType1& level_list, + const EntriesType entries, +#ifdef KOKKOS_ARCH_VOLTA + const RowMapType U_row_map, +#endif + LevelType1& level_list, LevelType2& level_ptr, LevelType2& level_idx, size_type& nlevels) { // Scheduling currently compute on host @@ -186,7 +190,9 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, size_type maxrows = 0; size_type maxrowsperchunk = 0; +#ifdef KOKKOS_ARCH_VOLTA size_type maxnnzperrow = 0; +#endif for (size_type i = 0; i < nlevels; ++i) { size_type lnrows = level_ptr(i + 1) - level_ptr(i); if (maxrows < lnrows) { @@ -200,6 +206,7 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) ? 
(lnrows / lnchunks(i)) : (lnrows / lnchunks(i) + 1); +#ifdef KOKKOS_ARCH_VOLTA nnz_lno_t lvl_rowid_start = 0; nnz_lno_t lvl_nrows_chunk; for (nnz_lno_t chunkid = 0; chunkid < lnchunks(i); chunkid++) { @@ -218,11 +225,13 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, } lvl_rowid_start += lvl_nrows_chunk; } +#endif } else #endif { lnchunks(i) = 1; lnrowsperchunk(i) = lnrows; +#ifdef KOKKOS_ARCH_VOLTA // Determine the number of non-zeros in each level for (nnz_lno_t r = 0; r < lnrows; r++) { // Look at each row in the chunk auto rid = level_idx(r + level_ptr(i));// get actual rowid @@ -232,6 +241,7 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, maxnnzperrow = static_cast(rnnzU); } } +#endif } if (maxrowsperchunk < static_cast(lnrowsperchunk(i))) { maxrowsperchunk = lnrowsperchunk(i); @@ -241,7 +251,9 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); +#ifdef KOKKOS_ARCH_VOLTA thandle.set_level_maxnnzperrow(maxnnzperrow); +#endif } // Linear Search for the smallest row index @@ -490,8 +502,11 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_sched_tp(thandle, L_row_map, L_entries, U_row_map, level_list, level_ptr, - level_idx, nlev); + level_sched_tp(thandle, L_row_map, L_entries, +#ifdef KOKKOS_ARCH_VOLTA + U_row_map, +#endif + level_list, level_ptr, level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, From 25b4fb815ca349873a1a4bd66ac068f8843c4c74 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 17 Jan 2023 13:11:07 -0700 Subject: [PATCH 008/442] Add new par_ilut test --- sparse/unit_test/Test_Sparse_par_ilut.hpp | 134 +++++++++++++++++++++- 1 file changed, 131 
insertions(+), 3 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 809f080c97..6ff31c10b8 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -63,6 +63,20 @@ using namespace KokkosKernels::Experimental; namespace Test { +namespace ParIlut { + +template +struct TolMeta { + static constexpr T value = 1e-8; +}; + +template <> +struct TolMeta { + static constexpr float value = 1e-5; // Lower tolerance for floats +}; + +} + template std::vector> decompress_matrix( @@ -300,6 +314,109 @@ void run_test_par_ilut() { #endif } +template +void run_test_par_ilut_precond() { + // Test using par_ilut as a preconditioner + // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? + using exe_space = typename device::execution_space; + using mem_space = typename device::memory_space; + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using sp_matrix_type = + KokkosSparse::CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; + using float_t = typename Kokkos::ArithTraits::mag_type; + + // Create a diagonally dominant sparse matrix to test: + constexpr auto n = 5000; + constexpr auto m = 15; + constexpr auto tol = ParIlut::TolMeta::value; + constexpr auto numRows = n; + constexpr auto numCols = n; + constexpr auto diagDominance = 1; + constexpr bool verbose = false; + + typename sp_matrix_type::non_const_size_type nnz = 10 * numRows; + auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), + diagDominance); + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + using GMRESHandle = + typename std::remove_reference::type; + using ViewVectorType = 
typename GMRESHandle::nnz_value_view_t; + + kh.create_par_ilut_handle(numRows); + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Pull out views from CRS + auto row_map = A.graph.row_map; + auto entries = A.graph.entries; + auto values = A.values; + + // Allocate L and U CRS views as outputs + RowMapType L_row_map ("L_row_map", numRows + 1); + RowMapType U_row_map ("U_row_map", numRows + 1); + RowMapType LU_row_map("LU_row_map", numRows + 1); + EntriesType LU_entries("LU_entries"); + ValuesType LU_values ("LU_values"); + + // Initial L/U approximations for A + par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); + + const size_type nnzL = par_ilut_handle->get_nnzL(); + const size_type nnzU = par_ilut_handle->get_nnzU(); + + EntriesType L_entries("L_entries", nnzL); + ValuesType L_values("L_values", nnzL); + EntriesType U_entries("U_entries", nnzU); + ValuesType U_values("U_values", nnzU); + + par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, +#ifdef KOKKOS_ENABLE_SERIAL + true /*deterministic*/ +#else + false /*cannot ask for determinism*/ +#endif + ); + + // Create LU^inv + { + std::string myalg("SPGEMM_KK_MEMORY"); + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = + KokkosSparse::StringToSPGEMMAlgorithm(myalg); + kh.create_spgemm_handle(spgemm_algorithm); + kh.create_spadd_handle(true /*we expect inputs to be sorted*/); + + KokkosSparse::Experimental::spgemm_symbolic( + &kh, numRows, numRows, numRows, L_row_map, L_entries, false, U_row_map, + U_entries, false, LU_row_map); + + const size_type lu_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); + Kokkos::resize(LU_entries, lu_nnz_size); + Kokkos::resize(LU_values, lu_nnz_size); + + KokkosSparse::Experimental::spgemm_numeric( + &kh, numRows, numRows, numRows, L_row_map, L_entries, L_values, false, + U_row_map, U_entries, U_values, false, LU_row_map, LU_entries, + LU_values); + + // Need to sort LU CRS if on CUDA! 
+ KokkosSparse::sort_crs_matrix(LU_row_map, LU_entries, LU_values); + + kh.destroy_spgemm_handle(); + } + + +} + } // namespace Test template (); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ +template +void test_par_ilut_precond() { + Test::run_test_par_ilut_precond(); +} + + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut(); \ + test_par_ilut(); \ + } \ + TEST_F(TestCategory, \ + sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_precond(); \ } #define NO_TEST_COMPLEX From 1ea3a7b9034854f3f27b26e8f9b8fe39b498c37d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 18 Jan 2023 12:08:54 -0700 Subject: [PATCH 009/442] .github/workflows: - Added docs.yml - Save cycles with -DKokkos_ENABLE_TESTS=OFF docs: - Updated Doxyfile.in to fail with warnings src: - Fix doxygen style comments --- .github/workflows/docs.yml | 68 ++++++++++++ .github/workflows/osx.yml | 1 + .../dense/src/KokkosBatched_Gemm_Handle.hpp | 70 ++++++------ .../dense/src/KokkosBatched_Kernel_Handle.hpp | 104 +++++++++--------- .../src/KokkosBatched_Krylov_Handle.hpp | 7 +- blas/src/KokkosBlas1_nrm2w.hpp | 1 + blas/src/KokkosBlas1_nrm2w_squared.hpp | 1 + blas/src/KokkosBlas1_rotg.hpp | 10 +- blas/src/KokkosBlas1_rotm.hpp | 8 +- blas/src/KokkosBlas1_rotmg.hpp | 9 +- blas/src/KokkosBlas2_gemv.hpp | 2 +- docs/Doxyfile.in | 2 +- .../impl/KokkosSparse_sor_sequential_impl.hpp | 2 +- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 2 +- sparse/impl/KokkosSparse_trsv_impl.hpp | 2 +- sparse/src/KokkosKernels_Handle.hpp | 28 ++--- sparse/src/KokkosSparse_BsrMatrix.hpp | 74 +++++++------ sparse/src/KokkosSparse_CcsMatrix.hpp | 1 - sparse/src/KokkosSparse_CrsMatrix.hpp | 60 +++++----- sparse/src/KokkosSparse_MatrixPrec.hpp | 2 +- sparse/src/KokkosSparse_OrdinalTraits.hpp | 2 +- 
sparse/src/KokkosSparse_Preconditioner.hpp | 3 +- sparse/src/KokkosSparse_Utils.hpp | 67 +++++------ sparse/src/KokkosSparse_findRelOffset.hpp | 2 +- sparse/src/KokkosSparse_getDiagCopy.hpp | 2 +- sparse/src/KokkosSparse_spadd_handle.hpp | 14 +-- sparse/src/KokkosSparse_spgemm_handle.hpp | 6 +- sparse/src/KokkosSparse_spmv.hpp | 13 ++- sparse/src/KokkosSparse_trsv.hpp | 2 +- 29 files changed, 311 insertions(+), 254 deletions(-) create mode 100644 .github/workflows/docs.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000..ea377dafcd --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,68 @@ +name: github-DOCS + +on: + pull_request: + branches: + - master + - develop + +permissions: + contents: none + +jobs: + docs-check: + runs-on: ubuntu-latest + steps: + - name: Install Dependencies + run: | + sudo apt install doxygen + pip install sphinx + pip install breathe + pip install sphinx-rtd-theme + + - name: checkout_kokkos_kernels + uses: actions/checkout@v2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v2 + with: + repository: kokkos/kokkos + ref: develop + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DCMAKE_CXX_FLAGS="-Werror" \ + -DCMAKE_CXX_STANDARD=17 \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DKokkos_ENABLE_TESTS=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j2 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib/cmake/Kokkos \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkosKernels_ENABLE_DOCS=ON \ + .. 
+ + - name: build_kokkos_kernels_doxygen + working-directory: kokkos-kernels/build + run: make Doxygen + + - name: build_kokkos_kernels_sphinx + working-directory: kokkos-kernels/build + run: make Sphinx diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 07c0cb8d1e..9d8007aac2 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -72,6 +72,7 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=${{ matrix.debug_bounds_check }} \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DKokkos_ENABLE_TESTS=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. diff --git a/batched/dense/src/KokkosBatched_Gemm_Handle.hpp b/batched/dense/src/KokkosBatched_Gemm_Handle.hpp index 402f10a91f..75b02f8a8a 100644 --- a/batched/dense/src/KokkosBatched_Gemm_Handle.hpp +++ b/batched/dense/src/KokkosBatched_Gemm_Handle.hpp @@ -82,44 +82,44 @@ enum GEMM_KOKKOS_BATCHED_ALGOS : int { // clang-format off /// \brief Handle for selecting runtime behavior of the BatchedGemm interface. /// -/// \var kernelAlgoType Specifies which algorithm to use for invocation (default, SQUARE). +/// \param kernelAlgoType Specifies which algorithm to use for invocation (default, SQUARE). /// -/// Specifies whether to select optimal invocations based on inputs and -/// heuristics: -/// SQUARE select invocations based on square matrix heuristics where M=N -/// TALL select invocations based on tall matrix heuristics where M>N -/// WIDE select invocations based on wide matrix heuristics where M= 24. -/// \var teamSz Specifies the team size that will affect any KK algorithm which uses +/// Specifies whether to select optimal invocations based on inputs and +/// heuristics: +/// SQUARE select invocations based on square matrix heuristics where M=N +/// TALL select invocations based on tall matrix heuristics where M>N +/// WIDE select invocations based on wide matrix heuristics where M= 24. 
+/// \param teamSz Specifies the team size that will affect any KK algorithm which uses /// TeamPolicy (default, Kokkos::AUTO). /// Note: Only applied if useAlgo_type == KK_* -/// \var vecLen Specifies the vector length that will affect any KK algorithm which +/// \param vecLen Specifies the vector length that will affect any KK algorithm which /// uses TeamPolicy and Kokkos::ThreadVectorRange or Kokkos::TeamVectorRange /// (default, Kokkos::AUTO). /// Note: Only applied if useAlgo_type == KK_* diff --git a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp index ec8d2ee23f..8918d3b67b 100644 --- a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp +++ b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp @@ -118,55 +118,55 @@ struct TplParams { // clang-format off /// \brief Handle for selecting runtime behavior of the BatchedGemm interface. /// -/// \var kernelAlgoType Specifies which algorithm to use for invocation (default, SQUARE). +/// \param kernelAlgoType Specifies which algorithm to use for invocation (default, SQUARE). /// -/// Specifies whether to select optimal invocations based on inputs and -/// heuristics: -/// SQUARE select invocations based on square matrix heuristics where M=N -/// TALL select invocations based on tall matrix heuristics where M>N -/// WIDE select invocations based on wide matrix heuristics where MN +/// WIDE select invocations based on wide matrix heuristics where M diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index 6aec955de2..d07dabdaca 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -56,6 +56,7 @@ namespace KokkosBlas { /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// /// \param x [in] Input 1-D View. +/// \param w [in] /// /// \return The nrm2w product result; a single value. 
template diff --git a/blas/src/KokkosBlas1_rotg.hpp b/blas/src/KokkosBlas1_rotg.hpp index 4d70a8a8be..129a885127 100644 --- a/blas/src/KokkosBlas1_rotg.hpp +++ b/blas/src/KokkosBlas1_rotg.hpp @@ -54,10 +54,14 @@ namespace KokkosBlas { /// /// \tparam Scalar data type of inputs and outputs /// +/// \param space [in] the execution space /// \param a [in/out] on input one of the values to rotate, on output the -/// rotated value \param b [in/out] on input one of the values to rotate, on -/// output the rotated value \param c [out] cosine value associated with the -/// rotation \param s [out] sine value associated with the rotation +/// rotated value +/// \param b [in/out] on input one of the values to rotate, on +/// output the rotated value +/// \param c [out] cosine value associated with the +/// rotation +/// \param s [out] sine value associated with the rotation template void rotg(execution_space const& space, SViewType const& a, SViewType const& b, MViewType const& c, SViewType const& s) { diff --git a/blas/src/KokkosBlas1_rotm.hpp b/blas/src/KokkosBlas1_rotm.hpp index 38e0a78039..6a2cd357a4 100644 --- a/blas/src/KokkosBlas1_rotm.hpp +++ b/blas/src/KokkosBlas1_rotm.hpp @@ -53,14 +53,14 @@ namespace KokkosBlas { /// \brief Applies modified Givens rotation coefficients to vectors x and y. /// /// \tparam execution_space the execution space where the kernel will be -/// executed, it can be used to specify a stream too. +/// executed, it can be used to specify a stream too. 
/// \tparam VectorView a rank1 view type that hold non const data /// \tparam ParamView a rank1 view of static extent [5] type that -/// holds const data +/// holds const data /// /// \param space [in] execution space used for parallel loops in this kernel -/// \param x1 [in/out] vector to be rotated with param coefficients -/// \param y1 [in/out] vector to be rotated with param coefficients +/// \param X [in/out] vector to be rotated with param coefficients +/// \param Y [in/out] vector to be rotated with param coefficients /// \param param [in] output of rotmg contains rotation coefficients /// template diff --git a/blas/src/KokkosBlas1_rotmg.hpp b/blas/src/KokkosBlas1_rotmg.hpp index 43efa8d3b2..7a504a5836 100644 --- a/blas/src/KokkosBlas1_rotmg.hpp +++ b/blas/src/KokkosBlas1_rotmg.hpp @@ -53,10 +53,13 @@ namespace KokkosBlas { /// \brief Compute the coefficients to apply a modified Givens rotation. /// /// \tparam execution_space the execution space where the kernel will be -/// executed \tparam DXView a rank0 view type that hold non const data \tparam -/// YView a rank0 view type that holds const data \tparam PView a rank1 view of -/// static extent 5 that holds non const data +/// executed +/// \tparam DXView a rank0 view type that hold non const data +/// \tparam YView a rank0 view type that holds const data +/// \tparam PView a rank1 view of +/// static extent 5 that holds non const data /// +/// \param space [in] execution space used for parallel loops /// \param d1 [in/out] /// \param d2 [in/out] /// \param x1 [in/out] diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index fe8418cc40..028bd4c679 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -44,7 +44,7 @@ #ifndef KOKKOSBLAS2_GEMV_HPP_ #define KOKKOSBLAS2_GEMV_HPP_ -/// \file Kokkos_Blas2_MV.hpp +/// \file KokkosBlas2_gemv.hpp /// \brief BLAS 2 kernels specifically optimized for typical /// Tpetra::MultiVector use cases. 
diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 5cb072a465..deb47d9d2b 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -843,7 +843,7 @@ WARN_NO_PARAMDOC = NO # Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. -WARN_AS_ERROR = NO +WARN_AS_ERROR = FAIL_ON_WARNINGS # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which diff --git a/sparse/impl/KokkosSparse_sor_sequential_impl.hpp b/sparse/impl/KokkosSparse_sor_sequential_impl.hpp index 69aa11186b..f1d094a1ac 100644 --- a/sparse/impl/KokkosSparse_sor_sequential_impl.hpp +++ b/sparse/impl/KokkosSparse_sor_sequential_impl.hpp @@ -45,7 +45,7 @@ #ifndef KOKKOSSPARSE_IMPL_SOR_HPP #define KOKKOSSPARSE_IMPL_SOR_HPP -/// \file Kokkos_Sparse_impl_sor.hpp +/// \file KokkosSparse_impl_sor.hpp /// \brief Sequential implementations of Gauss-Seidel and SOR. /// /// This file exists mainly as a temporary porting aid. Until we can diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 3a6f988835..641b47d20e 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -44,7 +44,7 @@ #ifndef KOKKOSSPARSE_IMPL_SPTRSV_SYMBOLIC_HPP_ #define KOKKOSSPARSE_IMPL_SPTRSV_SYMBOLIC_HPP_ -/// \file Kokkos_Sparse_impl_sptrsv_symbolic.hpp +/// \file KokkosSparse_impl_sptrsv_symbolic.hpp /// \brief Implementation(s) of sparse triangular solve. 
#include diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index bff037c228..d84ad27cff 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -45,7 +45,7 @@ #ifndef KOKKOSSPARSE_IMPL_TRSM_HPP_ #define KOKKOSSPARSE_IMPL_TRSM_HPP_ -/// \file Kokkos_Sparse_impl_trsm.hpp +/// \file KokkosSparse_impl_trsm.hpp /// \brief Implementation(s) of sparse triangular solve. #include diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index efd8ac3faf..1739ce9c7c 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -446,13 +446,11 @@ class KokkosKernelsHandle { return this->my_exec_space; } - /** - * \brief Returns the suggested team work size. If set with - * set_team_work_size, it will return the set value. Otherwise it will return - * the teamsize. \param team_size: input, team size used by the kernel. \param - * concurrency: input, the number of threads overall. Not used currently. - * \param overall_work_size: The overall work size. - */ + /// \brief Returns the suggested team work size. If set with + /// set_team_work_size, it will return the set value. Otherwise it will return + /// the teamsize. + /// \param team_size input, team size used by the kernel. + /// \param nnz_lno_t filler for overall_work_size int get_team_work_size(const int team_size, const int /* concurrency */, const nnz_lno_t /* overall_work_size */) { if (this->team_work_size != -1) { @@ -481,11 +479,10 @@ class KokkosKernelsHandle { */ bool is_dynamic_scheduling() { return this->use_dynamic_scheduling; } - /** - * \brief sets the shared memory size to be used by the kernels using shared - * memory on GPUs. \param shared_memory_size: input, shared memory size to be - * used by the kernel. * - */ + /// \brief sets the shared memory size to be used by the kernels using shared + /// memory on GPUs. 
+ /// \param shared_memory_size_ input, shared memory size to be used by the + /// kernel. void set_shmem_size(const size_t shared_memory_size_) { this->shared_memory_size = shared_memory_size_; } @@ -528,10 +525,9 @@ class KokkosKernelsHandle { int get_set_suggested_team_size() { return this->suggested_team_size; } - /** - * \brief Returns the team size, either set by the user or suggested by the - * handle. \param vector_size: suggested vector size by the handle. - */ + /// \brief Returns the team size, either set by the user or suggested by the + /// handle. + /// \param vector_size_ suggested vector size by the handle. int get_suggested_team_size(const int vector_size_) { if (this->suggested_team_size != -1) { return this->suggested_team_size; diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index 12f4dff651..001fffe70e 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -42,7 +42,7 @@ //@HEADER */ -/// \file Kokkos_Sparse_BsrMatrix.hpp +/// \file KokkosSparse_BsrMatrix.hpp /// \brief Local sparse matrix interface /// /// This file provides KokkosSparse::Experimental::BsrMatrix. @@ -186,7 +186,6 @@ struct BsrRowView { /// \brief Return offset into colidx_ for the requested block idx /// If none found, return Kokkos::Details::ArithTraits::max /// \param idx_to_match [in] local block idx within block-row - /// \param is_sorted [in] defaulted to false; no usage at this time KOKKOS_INLINE_FUNCTION ordinal_type findRelBlockOffset(const ordinal_type idx_to_match, bool /*is_sorted*/ = false) const { @@ -241,6 +240,7 @@ struct BsrRowViewConst { /// /// \param values [in] Array of the row's values. /// \param colidx [in] Array of the row's column indices. + /// \param blockDim /// \param count [in] Number of entries in the row. /// \param start [in] Offset into values and colidx of the desired block-row /// start. 
@@ -322,7 +322,6 @@ struct BsrRowViewConst { /// \brief Return offset into colidx_ for the requested block idx /// If none found, return Kokkos::Details::ArithTraits::max /// \param idx_to_match [in] local block idx within block-row - /// \param is_sorted [in] defaulted to false; no usage at this time KOKKOS_INLINE_FUNCTION ordinal_type findRelBlockOffset(const ordinal_type& idx_to_match, bool /*is_sorted*/ = false) const { @@ -499,6 +498,7 @@ class BsrMatrix { /// val[k]. /// \param cols [in] The column indices. cols[k] is the column /// index of val[k]. + /// \param blockdim [in] The block dimensions. /// \param pad [in] If true, pad the sparse matrix's storage with /// zeros in order to improve cache alignment and / or /// vectorization. @@ -619,14 +619,14 @@ class BsrMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// - /// \param label [in] The sparse matrix's label. /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. - /// \param annz [in] The number of entries. + /// \param size_type [in] Filler for annz /// \param vals [in/out] The entries. /// \param rows [in/out] The row map (containing the offsets to the /// data in each row). /// \param cols [in/out] The column indices. + /// \param blockDimIn [in] The block dimensions. BsrMatrix(const std::string& /*label*/, const OrdinalType nrows, const OrdinalType ncols, const size_type /*annz*/, const values_type& vals, const row_map_type& rows, @@ -669,11 +669,10 @@ class BsrMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// - /// \param[in] label The sparse matrix's label. - /// \param[in] ncols The number of columns. - /// \param[in] vals The entries. - /// \param[in] graph_ The graph between the blocks. - /// \param[in] blockDimIn The block size. + /// \param ncols [in] The number of columns. + /// \param vals [in] The entries. 
+ /// \param graph_ [in] The graph between the blocks. + /// \param blockDimIn [in] The block size. BsrMatrix(const std::string& /*label*/, const OrdinalType& ncols, const values_type& vals, const staticcrsgraph_type& graph_, const OrdinalType& blockDimIn) @@ -803,17 +802,19 @@ class BsrMatrix { /// \brief Given an array of blocks, sum the values into corresponding /// block in BsrMatrix - /// \param[in] rowi is a block-row index - /// \param[in] ncol is number of blocks referenced in cols[] array - /// \param[in] cols[] are block colidxs within the block-row to be summed - /// into ncol entries - /// \param[in] vals[] array containing 'block' of values + /// \param rowi [in] is a block-row index + /// \param cols[] [in] are block colidxs within the block-row to be summed + /// into ncol entries + /// \param ncol [in] is number of blocks referenced in cols[] array + /// \param vals[] [in] array containing 'block' of values /// ncol*block_size*block_size entries /// assume vals block is provided in 'LayoutRight' or 'Row Major' /// format, that is e.g. 
2x2 block [ a b ; c d ] provided as flattened /// 1d array as [a b c d] Assume that each block is stored contiguously /// in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i /// in [0, ncols) for cols[] maps to i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] KOKKOS_INLINE_FUNCTION OrdinalType sumIntoValues(const OrdinalType rowi, const OrdinalType cols[], const OrdinalType ncol, const ScalarType vals[], @@ -825,17 +826,20 @@ class BsrMatrix { /// \brief Given an array of blocks, replace the values of corresponding /// blocks in BsrMatrix - /// \param[in] rowi is a block-row index - /// \param[in] ncol is number of blocks referenced in cols[] array - /// \param[in] cols[] are block colidxs within the block-row to be summed + /// \param rowi [in] is a block-row index + /// \param cols[] [in] are block colidxs within the block-row to be summed /// into ncol entries + /// \param ncol [in] is number of blocks referenced in cols[] array /// \param vals[] [in] array containing 'block' of values - // ncol*block_size*block_size entries - // assume vals block is provided in 'LayoutRight' or 'Row Major' - // format, that is e.g. 2x2 block [ a b ; c d ] provided as flattened - // 1d array as [a b c d] Assume that each block is stored contiguously - // in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i in - // [0, ncols) for cols[] maps to i*block_size*block_size in vals[] + /// ncol*block_size*block_size entries + /// assume vals block is provided in 'LayoutRight' or 'Row + /// Major' format, that is e.g. 
2x2 block [ a b ; c d ] provided + /// as flattened 1d array as [a b c d] Assume that each block is + /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c + /// d e f g h] If so, then i in [0, ncols) for cols[] maps to + /// i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] KOKKOS_INLINE_FUNCTION OrdinalType replaceValues(const OrdinalType rowi, const OrdinalType cols[], const OrdinalType ncol, const ScalarType vals[], @@ -970,17 +974,21 @@ class BsrMatrix { /// \brief Given an array of blocks, operate on the values of corresponding /// blocks in BsrMatrix - /// \param[in] rowi is a block-row index - /// \param[in] ncol is number of blocks referenced in cols[] array - /// \param[in] cols[] are block colidxs within the block-row to be op-ed + /// \param op + /// \param rowi [in] is a block-row index + /// \param ncol [in] is number of blocks referenced in cols[] array + /// \param cols[] [in] are block colidxs within the block-row to be op-ed /// into ncol entries /// \param vals[] [in] array containing 'block' of values - // ncol*block_size*block_size entries - // assume vals block is provided in 'LayoutRight' or 'Row Major' - // format, that is e.g. 2x2 block [ a b ; c d ] provided as flattened - // 1d array as [a b c d] Assume that each block is stored contiguously - // in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i in - // [0, ncols) for cols[] maps to i*block_size*block_size in vals[] + /// ncol*block_size*block_size entries + /// assume vals block is provided in 'LayoutRight' or 'Row + /// Major' format, that is e.g. 
2x2 block [ a b ; c d ] provided + /// as flattened 1d array as [a b c d] Assume that each block is + /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c + /// d e f g h] If so, then i in [0, ncols) for cols[] maps to + /// i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] KOKKOS_INLINE_FUNCTION OrdinalType operateValues(const BsrMatrix::valueOperation op, const OrdinalType rowi, const OrdinalType cols[], diff --git a/sparse/src/KokkosSparse_CcsMatrix.hpp b/sparse/src/KokkosSparse_CcsMatrix.hpp index c2ae9c806e..6ae05567ae 100644 --- a/sparse/src/KokkosSparse_CcsMatrix.hpp +++ b/sparse/src/KokkosSparse_CcsMatrix.hpp @@ -222,7 +222,6 @@ class CcsMatrix { /// The matrix will store and use the column map, indices, and values /// directly (by view, not by deep copy). /// - /// \param label [in] The sparse matrix's label. /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index 7a6459d88e..8580eb2c7d 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -187,29 +187,28 @@ struct SparseRowView { public: /// \brief Constructor /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in /// each of the above arrays. - /// \param count [in] Number of entries in the row. + /// \param count [in] Number of entries in the row. 
KOKKOS_INLINE_FUNCTION SparseRowView(value_type* const values, ordinal_type* const colidx__, const ordinal_type& stride, const ordinal_type& count) : values_(values), colidx_(colidx__), stride_(stride), length(count) {} /// \brief Constructor with offset into \c colidx array - /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. - /// \param idx [in] Start offset into \c colidx array - /// /// \tparam OffsetType The type of \c idx (see above). Must be a /// built-in integer type. This may differ from ordinal_type. /// For example, the matrix may have dimensions that fit in int, /// but a number of entries that does not fit in int. + /// + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array template KOKKOS_INLINE_FUNCTION SparseRowView( const typename MatrixType::values_type& values, @@ -287,11 +286,11 @@ struct SparseRowViewConst { public: /// \brief Constructor /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. 
KOKKOS_INLINE_FUNCTION SparseRowViewConst(value_type* const values, ordinal_type* const colidx__, const ordinal_type& stride, const ordinal_type& count) @@ -299,17 +298,16 @@ struct SparseRowViewConst { /// \brief Constructor with offset into \c colidx array /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. - /// \param idx [in] Start offset into \c colidx array - /// /// \tparam OffsetType The type of \c idx (see above). Must be a /// built-in integer type. This may differ from ordinal_type. /// For example, the matrix may have dimensions that fit in int, /// but a number of entries that does not fit in int. + /// \param values [in] Array of the row's values. + /// \param colidx__ [in] Array of the row's column indices. + /// \param stride [in] (Constant) stride between matrix entries in + /// each of the above arrays. + /// \param count [in] Number of entries in the row. + /// \param idx [in] Start offset into \c colidx array template KOKKOS_INLINE_FUNCTION SparseRowViewConst( const typename MatrixType::values_type& values, @@ -533,8 +531,9 @@ class CrsMatrix { /// The matrix will store and use the row map, indices /// (by view, not by deep copy) and allocate the values view. /// - /// \param label [in] The sparse matrix's label. - /// \param ncols [in] The number of columns. + /// \param label [in] The sparse matrix's label. + /// \param graph_ [in] + /// \param ncols [in] The number of columns. template CrsMatrix(const std::string& label, @@ -550,14 +549,9 @@ class CrsMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// - /// \param label [in] The sparse matrix's label. - /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. 
- /// \param annz [in] The number of entries. /// \param vals [in/out] The entries. - /// \param rows [in/out] The row map (containing the offsets to the - /// data in each row). - /// \param cols [in/out] The column indices. + /// \param graph_ template CrsMatrix(const std::string&, const OrdinalType& ncols, @@ -577,7 +571,6 @@ class CrsMatrix { /// This constructor is mainly useful for benchmarking or for /// reading the sparse matrix's data from a file. /// - /// \param label [in] The sparse matrix's label. /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. @@ -635,7 +628,6 @@ class CrsMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// - /// \param label [in] The sparse matrix's label. /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 1686cc0563..5541d5e683 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -41,7 +41,6 @@ // ************************************************************************ //@HEADER */ -/// @file KokkosKernels_MatrixPrec.hpp #ifndef KK_MATRIX_PREC_HPP #define KK_MATRIX_PREC_HPP @@ -55,6 +54,7 @@ namespace KokkosSparse { namespace Experimental { +/// @file KokkosSparse_MatrixPrec.hpp /// \class MatrixPrec /// \brief This is a simple class to use if one /// already has a matrix representation of their diff --git a/sparse/src/KokkosSparse_OrdinalTraits.hpp b/sparse/src/KokkosSparse_OrdinalTraits.hpp index 21d44cf57c..29bcb2c633 100644 --- a/sparse/src/KokkosSparse_OrdinalTraits.hpp +++ b/sparse/src/KokkosSparse_OrdinalTraits.hpp @@ -45,7 +45,7 @@ #ifndef KOKKOS_SPARSE_ORDINALTRAITS_HPP_ #define KOKKOS_SPARSE_ORDINALTRAITS_HPP_ -/// \file Kokkos_Sparse_OrdinalTraits.hpp 
+/// \file KokkosSparse_OrdinalTraits.hpp /// \brief Declaration and definition of KokkosSparse::OrdinalTraits, /// a traits class for "invalid" (flag) values of integer types that /// KokkosKernels uses as local ordinals or global ordinals. diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 999924c9c0..dbde21f4bd 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -41,8 +41,6 @@ // ************************************************************************ //@HEADER */ -/// @file KokkosKernels_Preconditioner.hpp -// #ifndef KK_PREC_HPP #define KK_PREC_HPP @@ -53,6 +51,7 @@ namespace KokkosSparse { namespace Experimental { +/// @file KokkosSparse_Preconditioner.hpp /// \class Preconditioner /// \brief Interface for KokkosKernels preconditioners /// \tparam ScalarType Type of the matrix's entries diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index f01e238d5d..e62ae9606d 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -752,26 +752,27 @@ struct Reverse_Map_Functor { } }; -/** - * \brief Utility function to obtain a reverse map given a map. - * Input is a map with the number of elements within the map. - * forward_map[c] = i, where c is a forward element and forward_map has a size - * of num_forward_elements. i is the value that c is mapped in the forward map, - * and the range of that is num_reverse_elements. Output is the reverse_map_xadj - * and reverse_map_adj such that, all c, forward_map[c] = i, will appear in - * reverse_map_adj[ reverse_map_xadj[i]: reverse_map_xadj[i+1]) \param: - * num_forward_elements: the number of elements in the forward map, the size of - * the forward map. \param: num_reverse_elements: the number of elements that - * forward map is mapped to. It is the value of max i. \param: forward_map: - * input forward_map, where forward_map[c] = i. 
\param: reverse_map_xadj: - * reverse map xadj, that is it will hold the beginning and end indices on - * reverse_map_adj such that all values mapped to i will be [ - * reverse_map_xadj[i]: reverse_map_xadj[i+1]) its size will be - * num_reverse_elements + 1. NO NEED TO INITIALIZE. \param: reverse_map_adj: - * reverse map adj, holds the values of reverse maps. Its size is - * num_forward_elements. - * - */ +/// \brief Utility function to obtain a reverse map given a map. +/// Input is a map with the number of elements within the map. +/// forward_map[c] = i, where c is a forward element and forward_map has a size +/// of num_forward_elements. i is the value that c is mapped in the forward map, +/// and the range of that is num_reverse_elements. Output is the +/// reverse_map_xadj and reverse_map_adj such that, all c, forward_map[c] = i, +/// will appear in reverse_map_adj[ reverse_map_xadj[i]: reverse_map_xadj[i+1]) + +/// \param num_forward_elements the number of elements in the forward map, +/// the size of the forward map. +/// \param num_reverse_elements the number of elements that +/// forward map is mapped to. It is the value of max i. +/// \param forward_map input forward_map, where forward_map[c] = i. +/// \param reverse_map_xadj +/// reverse map xadj, that is it will hold the beginning and +/// end indices on reverse_map_adj such that all values mapped +/// to i will be [reverse_map_xadj[i]: reverse_map_xadj[i+1]) +// its size will be num_reverse_elements + 1. +/// NO NEED TO INITIALIZE. +/// \param reverse_map_adj reverse map adj, holds the values of reverse +/// maps. Its size is num_forward_elements. template void kk_create_reverse_map( @@ -933,25 +934,13 @@ struct ColorChecker { } }; -/** - * \brief given a graph and a coloring function returns true or false if - distance-1 coloring is valid or not. 
- * \param num_rows: num rows in input graph - * \param num_cols: num cols in input graph - * \param xadj: row pointers of the input graph - * \param adj: column indices of the input graph - * \param t_xadj: output, the row indices of the output graph. MUST BE - INITIALIZED WITH ZEROES. - - * \param vector_size: suggested vector size, optional. if -1, kernel will - decide. - * \param suggested_team_size: suggested team size, optional. if -1, kernel will - decide. - * \param team_work_chunk_size: suggested work size of a team, optional. if -1, - kernel will decide. - * \param use_dynamic_scheduling: whether to use dynamic scheduling. Default is - true. - */ +/// \brief given a graph and a coloring function returns true or false if +/// distance-1 coloring is valid or not. +/// +/// \param num_rows num rows in input graph +/// \param xadj row pointers of the input graph +/// \param adj column indices of the input graphw +/// \param v_colors template inline size_t kk_is_d1_coloring_valid( diff --git a/sparse/src/KokkosSparse_findRelOffset.hpp b/sparse/src/KokkosSparse_findRelOffset.hpp index 3eb1100821..5904894ed2 100644 --- a/sparse/src/KokkosSparse_findRelOffset.hpp +++ b/sparse/src/KokkosSparse_findRelOffset.hpp @@ -45,7 +45,7 @@ #ifndef KOKKOS_SPARSE_FINDRELOFFSET_HPP #define KOKKOS_SPARSE_FINDRELOFFSET_HPP -/// \file Kokkos_Sparse_findRelOffset.hpp +/// \file KokkosSparse_findRelOffset.hpp /// \brief Find the relative offset of a column index in a sparse /// graph's or sparse matrix's row. diff --git a/sparse/src/KokkosSparse_getDiagCopy.hpp b/sparse/src/KokkosSparse_getDiagCopy.hpp index c1d45b13ec..02c9041e44 100644 --- a/sparse/src/KokkosSparse_getDiagCopy.hpp +++ b/sparse/src/KokkosSparse_getDiagCopy.hpp @@ -42,7 +42,7 @@ //@HEADER */ -/// \file Kokkos_Sparse_getDiagCopy.hpp +/// \file KokkosSparse_getDiagCopy.hpp /// \brief Get a copy of the diagonal entries of a KokkosSparse::CrsMatrix. 
#ifndef KOKKOS_SPARSE_GETDIAGCOPY_HPP_ diff --git a/sparse/src/KokkosSparse_spadd_handle.hpp b/sparse/src/KokkosSparse_spadd_handle.hpp index 917b1038a6..29c78c980d 100644 --- a/sparse/src/KokkosSparse_spadd_handle.hpp +++ b/sparse/src/KokkosSparse_spadd_handle.hpp @@ -76,11 +76,9 @@ class SPADDHandle { nnz_lno_view_t b_pos; public: - /** - * \brief sets the result nnz size. - * \param result_nnz_size: size of the output matrix. - */ - + /// \brief sets the result nnz size. + /// \param a_pos_in + /// \param b_pos_in void set_a_b_pos(const nnz_lno_view_t& a_pos_in, const nnz_lno_view_t& b_pos_in) { a_pos = a_pos_in; @@ -91,10 +89,8 @@ class SPADDHandle { nnz_lno_view_t get_b_pos() { return b_pos; } - /** - * \brief sets the result nnz size. - * \param result_nnz_size: size of the output matrix. - */ + /// \brief sets the result nnz size. + /// \param result_nnz_size_ size of the output matrix. void set_c_nnz(size_type result_nnz_size_) { this->result_nnz_size = result_nnz_size_; } diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 8d9ebc9ed7..7dc3d9cdf1 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -413,10 +413,8 @@ class SPGEMMHandle { num_used_colors = num_used_colors_; } - /** - * \brief sets the result nnz size. - * \param result_nnz_size: size of the output matrix. - */ + /// \brief sets the result nnz size. + /// \param result_nnz_size_ size of the output matrix. void set_c_nnz(size_type result_nnz_size_) { this->result_nnz_size = result_nnz_size_; } diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index a95ae4fe90..983eaf8185 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -1323,12 +1323,15 @@ void spmv_struct(const char mode[], const int stencil_type, /// entries of y; if alpha == 0, ignore the entries of A and x. 
/// /// \param mode [in] "N" for no transpose, "T" for transpose, or "C" -/// for conjugate transpose. +/// for conjugate transpose. +/// \param stencil_type /// \param structure [in] this 1D view stores the # rows in each dimension -/// (i,j,k) \param alpha [in] Scalar multiplier for the matrix A. \param A [in] -/// The sparse matrix; KokkosSparse::CrsMatrix instance. \param x [in] Either a -/// single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). +/// (i,j,k) +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix; KokkosSparse::CrsMatrix instance. +/// \param x [in] Either a +/// single vector (rank-1 Kokkos::View) or +/// multivector (rank-2 Kokkos::View). /// \param beta [in] Scalar multiplier for the (multi)vector y. /// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or /// multivector (rank-2 Kokkos::View). It must have the same number diff --git a/sparse/src/KokkosSparse_trsv.hpp b/sparse/src/KokkosSparse_trsv.hpp index eda3501ad0..f6d696cbc8 100644 --- a/sparse/src/KokkosSparse_trsv.hpp +++ b/sparse/src/KokkosSparse_trsv.hpp @@ -42,7 +42,7 @@ //@HEADER */ -/// \file Kokkos_Sparse_trsv.hpp +/// \file KokkosSparse_trsv.hpp /// \brief Local sparse triangular solve /// /// This file provides KokkosSparse::trsv. 
This function performs a From e04475d5566158b89b4a67e9980600d75642021f Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 30 Jan 2023 16:53:08 -0700 Subject: [PATCH 010/442] Things building --- blas/src/KokkosBlas3_trsm.hpp | 4 +- sparse/src/KokkosSparse_LUPrec.hpp | 140 +++++++++++++++++++++ sparse/src/KokkosSparse_MatrixPrec.hpp | 21 ++-- sparse/src/KokkosSparse_Preconditioner.hpp | 6 +- sparse/unit_test/Test_Sparse_gmres.hpp | 11 +- sparse/unit_test/Test_Sparse_par_ilut.hpp | 97 +++++++++++--- 6 files changed, 231 insertions(+), 48 deletions(-) create mode 100644 sparse/src/KokkosSparse_LUPrec.hpp diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 1e73d92049..33c9fbaabe 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -70,8 +70,8 @@ namespace KokkosBlas { /// "L" or "l" indicates matrix A lower part is stored, the /// other part is not referenced /// \param trans [in] "N" or "n" for non-transpose, "T" or "t" for transpose, -/// "C" or "c" for conjugate transpose. \param diag [in] "U" or "u" indicates -/// the diagonal of A is assumed to be unit +/// "C" or "c" for conjugate transpose. +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be unit // "N" or "n" indicated the diagonal of A is assumed to be // non-unit /// \param alpha [in] Input coefficient used for multiplication with B diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp new file mode 100644 index 0000000000..2cdb6a1dc7 --- /dev/null +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -0,0 +1,140 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Jennifer Loe (jloe@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +/// @file KokkosKernels_LUPrec.hpp + +#ifndef KK_LU_PREC_HPP +#define KK_LU_PREC_HPP + +#include +#include +#include +#include +#include + +namespace KokkosSparse { + +namespace Experimental { + +/// \class LUPrec +/// \brief This class is for applying LU preconditioning. 
+/// It takes L and U and the apply method returns U^inv L^inv x +/// \tparam CRS the CRS type of L and U +/// +/// Preconditioner provides the following methods +/// - initialize() Does nothing; members initialized upon object construction. +/// - isInitialized() returns true +/// - compute() Does nothing; members initialized upon object construction. +/// - isComputed() returns true +/// +template +class LUPrec : public KokkosSparse::Experimental::Preconditioner { + public: + using ScalarType = typename std::remove_const::type; + using EXSP = typename CRS::execution_space; + using karith = typename Kokkos::ArithTraits; + + private: + Kokkos::View _L, _U, _tmp; + + public: + //! Constructor: + template + LUPrec(const ViewArg &L, const ViewArg &U) : _L(L), _U(U), _tmp("LUPrec::_tmp", _L.extent(0), 1) {} + + //! Destructor. + virtual ~LUPrec() {} + + ///// \brief Apply the preconditioner to X, putting the result in Y. + ///// + ///// \tparam XViewType Input vector, as a 1-D Kokkos::View + ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View + ///// + ///// \param transM [in] "N" for non-transpose, "T" for transpose, "C" + ///// for conjugate transpose. All characters after the first are + ///// ignored. This works just like the BLAS routines. + ///// \param alpha [in] Input coefficient of M*x + ///// \param beta [in] Input coefficient of Y + ///// + ///// If the result of applying this preconditioner to a vector X is + ///// \f$M \cdot X\f$, then this method computes \f$Y = \beta Y + \alpha M + ///\cdot X\f$. + ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. 
+ // + virtual void apply(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", + ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + + + // tmp = trsm(L, x); //Apply L^inv to x + // y = trsm(U, tmp); //Apply U^inv to tmp + auto tmpsv = Kokkos::subview(_tmp, Kokkos::ALL, 0); + Kokkos::deep_copy(tmpsv, X); + KokkosBlas::Impl::SerialTrsm_Invoke("L", "L", transM, "N", alpha, _L, _tmp); + KokkosBlas::Impl::SerialTrsm_Invoke("L", "U", transM, "N", alpha, _U, _tmp); + Kokkos::deep_copy(Y, tmpsv); + } + //@} + + //! Set this preconditioner's parameters. + void setParameters() {} + + void initialize() {} + + //! True if the preconditioner has been successfully initialized, else false. + bool isInitialized() const { return true; } + + void compute() {} + + //! True if the preconditioner has been successfully computed, else false. + bool isComputed() const { return true; } + + //! True if the preconditioner implements a transpose operator apply. + bool hasTransposeApply() const { return true; } +}; +} // namespace Experimental +} // End namespace KokkosSparse + +#endif diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 1686cc0563..25c1431841 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -60,13 +60,9 @@ namespace Experimental { /// already has a matrix representation of their /// preconditioner M. The class applies an /// SpMV with M as the preconditioning step. 
-/// \tparam ScalarType Type of the matrix's entries -/// \tparam Layout Kokkos layout of vectors X and Y to which -/// the preconditioner is applied -/// \tparam EXSP Execution space for the preconditioner apply -/// \tparam Ordinal Type of the matrix's indices; +/// \tparam CRS the type of compressed matrix /// -/// Preconditioner provides the following methods +/// MatrixPrec provides the following methods /// - initialize() Does nothing; Matrix initialized upon object construction. /// - isInitialized() returns true /// - compute() Does nothing; Matrix initialized upon object construction. @@ -75,10 +71,7 @@ namespace Experimental { template class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { private: - CRS A; - - bool isInitialized_ = true; - bool isComputed_ = true; + CRS _A; public: using ScalarType = typename std::remove_const::type; @@ -87,7 +80,7 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { //! Constructor: template - MatrixPrec(const CRSArg &mat) : A(mat) {} + MatrixPrec(const CRSArg &mat) : _A(mat) {} //! Destructor. virtual ~MatrixPrec() {} @@ -113,7 +106,7 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { - KokkosSparse::spmv(transM, alpha, A, X, beta, Y); + KokkosSparse::spmv(transM, alpha, _A, X, beta, Y); } //@} @@ -123,12 +116,12 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { void initialize() {} //! True if the preconditioner has been successfully initialized, else false. - bool isInitialized() const { return isInitialized_; } + bool isInitialized() const { return true; } void compute() {} //! True if the preconditioner has been successfully computed, else false. - bool isComputed() const { return isComputed_; } + bool isComputed() const { return true; } //! True if the preconditioner implements a transpose operator apply. 
bool hasTransposeApply() const { return true; } diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 999924c9c0..eab81f1d13 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -55,11 +55,7 @@ namespace Experimental { /// \class Preconditioner /// \brief Interface for KokkosKernels preconditioners -/// \tparam ScalarType Type of the matrix's entries -/// \tparam Layout Kokkos layout of vectors X and Y to which -/// the preconditioner is applied -/// \tparam EXSP Execution space for the preconditioner apply -/// \tparam Ordinal Type of the matrix's indices; +/// \tparam CRS Type of the compressed matrix /// /// Preconditioner provides the following methods /// - initialize() performs all operations based on the graph of the diff --git a/sparse/unit_test/Test_Sparse_gmres.hpp b/sparse/unit_test/Test_Sparse_gmres.hpp index 47b47f7f98..df2bce9baa 100644 --- a/sparse/unit_test/Test_Sparse_gmres.hpp +++ b/sparse/unit_test/Test_Sparse_gmres.hpp @@ -112,14 +112,13 @@ void run_test_gmres() { ViewVectorType Wj("Wj", n); // For checking residuals at end. 
ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); gmres_handle->set_verbose(verbose); // Test CGS2 { - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B, 1.0); - gmres(&kh, A, B, X); // Double check residuals at end of solve: @@ -163,12 +162,12 @@ void run_test_gmres() { gmres_handle->set_verbose(verbose); // Make precond - auto myPrec = new KokkosSparse::Experimental::MatrixPrec(A); + KokkosSparse::Experimental::MatrixPrec myPrec(A); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); - gmres(&kh, A, B, X, myPrec); + gmres(&kh, A, B, X, &myPrec); // Double check residuals at end of solve: float_t nrmB = KokkosBlas::nrm2(B); @@ -180,8 +179,6 @@ void run_test_gmres() { EXPECT_LT(endRes, gmres_handle->get_tol()); EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - - delete myPrec; } } diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 6ff31c10b8..bb088bc632 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -53,6 +53,8 @@ #include "KokkosBlas1_nrm2.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_par_ilut.hpp" +#include "KokkosSparse_gmres.hpp" +#include "KokkosSparse_LUPrec.hpp" #include @@ -110,6 +112,27 @@ std::vector> decompress_matrix( return result; } +template +void decompress_matrix( + Kokkos::View& row_map, + Kokkos::View& entries, + Kokkos::View& values, + Kokkos::View& output) { + const size_type nrows = row_map.size() - 1; + + Kokkos::parallel_for(nrows, KOKKOS_LAMBDA (const int& row_idx) { + const size_type row_nnz_begin = row_map(row_idx); + const size_type row_nnz_end = row_map(row_idx + 1); + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { + const lno_t col_idx = entries(row_nnz); + const scalar_t value = values(row_nnz); + output(row_idx, col_idx) = 
value; + } + }); +} + + template void check_matrix(const std::string& name, @@ -387,34 +410,68 @@ void run_test_par_ilut_precond() { #endif ); - // Create LU^inv + // Convert L, U parILUT outputs to uncompressed 2d views as required + // by LUPrec + Kokkos::View + L_uncompressed("L_uncompressed", numRows, numRows), + U_uncompressed("U_uncompressed", numRows, numRows); + decompress_matrix(L_row_map, L_entries, L_values, L_uncompressed); + decompress_matrix(U_row_map, U_entries, U_values, U_uncompressed); + + // Set initial vectors: + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + int num_iters_plain(0), num_iters_precond(0); + + // Solve Ax = b { - std::string myalg("SPGEMM_KK_MEMORY"); - KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = - KokkosSparse::StringToSPGEMMAlgorithm(myalg); - kh.create_spgemm_handle(spgemm_algorithm); - kh.create_spadd_handle(true /*we expect inputs to be sorted*/); + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
+ float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_plain = gmres_handle->get_num_iters(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } + + // Solve Ax = b with LU preconditioner + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); + + // Make precond + KokkosSparse::Experimental::LUPrec myPrec(L_uncompressed, U_uncompressed); - KokkosSparse::Experimental::spgemm_symbolic( - &kh, numRows, numRows, numRows, L_row_map, L_entries, false, U_row_map, - U_entries, false, LU_row_map); + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); - const size_type lu_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); - Kokkos::resize(LU_entries, lu_nnz_size); - Kokkos::resize(LU_values, lu_nnz_size); + gmres(&kh, A, B, X, &myPrec); - KokkosSparse::Experimental::spgemm_numeric( - &kh, numRows, numRows, numRows, L_row_map, L_entries, L_values, false, - U_row_map, U_entries, U_values, false, LU_row_map, LU_entries, - LU_values); + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; - // Need to sort LU CRS if on CUDA! 
- KokkosSparse::sort_crs_matrix(LU_row_map, LU_entries, LU_values); + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_precond = gmres_handle->get_num_iters(); - kh.destroy_spgemm_handle(); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + EXPECT_LT(num_iters_precond, num_iters_plain); } - } } // namespace Test From 11be16b616294dfd3d29e9d27fd1ba98fee48b7a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Jan 2023 17:26:41 -0700 Subject: [PATCH 011/442] Fixing deprecated usage of Kokkos::Impl::ALL_t in favor of Kokkos::ALL_t --- batched/KokkosBatched_Util.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 3e97c97d72..c848d7acda 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -612,8 +612,8 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, } template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::Impl::ALL_t i2, - Kokkos::Impl::ALL_t i3, + Kokkos::ALL_t i2, + Kokkos::ALL_t i3, const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); @@ -645,7 +645,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( } template KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, + ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, const BatchLayout::Right &layout_tag, const Trans::Transpose &) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); From 215c6beb0ab75613405db9d2da21714a2392d52b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Jan 2023 17:27:21 -0700 Subject: [PATCH 012/442] Benchmarks: for some reason the current version fails to build I suspect an invisible character was added on the modified line by the author's text editor...
In any case this version now compiles fine. --- perf_test/Benchmark_Context.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 8e356841de..4bd02321dd 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -43,7 +43,7 @@ */ #ifndef KOKKOSKERNELS_PERFTEST_BENCHMARK_CONTEXT_HPP -#define KOKKOSKENERLS_PERFTEST_BENCHMARK_CONTEXT_HPP +#define KOKKOSKERNELS_PERFTEST_BENCHMARK_CONTEXT_HPP #include From 6c5744fd6ae3e4fbb89a6b515b8a9ee7ff8071e9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 30 Jan 2023 17:30:26 -0700 Subject: [PATCH 013/442] Applying clang-format --- batched/KokkosBatched_Util.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index c848d7acda..35125cdd98 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -612,8 +612,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, } template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::ALL_t i2, - Kokkos::ALL_t i3, + Kokkos::ALL_t i2, Kokkos::ALL_t i3, const BatchLayout::Left &layout_tag, const Trans::Transpose) { auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); From fed582cb5e5c3031293261773f7a16aeeb5e19f1 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Tue, 31 Jan 2023 07:37:00 -0700 Subject: [PATCH 014/442] Fix an error in Krylov Handle documentation --- batched/sparse/src/KokkosBatched_Krylov_Handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index aff0219175..2ea489d307 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -114,7 +114,7 @@ class KrylovHandle { Kokkos::deep_copy(first_index, first_index_host); 
Kokkos::deep_copy(last_index, last_index_host); - // Default Classical GS + // Default modified GS ortho_strategy = 1; scratch_pad_level = 0; compute_last_residual = true; From b846db97e3b6a9a379dfbef09ad122f523607cce Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Tue, 31 Jan 2023 08:47:36 -0700 Subject: [PATCH 015/442] Apply suggestions from code review --- blas/src/KokkosBlas1_nrm2w_squared.hpp | 2 +- sparse/src/KokkosSparse_BsrMatrix.hpp | 4 ++-- sparse/src/KokkosSparse_CrsMatrix.hpp | 4 ++-- sparse/src/KokkosSparse_Utils.hpp | 2 +- sparse/src/KokkosSparse_spadd_handle.hpp | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index d07dabdaca..648d2c8d18 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -56,7 +56,7 @@ namespace KokkosBlas { /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// /// \param x [in] Input 1-D View. -/// \param w [in] +/// \param w [in] Input weights (1-D View). /// /// \return The nrm2w product result; a single value. template diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index 001fffe70e..f1c3c0df71 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -240,7 +240,7 @@ struct BsrRowViewConst { /// /// \param values [in] Array of the row's values. /// \param colidx [in] Array of the row's column indices. - /// \param blockDim + /// \param blockDim [in] The block dimensions. /// \param count [in] Number of entries in the row. /// \param start [in] Offset into values and colidx of the desired block-row /// start. @@ -672,7 +672,7 @@ class BsrMatrix { /// \param ncols [in] The number of columns. /// \param vals [in] The entries. /// \param graph_ [in] The graph between the blocks. - /// \param blockDimIn [in] The block size. 
+ /// \param blockDimIn [in] The block dimensions. BsrMatrix(const std::string& /*label*/, const OrdinalType& ncols, const values_type& vals, const staticcrsgraph_type& graph_, const OrdinalType& blockDimIn) diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index 8580eb2c7d..2166d4e63e 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -532,7 +532,7 @@ class CrsMatrix { /// (by view, not by deep copy) and allocate the values view. /// /// \param label [in] The sparse matrix's label. - /// \param graph_ [in] + /// \param graph_ [in] The graph for storing the rowmap and col ids. /// \param ncols [in] The number of columns. template @@ -551,7 +551,7 @@ class CrsMatrix { /// /// \param ncols [in] The number of columns. /// \param vals [in/out] The entries. - /// \param graph_ + /// \param graph_ The graph for storing the rowmap and col ids. template CrsMatrix(const std::string&, const OrdinalType& ncols, diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index e62ae9606d..e99d9a5057 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -940,7 +940,7 @@ struct ColorChecker { /// \param num_rows num rows in input graph /// \param xadj row pointers of the input graph /// \param adj column indices of the input graphw -/// \param v_colors +/// \param v_colors The colors at each vertex in the graph. template inline size_t kk_is_d1_coloring_valid( diff --git a/sparse/src/KokkosSparse_spadd_handle.hpp b/sparse/src/KokkosSparse_spadd_handle.hpp index 29c78c980d..ccf3c17e9b 100644 --- a/sparse/src/KokkosSparse_spadd_handle.hpp +++ b/sparse/src/KokkosSparse_spadd_handle.hpp @@ -77,8 +77,8 @@ class SPADDHandle { public: /// \brief sets the result nnz size. - /// \param a_pos_in - /// \param b_pos_in + /// \param a_pos_in The offset into a. + /// \param b_pos_in The offset into b. 
void set_a_b_pos(const nnz_lno_view_t& a_pos_in, const nnz_lno_view_t& b_pos_in) { a_pos = a_pos_in; From d2f273c02b9f3dfd93ccb09c25768449641d51e1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 31 Jan 2023 08:47:59 -0700 Subject: [PATCH 016/442] osx-ci: adding option to disable deprecated_code_4 in Kokkos This will eliminate some of the build warnings that we currently see in our CI builds of Kokkos Core. --- .github/workflows/osx.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index f8b014587b..d1504158c9 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -72,6 +72,7 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=${{ matrix.debug_bounds_check }} \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. From 86edac3b13f1e073379b6d255fe776a21e75e6c8 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 31 Jan 2023 10:21:04 -0700 Subject: [PATCH 017/442] Minor fixes --- sparse/unit_test/Test_Sparse_par_ilut.hpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index bb088bc632..90374178c9 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -386,9 +386,6 @@ void run_test_par_ilut_precond() { // Allocate L and U CRS views as outputs RowMapType L_row_map ("L_row_map", numRows + 1); RowMapType U_row_map ("U_row_map", numRows + 1); - RowMapType LU_row_map("LU_row_map", numRows + 1); - EntriesType LU_entries("LU_entries"); - ValuesType LU_values ("LU_values"); // Initial L/U approximations for A par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); @@ -415,8 +412,8 @@ void run_test_par_ilut_precond() { Kokkos::View L_uncompressed("L_uncompressed",
numRows, numRows), U_uncompressed("U_uncompressed", numRows, numRows); - decompress_matrix(L_row_map, L_entries, L_values, L_uncompressed); - decompress_matrix(U_row_map, U_entries, U_values, U_uncompressed); + decompress_matrix(L_row_map, L_entries, L_values, L_uncompressed); + decompress_matrix(U_row_map, U_entries, U_values, U_uncompressed); // Set initial vectors: ViewVectorType X("X", n); // Solution and initial guess From cc38f32ef9683c0e9dc2056d91364cfda443274e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Jan 2023 11:31:46 -0700 Subject: [PATCH 018/442] Add deprecated code disable to docs build. --- .github/workflows/docs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index ea377dafcd..78ac43b04d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -43,6 +43,7 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ .. - name: build_and_install_kokkos From 51c3c5a0c66c9b600d7ec1fccd7660659d416fcb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Jan 2023 11:34:37 -0700 Subject: [PATCH 019/442] Fix whitespace --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 78ac43b04d..8a0feef0a2 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -43,7 +43,7 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ .. 
- name: build_and_install_kokkos From b4f3dd0ebd39e0943cd1e5ba2681212109b84b10 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Jan 2023 11:38:27 -0700 Subject: [PATCH 020/442] Fix documentation regressions --- sparse/src/KokkosSparse_MatrixPrec.hpp | 2 +- sparse/src/KokkosSparse_Preconditioner.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 3c823781ad..cfc7f0ae17 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -14,7 +14,7 @@ // //@HEADER -/// @file KokkosKernels_MatrixPrec.hpp +/// @file KokkosSparse_MatrixPrec.hpp #ifndef KK_MATRIX_PREC_HPP #define KK_MATRIX_PREC_HPP diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 02f6f0dd67..ab20764825 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -14,7 +14,7 @@ // //@HEADER -/// @file KokkosKernels_Preconditioner.hpp +/// @file KokkosSparse_Preconditioner.hpp #ifndef KK_PREC_HPP #define KK_PREC_HPP From 3074b4b01ba5dee7c45f99410b0267f791545347 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 31 Jan 2023 14:35:13 -0700 Subject: [PATCH 021/442] CMakeLists.txt: update version to 4.0.99 Update develop following creation of release-candidate-4.0.0 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ce714e1ff..443b124cb2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,8 +23,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) IF(NOT DEFINED ${PROJECT_NAME}) PROJECT(KokkosKernels CXX) ENDIF() - SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 7) + SET(KokkosKernels_VERSION_MAJOR 4) + SET(KokkosKernels_VERSION_MINOR 0) SET(KokkosKernels_VERSION_PATCH 99) SET(KokkosKernels_VERSION 
"${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") From 93ecefbc9c5e8468306a9a26dc845af677153abe Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 31 Jan 2023 14:35:46 -0700 Subject: [PATCH 022/442] Fix LUPrec license --- sparse/src/KokkosSparse_LUPrec.hpp | 35 +++++------------------------- 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 2cdb6a1dc7..bc0b51aa78 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -2,41 +2,16 @@ //@HEADER // ************************************************************************ // -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Jennifer Loe (jloe@sandia.gov) +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // ************************************************************************ //@HEADER From 12e1b814b8b1442a55e4a507d848063f7fa8693e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 11:15:30 -0500 Subject: [PATCH 023/442] Do not use Kokkos::Impl::clock_tic, prefer std::chrono to get a random seed --- batched/dense/unit_test/Test_Batched_SerialTrmm.hpp | 5 ++++- batched/dense/unit_test/Test_Batched_SerialTrtri.hpp | 5 ++++- blas/unit_test/Test_Blas3_gemm.hpp | 8 ++++++-- blas/unit_test/Test_Blas3_trmm.hpp | 7 +++++-- blas/unit_test/Test_Blas3_trsm.hpp | 5 ++++- blas/unit_test/Test_Blas_trtri.hpp | 7 +++++-- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 10 +++++++--- perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 5 ++++- perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp | 5 ++++- 9 files changed, 43 insertions(+), 14 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp 
b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp index b688a46e2e..7082d183a5 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp @@ -22,6 +22,8 @@ #include "KokkosKernels_TestUtils.hpp" +#include + using namespace KokkosBatched; namespace Test { @@ -165,7 +167,8 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, Kokkos::create_mirror_view(B_actual); typename ViewType::HostMirror B_expected_host = Kokkos::create_mirror_view(B_expected); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp index d9fb714008..ee0982efd4 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp @@ -22,6 +22,8 @@ #include "KokkosKernels_TestUtils.hpp" +#include + #define PRINT_MAT 0 using namespace KokkosBatched; @@ -161,7 +163,8 @@ void impl_test_batched_trtri(const int N, const int K) { typename ViewType::HostMirror I_host = Kokkos::create_mirror_view(A_I); typename ViewType::HostMirror A_host = Kokkos::create_mirror_view(A); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); using ViewTypeSubA = decltype(Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL())); diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index c3345ae0d4..8ab92e25b1 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template rand_pool(seed); Kokkos::fill_random(A, rand_pool, Kokkos::rand rand_pool(seed); // (SA 11 Dec 2019) Max (previously: 10) 
increased to detect the bug in diff --git a/blas/unit_test/Test_Blas3_trmm.hpp b/blas/unit_test/Test_Blas3_trmm.hpp index bf44debaf4..f52dd8dd54 100644 --- a/blas/unit_test/Test_Blas3_trmm.hpp +++ b/blas/unit_test/Test_Blas3_trmm.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template @@ -110,8 +112,9 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, ViewTypeA A("A", K, K); ViewTypeB B("B", M, N); ViewTypeB B_expected("B_expected", M, N); - uint64_t seed = Kokkos::Impl::clock_tic(); - ScalarA beta = ScalarA(0); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); // printf("KokkosBlas::trmm test for alpha %g, %c %c %c %c, M %d, N %d, eps // %g, ViewType: %s\n", diff --git a/blas/unit_test/Test_Blas3_trsm.hpp b/blas/unit_test/Test_Blas3_trsm.hpp index 5857f7a533..79859aa24a 100644 --- a/blas/unit_test/Test_Blas3_trsm.hpp +++ b/blas/unit_test/Test_Blas3_trsm.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template @@ -121,7 +123,8 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, typename ViewTypeB::HostMirror h_B = Kokkos::create_mirror_view(B); typename ViewTypeB::HostMirror h_X0 = Kokkos::create_mirror_view(X0); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); if ((diag[0] == 'U') || (diag[0] == 'u')) { diff --git a/blas/unit_test/Test_Blas_trtri.hpp b/blas/unit_test/Test_Blas_trtri.hpp index d333b963b4..518b96495f 100644 --- a/blas/unit_test/Test_Blas_trtri.hpp +++ b/blas/unit_test/Test_Blas_trtri.hpp @@ -19,6 +19,8 @@ #include #include +#include + namespace Test { template @@ -109,8 +111,9 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, ViewTypeA A("A", M, N); ViewTypeA A_original("A_original", M, N); ViewTypeA A_I("A_I", M, N); // is I 
taken...? - uint64_t seed = Kokkos::Impl::clock_tic(); - ScalarA beta = ScalarA(0); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + ScalarA beta = ScalarA(0); ScalarA cur_check_val; // Either 1 or 0, to check A_I // const int As0 = A.stride(0), As1 = A.stride(1); diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index e1137aaeea..40bc80d0df 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -33,6 +33,8 @@ #include "gtest/gtest.h" // EXPECT_NEAR #include "KokkosKernels_TestUtils.hpp" +#include + #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #include "armpl.h" #else @@ -1334,7 +1336,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { simd_view_type C("C", simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); - // uint64_t seed = Kokkos::Impl::clock_tic(); + // uint64_t seed = + // std::chrono::high_resolution_clock::now().time_since_epoch().count(); // Kokkos::Random_XorShift64_Pool rand_pool(seed); // Kokkos::fill_random(A, rand_pool, // Kokkos::rand, @@ -1444,7 +1447,7 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { view_type C((scalar_type *)C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); internal_vector_view_type C_vector_internal(C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); @@ -1914,7 +1917,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { using execution_space = typename device_type::execution_space; 
gemm_args_t gemm_args; - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 7f7e8e25ad..90f7a90617 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -27,6 +27,8 @@ #include "KokkosBatched_Trmm_Serial_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include + //#define PERF_TEST_DEBUG // Forward declarations @@ -611,7 +613,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { using execution_space = typename device_type::execution_space; trmm_args_t trmm_args; - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); decltype(dim.a.m) min_dim = dim.a.m < dim.a.n ? dim.a.m : dim.a.n; typename vta::HostMirror host_A; diff --git a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp index 7a81a191bb..cbadcef0b1 100644 --- a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp @@ -27,6 +27,8 @@ #include "KokkosBatched_Trtri_Serial_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include + //#define TRTRI_PERF_TEST_DEBUG // Forward declarations @@ -436,7 +438,8 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { using execution_space = typename device_type::execution_space; trtri_args_t trtri_args; - uint64_t seed = Kokkos::Impl::clock_tic(); + uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); Kokkos::Random_XorShift64_Pool rand_pool(seed); decltype(dim.a.m) min_dim = dim.a.m < dim.a.n ? 
dim.a.m : dim.a.n; typename vta::HostMirror host_A; From a4bea479838400170304d7f32f614d53ff0d22f4 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 24 Jan 2023 21:14:46 +0000 Subject: [PATCH 024/442] Replace printf in device code for SYCL --- common/src/KokkosKernels_SimpleUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 63d1ff720e..baefbe8c35 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -292,7 +292,7 @@ struct IsRelativelyIdenticalFunctor { } if (val_diff > mag_type(eps)) { - printf( + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Values at index %d, %.6f + %.6fi and %.6f + %.6fi, differ too much " "(eps = %e)\n", (int)i, KAT::real(view1(i)), KAT::imag(view1(i)), KAT::real(view2(i)), From 4ee798d8374032596651982c22e15d5a8a1eceb7 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 10:50:41 -0500 Subject: [PATCH 025/442] Drop pre Kokkos 3.6 workaround --- blas/impl/KokkosBlas_util.hpp | 75 ----------------------------------- 1 file changed, 75 deletions(-) diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 0a96d05488..50173538fb 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -41,63 +41,6 @@ struct Trans { struct ConjTranspose {}; }; -#if !defined(KOKKOS_IF_ON_HOST) - -namespace Impl { - -template -struct algo_level3_blocked_mb_impl; -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 4; -}; -#if defined(KOKKOS_ENABLE_CUDA) -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 2; -}; -#endif -#if defined(KOKKOS_ENABLE_HIP) -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 2; -}; -#endif -#if defined(KOKKOS_ENABLE_SYCL) -template <> -struct algo_level3_blocked_mb_impl { - static constexpr int value = 2; -}; -#endif - -template -struct 
algo_level2_blocked_mb_impl; -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 4; -}; -#if defined(KOKKOS_ENABLE_CUDA) -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 1; -}; -#endif -#if defined(KOKKOS_ENABLE_HIP) -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 1; -}; -#endif -#if defined(KOKKOS_ENABLE_SYCL) -template <> -struct algo_level2_blocked_mb_impl { - static constexpr int value = 1; -}; -#endif - -} // namespace Impl -#endif - struct Algo { struct Level3 { struct Unblocked { @@ -111,19 +54,10 @@ struct Algo { // - team policy (smaller) or range policy (bigger) // - space (gpu vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. -#if defined(KOKKOS_IF_ON_HOST) static constexpr KOKKOS_FUNCTION int mb() { KOKKOS_IF_ON_HOST((return 4;)) KOKKOS_IF_ON_DEVICE((return 2;)) } - -#else // FIXME remove when requiring minimum version of Kokkos 3.6 - static constexpr KOKKOS_FUNCTION int mb() { - return algo_level3_blocked_mb_impl< - Kokkos::Impl::ActiveExecutionMemorySpace>::value; - } - -#endif }; struct MKL { static const char *name() { return "MKL"; } @@ -161,19 +95,10 @@ struct Algo { // - team policy (smaller) or range policy (bigger) // - space (cuda vs host) // - blocksize input (blk <= 4 mb = 2, otherwise mb = 4), etc. 
-#if defined(KOKKOS_IF_ON_HOST) static constexpr KOKKOS_FUNCTION int mb() { KOKKOS_IF_ON_HOST((return 4;)) KOKKOS_IF_ON_DEVICE((return 1;)) } - -#else // FIXME remove when requiring minimum version of Kokkos 3.6 - static constexpr KOKKOS_FUNCTION int mb() { - return algo_level2_blocked_mb_impl< - Kokkos::Impl::ActiveExecutionMemorySpace>::value; - } - -#endif }; struct MKL {}; struct CompactMKL {}; From f33376a540175652093114344e08d27e7417d4c5 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 10:31:43 -0500 Subject: [PATCH 026/442] Add Impl::are_integral_v helper variable template --- common/src/KokkosKernels_helpers.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/common/src/KokkosKernels_helpers.hpp b/common/src/KokkosKernels_helpers.hpp index a7a1882700..b36360b991 100644 --- a/common/src/KokkosKernels_helpers.hpp +++ b/common/src/KokkosKernels_helpers.hpp @@ -19,6 +19,8 @@ #include "KokkosKernels_config.h" // KOKKOSKERNELS_INST_LAYOUTLEFT, KOKKOSKERNELS_INST_LAYOUTRIGHT #include "KokkosKernels_default_types.hpp" // default_layout +#include + namespace KokkosKernels { namespace Impl { @@ -67,6 +69,13 @@ struct GetUnifiedScalarViewType { type; }; +template +struct are_integral : std::bool_constant<((std::is_integral_v || + std::is_enum_v)&&...)> {}; + +template +inline constexpr bool are_integral_v = are_integral::value; + } // namespace Impl } // namespace KokkosKernels #endif From 4df9db90ae6ca9cf0c2e79c149d250569092633a Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 25 Jan 2023 10:40:30 -0500 Subject: [PATCH 027/442] Hands off Kokkos::Impl::are_integral --- .../impl/KokkosBatched_Vector_SIMD_View.hpp | 74 ++++++++++--------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp index fb0c9b1f48..3fb7ac872b 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp +++ 
b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp @@ -16,6 +16,8 @@ #ifndef __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ #define __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ +#include + /// \author Kyungjoo Kim (kyukim@sandia.gov) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch" @@ -94,20 +96,20 @@ struct SimdViewAccess { /// rank 1 template - KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::value && - 1 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 1 == ViewType::rank, + reference_type> + operator()(const I0 &i0, Args... /*args*/) const { return _a(i0 / vector_length)[i0 % vector_length]; } /// rank 2 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && 2 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, const I1 &i1, Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && + 2 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1)[i0 % vector_length]; case 1: break; @@ -118,10 +120,10 @@ struct SimdViewAccess { /// rank 3 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 3 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, Args... 
/*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2)[i0 % vector_length]; @@ -134,10 +136,10 @@ struct SimdViewAccess { /// rank 4 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 4 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, Args... /*args*/) const { switch (PackDim::value) { @@ -153,10 +155,10 @@ struct SimdViewAccess { /// rank 5 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 5 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, Args... /*args*/) const { switch (PackDim::value) { @@ -173,10 +175,10 @@ struct SimdViewAccess { /// rank 6 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + KokkosKernels::Impl::are_integral_v && 6 == ViewType::rank, - reference_type>::type + reference_type> operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, const I4 &i4, const I5 &i5, Args... /*args*/) const { switch (PackDim::value) { @@ -199,12 +201,14 @@ struct SimdViewAccess { /// rank 7 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && - 7 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, Args... 
/*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && + 7 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, + const I4 &i4, const I5 &i5, const I6 &i6, + Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, @@ -233,14 +237,14 @@ struct SimdViewAccess { /// rank 8 template - KOKKOS_FORCEINLINE_FUNCTION typename std::enable_if< - Kokkos::Impl::are_integral::value && - 8 == ViewType::rank, - reference_type>::type - operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, - const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7, - Args... /*args*/) const { + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t && + 8 == ViewType::rank, + reference_type> + operator()(const I0 &i0, const I1 &i1, const I2 &i2, const I3 &i3, + const I4 &i4, const I5 &i5, const I6 &i6, const I7 &i7, + Args... /*args*/) const { switch (PackDim::value) { case 0: return _a(i0 / vector_length, i1, i2, i3, i4, i5, i6, From b4d8ca8bf12867ecc2da39ffb9b7813e65598584 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 31 Jan 2023 11:37:30 -0500 Subject: [PATCH 028/442] Update nightly SYCL setup --- scripts/docker/Dockerfile.sycl | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 3d94a1a45e..bda1197fc6 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,4 +1,4 @@ -ARG BASE=nvidia/cuda:10.2-devel +ARG BASE=nvidia/cuda:11.7.0-devel-ubuntu22.04 FROM $BASE RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub @@ -22,7 +22,7 @@ RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \ rm ${KEYDUMP_FILE}* -ARG CMAKE_VERSION=3.18.5 +ARG CMAKE_VERSION=3.23.2 ENV CMAKE_DIR=/opt/cmake RUN 
CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ @@ -31,25 +31,21 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ - grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + grep -i ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sed -e s/linux/Linux/ | sha256sum --check && \ mkdir -p ${CMAKE_DIR} && \ sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ rm cmake* ENV PATH=${CMAKE_DIR}/bin:$PATH -ENV SYCL_DIR=/opt/sycl -RUN SYCL_VERSION=20220112 && \ - SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \ - SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \ - SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ - wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \ - mkdir llvm && \ - tar -xf ${SYCL_ARCHIVE} -C llvm --strip-components=1 && \ - cd llvm && \ - python3 buildbot/configure.py --cuda && \ - python3 buildbot/compile.py && \ - mkdir -p ${SYCL_DIR} && \ - mv ${SCRATCH_DIR}/llvm/build/install/* ${SYCL_DIR} && \ - echo "${SYCL_DIR}/lib" > /etc/ld.so.conf.d/sycl.conf && ldconfig && \ - rm -rf ${SCRATCH_DIR} -ENV PATH=${SYCL_DIR}/bin:$PATH +RUN wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB && \ + apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB && \ + echo "deb https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ + apt-get update -o Dir::Etc::sourcelist="sources.list.d/oneAPI.list" -o APT::Get::List-Cleanup="0" && \ + apt-get install -y intel-oneapi-compiler-dpcpp-cpp-and-cpp-classic-2023.0.0 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN wget https://cloud.cees.ornl.gov/download/oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ + chmod +x oneapi-for-nvidia-gpus-2023.0.0-linux.sh 
&& \ + ./oneapi-for-nvidia-gpus-2023.0.0-linux.sh -y && \ + rm oneapi-for-nvidia-gpus-2023.0.0-linux.sh From dec1753fca61e411146271861d58a8126c167714 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 1 Feb 2023 15:53:45 -0700 Subject: [PATCH 029/442] Testing working in serial and openmp (IF I force determinism on parIlut) --- sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp | 8 ++++++++ sparse/unit_test/Test_Sparse_par_ilut.hpp | 13 ++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 46c1ad13d8..2a388ac28b 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -331,11 +331,19 @@ struct IlutWrap { const auto out_val = lpu_col == col_idx ? lpu_val : r_val / diag; // store output entries if (row_idx >= col_idx) { + if (l_new_nnz >= L_new_row_map(row_idx+1)) { + throw std::runtime_error(std::string("Overflowed L_new in row ") + std::to_string(row_idx) + + std::string(", is your A matrix sorted?")); + } L_new_entries(l_new_nnz) = col_idx; L_new_values(l_new_nnz) = row_idx == col_idx ? 1. 
: out_val; ++l_new_nnz; } if (row_idx <= col_idx) { + if (u_new_nnz >= U_new_row_map(row_idx+1)) { + throw std::runtime_error(std::string("Overflowed U_new in row ") + std::to_string(row_idx) + + std::string(", is your A matrix sorted?")); + } U_new_entries(u_new_nnz) = col_idx; U_new_values(u_new_nnz) = out_val; ++u_new_nnz; diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 5c1bb39e59..ddcf61afd2 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -27,6 +27,7 @@ #include "KokkosSparse_par_ilut.hpp" #include "KokkosSparse_gmres.hpp" #include "KokkosSparse_LUPrec.hpp" +#include "KokkosSparse_SortCrs.hpp" #include @@ -339,6 +340,8 @@ void run_test_par_ilut_precond() { sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), diagDominance); + KokkosSparse::sort_crs_matrix(A); + // Make kernel handles KernelHandle kh; kh.create_gmres_handle(m, tol); @@ -372,11 +375,11 @@ void run_test_par_ilut_precond() { par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, -#ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ -#else - false /*cannot ask for determinism*/ -#endif +// #ifdef KOKKOS_ENABLE_SERIAL + true /*deterministic*/ +// #else +// false /*problem is too big for determinism?*/ +// #endif ); // Convert L, U parILUT outputs to uncompressed 2d views as required From 16c97ddb69fc3b710e1c771ae13e66d864b45b87 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 2 Feb 2023 09:59:03 -0700 Subject: [PATCH 030/442] Call concurrency(), not impl_thread_pool_size() For host serial/parallel backends. concurrency() is implemented in terms of impl_thread_pool_size() so it's equivalent, but is part of the public interface. 
--- sparse/impl/KokkosSparse_spmv_impl.hpp | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index d3e495c426..e9eb301b23 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -512,21 +512,7 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, #if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) || \ defined(KOKKOS_ENABLE_THREADS) { - int impl_thread_pool_size(0); -#if defined(KOKKOS_ENABLE_SERIAL) - if (std::is_same::value) - impl_thread_pool_size = 1; -#endif -#if defined(KOKKOS_ENABLE_OPENMP) - if (std::is_same::value) - impl_thread_pool_size = Kokkos::OpenMP::impl_thread_pool_size(); -#endif -#if defined(KOKKOS_ENABLE_THREADS) - if (std::is_same::value) - impl_thread_pool_size = Kokkos::Threads::impl_thread_pool_size(); -#endif - - if (impl_thread_pool_size == 1) { + if (execution_space().concurrency() == 1) { /// serial impl typedef typename AMatrix::non_const_value_type value_type; typedef Kokkos::Details::ArithTraits ATV; From e72bc3859722b66e64752e0225fc48c822764ac9 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 2 Feb 2023 14:02:46 -0500 Subject: [PATCH 031/442] SYCL CI: Specify the full path to the compiler --- .jenkins/nightly.groovy | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.jenkins/nightly.groovy b/.jenkins/nightly.groovy index 6092e75fba..a12aaf42de 100644 --- a/.jenkins/nightly.groovy +++ b/.jenkins/nightly.groovy @@ -18,12 +18,13 @@ pipeline { } } steps { - sh '''rm -rf kokkos && + sh '''. 
/opt/intel/oneapi/setvars.sh --include-intel-llvm && \ + rm -rf kokkos && git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ mkdir build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ -DKokkos_ARCH_VOLTA70=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ -DKokkos_ENABLE_SYCL=ON \ @@ -32,10 +33,11 @@ pipeline { .. && \ make -j8 && make install && \ cd ../.. && rm -rf kokkos''' - sh '''rm -rf build && mkdir -p build && cd build && \ + sh '''. /opt/intel/oneapi/setvars.sh --include-intel-llvm && \ + rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ -DKokkosKernels_ENABLE_TESTS=ON \ -DKokkosKernels_ENABLE_EXAMPLES=ON \ -DKokkosKernels_INST_DOUBLE=ON \ From 834a85ecec46355915031a296725635db3599ba6 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 2 Feb 2023 16:13:48 -0700 Subject: [PATCH 032/442] Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES (#1667) * Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES The cmake options KokkosKernels_ENABLE_PERFTESTS and KokkosKernels_ENABLE_EXAMPLES were not actually used, both perf_test/ and example/ were always built as long as KokkosKernels_ENABLE_ALL_COMPONENTS=ON. This makes these options have an effect again. If perftests or examples are enabled but ENABLE_ALL_COMPONENTS=OFF, print a message about why they can't actually be enabled. 
* From e10harvey: fix typo in perf_test cmake * Add feedback about cmake - Turn ENABLE_PERFTESTS off by default - since both examples and perf tests are off by default, warn if those are ON but can't be enabled because ENABLE_ALL_COMPONENTS=OFF - use ELSE to simplify logic where ENABLE_ALL_COMPONENTS=OFF --- CMakeLists.txt | 23 +++++++++++++++++++---- perf_test/CMakeLists.txt | 2 +- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 443b124cb2..9d685e648e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) ) KOKKOSKERNELS_ADD_OPTION( "ENABLE_PERFTESTS" - ON + OFF BOOL "Whether to build performance tests. Default: OFF" ) @@ -383,9 +383,24 @@ ELSE() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() IF (KokkosKernels_ENABLE_ALL_COMPONENTS) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) - KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) - ENDIF() + IF (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(STATUS "Enabling perf tests.") + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) + ENDIF () + IF (KokkosKernels_ENABLE_EXAMPLES) + MESSAGE(STATUS "Enabling examples.") + KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) + ENDIF () + ELSE () + # ENABLE_ALL_COMPONENTS is OFF, so perftests and examples can't be enabled. + # Warn if they were requested. 
+ IF (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(WARNING "Could not enable perf tests because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + ENDIF () + IF (KokkosKernels_ENABLE_EXAMPLES) + MESSAGE(WARNING "Could not enable examples because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + ENDIF () + ENDIF () KOKKOSKERNELS_PACKAGE_POSTPROCESS() IF (KokkosKernels_ENABLE_DOCS) diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index bc638b64c1..28752e9c6c 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -28,7 +28,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) KOKKOSKERNELS_INCLUDE_DIRECTORIES(sparse) - if(Kokkos_ENABLE_TESTS_AND_PERFSUITE) + if(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) #Add RPS implementations of KK perf tests here KOKKOSKERNELS_ADD_EXECUTABLE( tracked_testing From b9bcc5f49d884e53ef1405e3dad47d1f89a2df35 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 3 Feb 2023 14:11:54 -0700 Subject: [PATCH 033/442] Add new assert/require macros. Other minor fixes --- common/src/KokkosKernels_Error.hpp | 67 +++++++++++++++++++ .../KokkosSparse_par_ilut_numeric_impl.hpp | 12 ++-- sparse/src/KokkosSparse_LUPrec.hpp | 4 +- sparse/unit_test/Test_Sparse_par_ilut.hpp | 5 +- 4 files changed, 76 insertions(+), 12 deletions(-) diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index e4e4981973..b174d60e63 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -53,4 +53,71 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, } // namespace Impl } // namespace KokkosKernels +/* + * Asserts and error checking macros/functions. + * + * KK_KERNEL** are for error checking within kokkos kernels. + * + * Any check with "assert" in the name is disabled for release builds + * + * For _MSG checks, the msg argument can contain '<<' if not a kernel check. 
+ * + * This code is adapted from EKAT/src/ekat/ekat_assert.hpp + */ + +// Internal do not call directly +#define IMPL_THROW(condition, msg, exception_type) \ + do { \ + if ( ! (condition) ) { \ + std::stringstream _ss_; \ + _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition; \ + _ss_ << "\n" << msg; \ + throw exception_type(_ss_.str()); \ + } \ + } while(0) + +// SYCL cannot printf like the other backends quite yet +#ifdef __SYCL_DEVICE_ONLY__ +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if ( ! (condition) ) { \ + const __attribute__((opencl_constant)) char format[] = "KERNEL CHECK FAILED:\n %s %s\n"; \ + sycl::ext::oneapi::experimental::printf(format,#condition,msg); \ + Kokkos::abort(""); \ + } \ + } while (0) +#else +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if ( ! (condition) ) { \ + printf("KERNEL CHECK FAILED:\n %s\n %s\n",#condition,msg); \ + Kokkos::abort(""); \ + } \ + } while (0) +#endif + +#ifndef NDEBUG +#define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_ASSERT_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) +#define KK_KERNEL_ASSERT(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_ASSERT_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) +#else +#define KK_ASSERT(condition) ((void) (0)) +#define KK_ASSERT_MSG(condition, msg) ((void) (0)) +#define KK_KERNEL_ASSERT(condition) ((void) (0)) +#define KK_KERNEL_ASSERT_MSG(condition, msg) ((void) (0)) +#endif + +#define KK_REQUIRE(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_REQUIRE_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) + +#define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) + +#define KK_ERROR_MSG(msg) KK_REQUIRE_MSG(false, msg) +#define KK_KERNEL_ERROR_MSG(msg) KK_KERNEL_REQUIRE_MSG(false, msg) + +#undef IMPL_THROW +#undef IMPL_KERNEL_THROW + #endif // 
KOKKOSKERNELS_ERROR_HPP diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 2a388ac28b..78af697360 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -331,19 +331,15 @@ struct IlutWrap { const auto out_val = lpu_col == col_idx ? lpu_val : r_val / diag; // store output entries if (row_idx >= col_idx) { - if (l_new_nnz >= L_new_row_map(row_idx+1)) { - throw std::runtime_error(std::string("Overflowed L_new in row ") + std::to_string(row_idx) + - std::string(", is your A matrix sorted?")); - } + KK_KERNEL_ASSERT_MSG(l_new_nnz < L_new_row_map(row_idx+1), + "add_candidates: Overflowed L_new, is your A matrix sorted?"); L_new_entries(l_new_nnz) = col_idx; L_new_values(l_new_nnz) = row_idx == col_idx ? 1. : out_val; ++l_new_nnz; } if (row_idx <= col_idx) { - if (u_new_nnz >= U_new_row_map(row_idx+1)) { - throw std::runtime_error(std::string("Overflowed U_new in row ") + std::to_string(row_idx) + - std::string(", is your A matrix sorted?")); - } + KK_KERNEL_ASSERT_MSG(u_new_nnz < U_new_row_map(row_idx+1), + "add_candidates: Overflowed U_new, is your A matrix sorted?"); U_new_entries(u_new_nnz) = col_idx; U_new_values(u_new_nnz) = out_val; ++u_new_nnz; diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index bc0b51aa78..3c9cc98504 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -28,7 +28,6 @@ #include namespace KokkosSparse { - namespace Experimental { /// \class LUPrec @@ -50,7 +49,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using karith = typename Kokkos::ArithTraits; private: - Kokkos::View _L, _U, _tmp; + Kokkos::View _L, _U, _tmp; public: //! Constructor: @@ -109,6 +108,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { //! True if the preconditioner implements a transpose operator apply. 
bool hasTransposeApply() const { return true; } }; + } // namespace Experimental } // End namespace KokkosSparse diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index ddcf61afd2..7f2efda388 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -92,9 +92,11 @@ void decompress_matrix( Kokkos::View& entries, Kokkos::View& values, Kokkos::View& output) { + using exe_space = typename device::execution_space; + const size_type nrows = row_map.size() - 1; - Kokkos::parallel_for(nrows, KOKKOS_LAMBDA (const int& row_idx) { + Kokkos::parallel_for(Kokkos::RangePolicy(0, nrows), KOKKOS_LAMBDA (const int& row_idx) { const size_type row_nnz_begin = row_map(row_idx); const size_type row_nnz_end = row_map(row_idx + 1); for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { @@ -443,7 +445,6 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); EXPECT_LT(num_iters_precond, num_iters_plain); } - } } // namespace Test From dd930a6622d0dd0dea73c4561001366085832856 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 3 Feb 2023 17:07:48 -0700 Subject: [PATCH 034/442] Fixes: trsm expects host views --- common/src/KokkosKernels_Error.hpp | 3 --- sparse/src/KokkosSparse_LUPrec.hpp | 13 +++++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index b174d60e63..a65374ffbe 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -117,7 +117,4 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, #define KK_ERROR_MSG(msg) KK_REQUIRE_MSG(false, msg) #define KK_KERNEL_ERROR_MSG(msg) KK_KERNEL_REQUIRE_MSG(false, msg) -#undef IMPL_THROW -#undef IMPL_KERNEL_THROW - #endif // KOKKOSKERNELS_ERROR_HPP diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 
3c9cc98504..6a9635972f 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -47,14 +47,23 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using karith = typename Kokkos::ArithTraits; + using View2dD = typename Kokkos::View; + using View2dH = typename View2dD::HostMirror; private: - Kokkos::View _L, _U, _tmp; + // trsm takes host views + View2dH _L, _U, _tmp; public: //! Constructor: template - LUPrec(const ViewArg &L, const ViewArg &U) : _L(L), _U(U), _tmp("LUPrec::_tmp", _L.extent(0), 1) {} + LUPrec(const ViewArg &L, const ViewArg &U) : + _L("LUPrec::_L", L.extent(0), L.extent(1)), + _U("LUPrec::_U", U.extent(0), U.extent(1)), + _tmp("LUPrec::_tmp", L.extent(0), 1) { + Kokkos::deep_copy(_L, L); + Kokkos::deep_copy(_U, U); + } //! Destructor. virtual ~LUPrec() {} From a89349ddb761d88a70a10efa8dd5a6081bfd134e Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 3 Feb 2023 17:11:08 -0700 Subject: [PATCH 035/442] formatting --- blas/src/KokkosBlas3_trsm.hpp | 3 +- common/src/KokkosKernels_Error.hpp | 61 ++++++------ .../KokkosSparse_par_ilut_numeric_impl.hpp | 10 +- sparse/src/KokkosSparse_LUPrec.hpp | 12 +-- sparse/unit_test/Test_Sparse_par_ilut.hpp | 93 ++++++++++--------- 5 files changed, 93 insertions(+), 86 deletions(-) diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index df35b7ccb7..2e8d2f4cfa 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -43,7 +43,8 @@ namespace KokkosBlas { /// other part is not referenced /// \param trans [in] "N" or "n" for non-transpose, "T" or "t" for transpose, /// "C" or "c" for conjugate transpose. 
-/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be unit +/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit // "N" or "n" indicated the diagonal of A is assumed to be // non-unit /// \param alpha [in] Input coefficient used for multiplication with B diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index a65374ffbe..ebcdf7ea81 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -68,53 +68,56 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, // Internal do not call directly #define IMPL_THROW(condition, msg, exception_type) \ do { \ - if ( ! (condition) ) { \ + if (!(condition)) { \ std::stringstream _ss_; \ _ss_ << __FILE__ << ":" << __LINE__ << ": FAIL:\n" << #condition; \ _ss_ << "\n" << msg; \ throw exception_type(_ss_.str()); \ } \ - } while(0) + } while (0) // SYCL cannot printf like the other backends quite yet #ifdef __SYCL_DEVICE_ONLY__ -#define IMPL_KERNEL_THROW(condition, msg) \ - do { \ - if ( ! (condition) ) { \ - const __attribute__((opencl_constant)) char format[] = "KERNEL CHECK FAILED:\n %s %s\n"; \ - sycl::ext::oneapi::experimental::printf(format,#condition,msg); \ - Kokkos::abort(""); \ - } \ +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if (!(condition)) { \ + const __attribute__((opencl_constant)) char format[] = \ + "KERNEL CHECK FAILED:\n %s %s\n"; \ + sycl::ext::oneapi::experimental::printf(format, #condition, msg); \ + Kokkos::abort(""); \ + } \ } while (0) #else -#define IMPL_KERNEL_THROW(condition, msg) \ - do { \ - if ( ! 
(condition) ) { \ - printf("KERNEL CHECK FAILED:\n %s\n %s\n",#condition,msg); \ - Kokkos::abort(""); \ - } \ +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if (!(condition)) { \ + printf("KERNEL CHECK FAILED:\n %s\n %s\n", #condition, msg); \ + Kokkos::abort(""); \ + } \ } while (0) #endif #ifndef NDEBUG -#define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) -#define KK_ASSERT_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) -#define KK_KERNEL_ASSERT(condition) IMPL_KERNEL_THROW(condition, "") -#define KK_KERNEL_ASSERT_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) +#define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_ASSERT_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::logic_error) +#define KK_KERNEL_ASSERT(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_ASSERT_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) #else -#define KK_ASSERT(condition) ((void) (0)) -#define KK_ASSERT_MSG(condition, msg) ((void) (0)) -#define KK_KERNEL_ASSERT(condition) ((void) (0)) -#define KK_KERNEL_ASSERT_MSG(condition, msg) ((void) (0)) +#define KK_ASSERT(condition) ((void)(0)) +#define KK_ASSERT_MSG(condition, msg) ((void)(0)) +#define KK_KERNEL_ASSERT(condition) ((void)(0)) +#define KK_KERNEL_ASSERT_MSG(condition, msg) ((void)(0)) #endif -#define KK_REQUIRE(condition) IMPL_THROW(condition, "", std::logic_error) -#define KK_REQUIRE_MSG(condition, msg) IMPL_THROW(condition, msg, std::logic_error) +#define KK_REQUIRE(condition) IMPL_THROW(condition, "", std::logic_error) +#define KK_REQUIRE_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::logic_error) -#define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") -#define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) +#define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") +#define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) -#define 
KK_ERROR_MSG(msg) KK_REQUIRE_MSG(false, msg) -#define KK_KERNEL_ERROR_MSG(msg) KK_KERNEL_REQUIRE_MSG(false, msg) +#define KK_ERROR_MSG(msg) KK_REQUIRE_MSG(false, msg) +#define KK_KERNEL_ERROR_MSG(msg) KK_KERNEL_REQUIRE_MSG(false, msg) #endif // KOKKOSKERNELS_ERROR_HPP diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 78af697360..4ccdf7b07e 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -331,15 +331,17 @@ struct IlutWrap { const auto out_val = lpu_col == col_idx ? lpu_val : r_val / diag; // store output entries if (row_idx >= col_idx) { - KK_KERNEL_ASSERT_MSG(l_new_nnz < L_new_row_map(row_idx+1), - "add_candidates: Overflowed L_new, is your A matrix sorted?"); + KK_KERNEL_ASSERT_MSG( + l_new_nnz < L_new_row_map(row_idx + 1), + "add_candidates: Overflowed L_new, is your A matrix sorted?"); L_new_entries(l_new_nnz) = col_idx; L_new_values(l_new_nnz) = row_idx == col_idx ? 1. 
: out_val; ++l_new_nnz; } if (row_idx <= col_idx) { - KK_KERNEL_ASSERT_MSG(u_new_nnz < U_new_row_map(row_idx+1), - "add_candidates: Overflowed U_new, is your A matrix sorted?"); + KK_KERNEL_ASSERT_MSG( + u_new_nnz < U_new_row_map(row_idx + 1), + "add_candidates: Overflowed U_new, is your A matrix sorted?"); U_new_entries(u_new_nnz) = col_idx; U_new_values(u_new_nnz) = out_val; ++u_new_nnz; diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 6a9635972f..f4659fd391 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -47,7 +47,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using karith = typename Kokkos::ArithTraits; - using View2dD = typename Kokkos::View; + using View2dD = typename Kokkos::View; using View2dH = typename View2dD::HostMirror; private: @@ -57,10 +57,10 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: //! 
Constructor: template - LUPrec(const ViewArg &L, const ViewArg &U) : - _L("LUPrec::_L", L.extent(0), L.extent(1)), - _U("LUPrec::_U", U.extent(0), U.extent(1)), - _tmp("LUPrec::_tmp", L.extent(0), 1) { + LUPrec(const ViewArg &L, const ViewArg &U) + : _L("LUPrec::_L", L.extent(0), L.extent(1)), + _U("LUPrec::_U", U.extent(0), U.extent(1)), + _tmp("LUPrec::_tmp", L.extent(0), 1) { Kokkos::deep_copy(_L, L); Kokkos::deep_copy(_U, U); } @@ -89,8 +89,6 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType beta = karith::zero()) const { - - // tmp = trsm(L, x); //Apply L^inv to x // y = trsm(U, tmp); //Apply U^inv to tmp auto tmpsv = Kokkos::subview(_tmp, Kokkos::ALL, 0); diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 7f2efda388..8794f2e013 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -50,7 +50,7 @@ struct TolMeta { static constexpr float value = 1e-5; // Lower tolerance for floats }; -} +} // namespace ParIlut template @@ -87,27 +87,28 @@ std::vector> decompress_matrix( template -void decompress_matrix( - Kokkos::View& row_map, - Kokkos::View& entries, - Kokkos::View& values, - Kokkos::View& output) { +void decompress_matrix(Kokkos::View& row_map, + Kokkos::View& entries, + Kokkos::View& values, + Kokkos::View& output) { using exe_space = typename device::execution_space; const size_type nrows = row_map.size() - 1; - Kokkos::parallel_for(Kokkos::RangePolicy(0, nrows), KOKKOS_LAMBDA (const int& row_idx) { - const size_type row_nnz_begin = row_map(row_idx); - const size_type row_nnz_end = row_map(row_idx + 1); - for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { - const lno_t col_idx = entries(row_nnz); - const scalar_t value = values(row_nnz); - output(row_idx, col_idx) = value; - } - }); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, nrows), 
+ KOKKOS_LAMBDA(const int& row_idx) { + const size_type row_nnz_begin = row_map(row_idx); + const size_type row_nnz_end = row_map(row_idx + 1); + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; + ++row_nnz) { + const lno_t col_idx = entries(row_nnz); + const scalar_t value = values(row_nnz); + output(row_idx, col_idx) = value; + } + }); } - template void check_matrix(const std::string& name, @@ -317,11 +318,11 @@ template ; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; + using exe_space = typename device::execution_space; + using mem_space = typename device::memory_space; + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; using sp_matrix_type = KokkosSparse::CrsMatrix; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< @@ -361,8 +362,8 @@ void run_test_par_ilut_precond() { auto values = A.values; // Allocate L and U CRS views as outputs - RowMapType L_row_map ("L_row_map", numRows + 1); - RowMapType U_row_map ("U_row_map", numRows + 1); + RowMapType L_row_map("L_row_map", numRows + 1); + RowMapType U_row_map("U_row_map", numRows + 1); // Initial L/U approximations for A par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); @@ -375,20 +376,21 @@ void run_test_par_ilut_precond() { EntriesType U_entries("U_entries", nnzU); ValuesType U_values("U_values", nnzU); - par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, -// #ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ -// #else -// false /*problem is too big for determinism?*/ -// #endif + par_ilut_numeric( + &kh, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, + U_entries, U_values, + // #ifdef KOKKOS_ENABLE_SERIAL + true /*deterministic*/ + // #else + // false /*problem is too big for determinism?*/ + // #endif ); // Convert L, U parILUT outputs to uncompressed 2d views as required // by LUPrec - Kokkos::View 
- L_uncompressed("L_uncompressed", numRows, numRows), - U_uncompressed("U_uncompressed", numRows, numRows); + Kokkos::View L_uncompressed("L_uncompressed", numRows, + numRows), + U_uncompressed("U_uncompressed", numRows, numRows); decompress_matrix(L_row_map, L_entries, L_values, L_uncompressed); decompress_matrix(U_row_map, U_entries, U_values, U_uncompressed); @@ -413,7 +415,7 @@ void run_test_par_ilut_precond() { float_t endRes = KokkosBlas::nrm2(B) / nrmB; const auto conv_flag = gmres_handle->get_conv_flag_val(); - num_iters_plain = gmres_handle->get_num_iters(); + num_iters_plain = gmres_handle->get_num_iters(); EXPECT_LT(endRes, gmres_handle->get_tol()); EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); @@ -425,7 +427,8 @@ void run_test_par_ilut_precond() { gmres_handle->set_verbose(verbose); // Make precond - KokkosSparse::Experimental::LUPrec myPrec(L_uncompressed, U_uncompressed); + KokkosSparse::Experimental::LUPrec myPrec(L_uncompressed, + U_uncompressed); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); @@ -439,7 +442,7 @@ void run_test_par_ilut_precond() { float_t endRes = KokkosBlas::nrm2(B) / nrmB; const auto conv_flag = gmres_handle->get_conv_flag_val(); - num_iters_precond = gmres_handle->get_num_iters(); + num_iters_precond = gmres_handle->get_num_iters(); EXPECT_LT(endRes, gmres_handle->get_tol()); EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); @@ -461,15 +464,15 @@ void test_par_ilut_precond() { Test::run_test_par_ilut_precond(); } - -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut(); \ - } \ - TEST_F(TestCategory, \ - sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut_precond(); \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut(); 
\ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_precond(); \ } #define NO_TEST_COMPLEX From d83b0649c49b72a86ce084bf8c8f82f8339eae7f Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 6 Feb 2023 12:23:28 -0700 Subject: [PATCH 036/442] Turn off main par_ilut+gmres test if kokkos::serial is not enabled --- sparse/unit_test/Test_Sparse_par_ilut.hpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 8794f2e013..48a5f6861e 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -421,7 +421,10 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); } - // Solve Ax = b with LU preconditioner + // Solve Ax = b with LU preconditioner. Currently only works + // when deterministic mode in par_ilut is on, which is only + // possible when Kokkos::Serial has been enabled. 
+#ifdef KOKKOS_ENABLE_SERIAL { gmres_handle->reset_handle(m, tol); gmres_handle->set_verbose(verbose); @@ -448,6 +451,7 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); EXPECT_LT(num_iters_precond, num_iters_plain); } +#endif } } // namespace Test From 08b71b3ffd11359ba19b48fb7172376f39b5e438 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 6 Feb 2023 12:37:37 -0700 Subject: [PATCH 037/442] Fix @file tags in a few headers --- sparse/src/KokkosSparse_LUPrec.hpp | 2 +- sparse/src/KokkosSparse_MatrixPrec.hpp | 2 +- sparse/src/KokkosSparse_Preconditioner.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index f4659fd391..a4525e2798 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -16,7 +16,7 @@ // ************************************************************************ //@HEADER */ -/// @file KokkosKernels_LUPrec.hpp +/// @file KokkosSparse_LUPrec.hpp #ifndef KK_LU_PREC_HPP #define KK_LU_PREC_HPP diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 3ca9e020c7..8a0b67a12d 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -13,7 +13,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -/// @file KokkosKernels_MatrixPrec.hpp +/// @file KokkosSparse_MatrixPrec.hpp #ifndef KK_MATRIX_PREC_HPP #define KK_MATRIX_PREC_HPP diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 9d62a960e9..72c936aecd 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -13,7 +13,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -/// @file KokkosKernels_Preconditioner.hpp +/// @file KokkosSparse_Preconditioner.hpp // #ifndef KK_PREC_HPP #define KK_PREC_HPP From 
771f0f2cfbee69c9d5e69c313c90b2dc3d88126f Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 6 Feb 2023 15:33:40 -0700 Subject: [PATCH 038/442] Fix warnings --- sparse/src/KokkosSparse_LUPrec.hpp | 2 +- sparse/unit_test/Test_Sparse_par_ilut.hpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 3c6405e274..48ba45588b 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -87,7 +87,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { const Kokkos::View &Y, const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + ScalarType = karith::zero()) const { // tmp = trsm(L, x); //Apply L^inv to x // y = trsm(U, tmp); //Apply U^inv to tmp auto tmpsv = Kokkos::subview(_tmp, Kokkos::ALL, 0); diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 48a5f6861e..d75ebbf0c7 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -349,6 +349,7 @@ void run_test_par_ilut_precond() { KernelHandle kh; kh.create_gmres_handle(m, tol); auto gmres_handle = kh.get_gmres_handle(); + gmres_handle->set_verbose(verbose); using GMRESHandle = typename std::remove_reference::type; using ViewVectorType = typename GMRESHandle::nnz_value_view_t; @@ -417,6 +418,7 @@ void run_test_par_ilut_precond() { const auto conv_flag = gmres_handle->get_conv_flag_val(); num_iters_plain = gmres_handle->get_num_iters(); + EXPECT_GT(num_iters_plain, 0); EXPECT_LT(endRes, gmres_handle->get_tol()); EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); } @@ -451,6 +453,8 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); EXPECT_LT(num_iters_precond, num_iters_plain); } +#else + EXPECT_EQ(num_iters_precond, 0); #endif } From e50849b3747386ae7438f546d93c42f407e8b494 Mon Sep 17 00:00:00 2001 From: 
James Foucar Date: Mon, 6 Feb 2023 15:53:51 -0700 Subject: [PATCH 039/442] Fix for openmp-only --- sparse/unit_test/Test_Sparse_par_ilut.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index d75ebbf0c7..383f8815c8 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -380,11 +380,11 @@ void run_test_par_ilut_precond() { par_ilut_numeric( &kh, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, - // #ifdef KOKKOS_ENABLE_SERIAL +#ifdef KOKKOS_ENABLE_SERIAL true /*deterministic*/ - // #else - // false /*problem is too big for determinism?*/ - // #endif +#else + false /*cannot ask for determinism*/ +#endif ); // Convert L, U parILUT outputs to uncompressed 2d views as required From d1ee1a43e1d6bd4ef68e2ac3684e70851186f5e2 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 7 Feb 2023 10:49:23 -0700 Subject: [PATCH 040/442] format fix --- sparse/unit_test/Test_Sparse_par_ilut.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 383f8815c8..6607b3314f 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -377,13 +377,12 @@ void run_test_par_ilut_precond() { EntriesType U_entries("U_entries", nnzU); ValuesType U_values("U_values", nnzU); - par_ilut_numeric( - &kh, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values, + par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, #ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ + true /*deterministic*/ #else - false /*cannot ask for determinism*/ + false /*cannot ask for determinism*/ #endif ); From 566570a87c1b797767802f894365477306129ac0 Mon Sep 17 
00:00:00 2001 From: brian-kelley Date: Tue, 7 Feb 2023 14:08:23 -0700 Subject: [PATCH 041/442] Temporary workaround for Kokkos #5860 (#1675) In array_sum_reduce, if scalar is half_t and N is 3, 5 or 7, pad out the array by one more element. The last element will be unused, but this bypasses the issues with parallel_reduce. --- common/src/KokkosKernels_Utils.hpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 29bf2757d5..fd04bd2529 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -1416,20 +1416,30 @@ void init_view_withscalar( MyExecSpace().fence(); } -// A sum-reduction scalar representing a fixed-size array. template struct array_sum_reduce { + static_assert(N <= 8, "array_sum_reduce has only been tested up to N=8"); using ValueType = array_sum_reduce; - - scalar_t data[N]; + // Workaround for https://github.com/kokkos/kokkos/issues/5860 + static constexpr int N_internal = + ((N == 3 || N == 5 || N == 7) && + std::is_same::value && + sizeof(Kokkos::Experimental::half_t) == 2) + ? (N + 1) + : N; + + scalar_t data[N_internal]; KOKKOS_INLINE_FUNCTION array_sum_reduce() { - for (int i = 0; i < N; i++) data[i] = scalar_t(); + // Initialize all the elements, even those at index >= N (prevent valgrind + // warnings, etc.) 
+ for (int i = 0; i < N_internal; i++) data[i] = scalar_t(); } KOKKOS_INLINE_FUNCTION // add operator array_sum_reduce & operator+=(const ValueType &src) { + // Don't bother summing elements >= N though as they will never be used for (int i = 0; i < N; i++) data[i] += src.data[i]; return *this; } From 167ad420ee23fd1e35f3e4585ba959f5c1459bb0 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Wed, 8 Feb 2023 10:19:38 -0500 Subject: [PATCH 042/442] Update SYCL docker file to include oneDPL --- scripts/docker/Dockerfile.sycl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index bda1197fc6..4e185f4c1b 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -10,6 +10,7 @@ RUN apt-get update && apt-get install -y \ ninja-build \ python3 \ git \ + libomp-dev \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* @@ -49,3 +50,8 @@ RUN wget https://cloud.cees.ornl.gov/download/oneapi-for-nvidia-gpus-2023.0.0-li chmod +x oneapi-for-nvidia-gpus-2023.0.0-linux.sh && \ ./oneapi-for-nvidia-gpus-2023.0.0-linux.sh -y && \ rm oneapi-for-nvidia-gpus-2023.0.0-linux.sh + +RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_oneDPL_p_2022.0.0.25335.sh &&\ + chmod +x ./l_oneDPL_p_2022.0.0.25335.sh && \ + ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \ + rm l_oneDPL_p_2022.0.0.25335.sh From 9095beb5cbb8a34aab5362daa9cd1f14bbf10d85 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 21 Sep 2022 09:50:34 -0600 Subject: [PATCH 043/442] MDF: improving performance and adding performance test The performance test allows to generate random matrices, random diagonal matrices and to read matrices from file. It collects time for the handle creation, symbolic phase and numeric phase of the MDF algorithm. A small change in the methods names is made to make MDF more uniform with the rest of the library. 
The unit-test is improved by checking the results in L and U against analytical solution. mostly changing the way the discarded fill is computed at each factorization step, only selecting rows that were impacted by the last factorized row. --- perf_test/sparse/CMakeLists.txt | 5 + perf_test/sparse/KokkosSparse_mdf.cpp | 320 +++++++++++++++++++++++++ sparse/impl/KokkosSparse_mdf_impl.hpp | 236 ++++++++++++++++-- sparse/src/KokkosSparse_mdf.hpp | 67 ++++-- sparse/src/KokkosSparse_mdf_handle.hpp | 25 +- sparse/unit_test/Test_Sparse_mdf.hpp | 87 ++++++- 6 files changed, 688 insertions(+), 52 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_mdf.cpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index a574ed101f..6eac716aca 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -110,3 +110,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spiluk SOURCES KokkosSparse_spiluk.cpp ) + +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_mdf + SOURCES KokkosSparse_mdf.cpp +) diff --git a/perf_test/sparse/KokkosSparse_mdf.cpp b/perf_test/sparse/KokkosSparse_mdf.cpp new file mode 100644 index 0000000000..ca48df8fd2 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_mdf.cpp @@ -0,0 +1,320 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_config.h" +#include "KokkosKernels_Handle.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosSparse_mdf.hpp" +#include "KokkosKernels_TestUtils.hpp" + +struct Params { + int use_cuda = 0; + int use_hip = 0; + int use_sycl = 0; + int use_openmp = 0; + int use_threads = 0; + std::string amtx; + int m = 10000; + int n = 10000; + int nnzPerRow = 30; + bool diag = false; // Whether B should be diagonal only (requires A square) + bool verbose = false; + int repeat = 1; +}; + +template +struct diag_generator_functor { + using size_type = typename row_map_t::non_const_value_type; + + row_map_t row_map; + entries_t entries; + + diag_generator_functor(row_map_t row_map_, entries_t entries_) + : row_map(row_map_), entries(entries_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type rowIdx) const { + row_map(rowIdx + 1) = rowIdx + 1; + entries(rowIdx) = rowIdx; + } +}; + +template +void run_experiment(const Params& params) { + using size_type = typename crsMat_t::size_type; + using lno_t = typename crsMat_t::ordinal_type; + using scalar_t = typename crsMat_t::value_type; + using device_t = typename crsMat_t::device_type; + using exec_space = typename device_t::execution_space; + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + + std::cout << "************************************* \n"; + std::cout << "************************************* \n"; + crsMat_t A; + lno_t m = params.m; + lno_t n = params.n; + if (params.amtx.length()) { + std::cout << "Loading A from " << params.amtx << '\n'; + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.amtx.c_str()); + m = A.numRows(); + n = A.numCols(); 
+ } else { + if (params.diag) { + std::cout << "Randomly generating diag matrix\n"; + rowmap_t rowmapA("A row map", m + 1); + entries_t entriesA("A entries", m); + values_t valuesA("A values", m); + + // Generate the graph of A + diag_generator_functor diag_generator(rowmapA, entriesA); + Kokkos::parallel_for(Kokkos::RangePolicy(0, m), + diag_generator); + + // Generate the values of A + Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::fill_random(valuesA, rand_pool, + 10 * Kokkos::ArithTraits::one()); + + // Actually put A together + graph_t graph(entriesA, rowmapA); + A = crsMat_t("A matrix", m, valuesA, graph); + } else { + std::cout << "Randomly generating matrix\n"; + size_type nnzUnused = m * params.nnzPerRow; + A = KokkosSparse::Impl::kk_generate_sparse_matrix( + m, n, nnzUnused, 0, (n + 3) / 3); + } + } + + if (params.verbose) { + std::cout << "Matrix A" << std::endl; + std::cout << " row_map A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.graph.row_map); + std::cout << " entries A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.graph.entries); + std::cout << " values A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.values); + std::cout << std::endl; + } + + Kokkos::Timer timer; + double handleTime = 0; + double symbolicTime = 0; + double numericTime = 0; + + timer.reset(); + KokkosSparse::Experimental::MDF_handle handle(A); + handle.set_verbosity(0); + handleTime += timer.seconds(); + + for (int sumRep = 0; sumRep < params.repeat; sumRep++) { + timer.reset(); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + Kokkos::fence(); + symbolicTime += timer.seconds(); + + timer.reset(); + KokkosSparse::Experimental::mdf_numeric(A, handle); + Kokkos::fence(); + numericTime += timer.seconds(); + } + + std::cout << "Mean total time: " + << handleTime + (symbolicTime / params.repeat) + + (numericTime / params.repeat) + << std::endl + << "Handle time: " << handleTime << std::endl + << "Mean symbolic time: " << (symbolicTime / 
params.repeat) + << std::endl + << "Mean numeric time: " << (numericTime / params.repeat) + << std::endl; + + if (params.verbose) { + entries_t permutation = handle.get_permutation(); + + std::cout << "MDF permutation:" << std::endl; + KokkosKernels::Impl::print_1Dview(permutation); + } +} // run_experiment + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr + << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp " + "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" + " | '--sycl [syclDeviceIndex]'" + << std::endl; + + std::cerr << "\t[Optional] --amtx :: input matrix" << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + "MDF" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\nSettings for randomly generated A matrix" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; + std::cerr + << "\t[Optional] --nnz :: number of entries per row to generate" + << std::endl; + std::cerr << "\t[Optional] --diag :: generate a diagonal matrix" + << std::endl; +} // print_options + +int parse_inputs(Params& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "--threads")) { + params.use_threads = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { + params.use_openmp = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { + params.use_hip = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + params.amtx = argv[++i]; + 
} else if (0 == Test::string_compare_no_case(argv[i], "--m")) { + params.m = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { + params.n = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--nnz")) { + params.nnzPerRow = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--diag")) { + params.diag = true; + } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { + params.repeat = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { + params.verbose = true; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +int main(int argc, char** argv) { + Params params; + + if (parse_inputs(params, argc, argv)) { + return 1; + } + const int num_threads = + std::max(params.use_openmp, + params.use_threads); // Assumption is that use_openmp variable + // is provided as number of threads + + // If cuda, hip or sycl is used, set device_id + int device_id = 0; + if (params.use_cuda > 0) { + device_id = params.use_cuda - 1; + } + if (params.use_hip > 0) { + device_id = params.use_hip - 1; + } + if (params.use_sycl > 0) { + device_id = params.use_sycl - 1; + } + + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); + + bool useOMP = params.use_openmp != 0; + bool useThreads = params.use_threads != 0; + bool useCUDA = params.use_cuda != 0; + bool useHIP = params.use_hip != 0; + bool useSYCL = params.use_sycl != 0; + bool useSerial = !useOMP && !useThreads && !useCUDA && !useHIP && !useSYCL; + + if (useOMP) { +#if defined(KOKKOS_ENABLE_OPENMP) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + if (useThreads) { +#if defined(KOKKOS_ENABLE_THREADS) + using crsMat_t = +
KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: Threads requested, but not available.\n"; + return 1; +#endif + } + if (useCUDA) { +#if defined(KOKKOS_ENABLE_CUDA) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + if (useHIP) { +#if defined(KOKKOS_ENABLE_HIP) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + if (useSYCL) { +#if defined(KOKKOS_ENABLE_SYCL) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + if (useSerial) { +#if defined(KOKKOS_ENABLE_SERIAL) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + Kokkos::finalize(); + return 0; +} // main diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index feee2d765b..b8a25485f5 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -155,12 +155,133 @@ struct MDF_discarded_fill_norm { A.graph.row_map(rowIdx) - 1); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - rowIdx, KAS::sqrt(discard_norm), deficiency(rowIdx), degree); + static_cast(rowIdx), + static_cast(KAS::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); } } }; // MDF_discarded_fill_norm +template +struct MDF_selective_discarded_fill_norm { + using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; + using col_ind_type = + typename static_crs_graph_type::entries_type::non_const_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using size_type =
typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + + const scalar_type zero = KAS::zero(); + + crs_matrix_type A, At; + ordinal_type factorization_step; + col_ind_type permutation; + col_ind_type update_list; + + values_type discarded_fill; + col_ind_type deficiency; + int verbosity; + + MDF_selective_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, + ordinal_type factorization_step_, + col_ind_type permutation_, + col_ind_type update_list_, + values_type discarded_fill_, + col_ind_type deficiency_, int verbosity_) + : A(A_), + At(At_), + factorization_step(factorization_step_), + permutation(permutation_), + update_list(update_list_), + discarded_fill(discarded_fill_), + deficiency(deficiency_), + verbosity(verbosity_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type i) const { + ordinal_type rowIdx = permutation(update_list(i)); + scalar_type discard_norm = zero, diag_val = zero; + bool entryIsDiscarded = true; + ordinal_type numFillEntries = 0; + for (size_type alphaIdx = At.graph.row_map(rowIdx); + alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { + ordinal_type fillRowIdx = At.graph.entries(alphaIdx); + bool row_not_eliminated = true; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { + if (fillRowIdx == permutation(stepIdx)) { + row_not_eliminated = false; + } + } + + if (fillRowIdx != rowIdx && row_not_eliminated) { + for (size_type betaIdx = A.graph.row_map(rowIdx); + betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { + ordinal_type fillColIdx = A.graph.entries(betaIdx); + bool col_not_eliminated = true; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; + ++stepIdx) { + if (fillColIdx == permutation(stepIdx)) { + col_not_eliminated = false; + } + } + + if (fillColIdx != rowIdx && col_not_eliminated) { + entryIsDiscarded = true; + 
for (size_type entryIdx = A.graph.row_map(fillRowIdx); + entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { + if (A.graph.entries(entryIdx) == fillColIdx) { + entryIsDiscarded = false; + } + } + if (entryIsDiscarded) { + numFillEntries += 1; + discard_norm += + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Adding value A[%d,%d]=%f to discard norm of row %d\n", + static_cast(At.graph.entries(alphaIdx)), + static_cast(A.graph.entries(betaIdx)), + static_cast( + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), + static_cast(rowIdx)); + } + } + } + } + } else if (fillRowIdx == rowIdx) { + diag_val = At.values(alphaIdx); + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value detected, values(%d)=%f\n", + static_cast(rowIdx), static_cast(alphaIdx), + static_cast(At.values(alphaIdx))); + } + } + } + + // TODO add a check on `diag_val == zero` + discard_norm = discard_norm / (diag_val * diag_val); + discarded_fill(rowIdx) = discard_norm; + deficiency(rowIdx) = numFillEntries; + if (verbosity > 0) { + const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - + A.graph.row_map(rowIdx) - 1); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", + static_cast(rowIdx), + static_cast(KAS::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); + } + } + +}; // MDF_selective_discarded_fill_norm + template struct MDF_select_row { using values_type = typename crs_matrix_type::values_type::non_const_type; @@ -294,6 +415,8 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; + values_type discarded_fill; + col_ind_type factored; ordinal_type selected_row_idx, factorization_step; int verbosity; @@ -303,6 +426,7 @@ struct MDF_factorize_row { values_type
valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, + values_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, ordinal_type factorization_step_, int verbosity_) : A(A_), @@ -315,6 +439,8 @@ struct MDF_factorize_row { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + discarded_fill(discarded_fill_), + factored(factored_), selected_row_idx(selected_row_idx_), factorization_step(factorization_step_), verbosity(verbosity_){}; @@ -322,6 +448,7 @@ struct MDF_factorize_row { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type /* idx */) const { const ordinal_type selected_row = permutation(selected_row_idx); + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); // Swap entries in permutation vectors permutation(selected_row_idx) = permutation(factorization_step); @@ -332,7 +459,8 @@ struct MDF_factorize_row { if (verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", permutation(rowIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(permutation(rowIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -356,23 +484,27 @@ struct MDF_factorize_row { if (verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - selected_row, diag); + static_cast(selected_row), + static_cast(diag)); } if (verbosity > 2) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", int(row_mapU(rowIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(row_mapU(rowIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); for (size_type entryIdx = row_mapU(0); entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", 
int(entriesU(entryIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesU(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); for (size_type entryIdx = row_mapU(0); entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", valuesU(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesU(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -397,17 +529,21 @@ struct MDF_factorize_row { if (verbosity > 2) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - int(factorization_step), int(factorization_step), - int(factorization_step + 1), int(row_mapL(factorization_step)), - int(row_mapL(factorization_step + 1))); + static_cast(factorization_step), + static_cast(factorization_step), + static_cast(factorization_step + 1), + static_cast(row_mapL(factorization_step)), + static_cast(row_mapL(factorization_step + 1))); for (size_type entryIdx = row_mapL(factorization_step); entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", int(entriesL(entryIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesL(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); for (size_type entryIdx = row_mapL(factorization_step); entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", valuesL(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesL(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -466,8 +602,10 @@ struct MDF_factorize_row { if (verbosity > 1) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", int(fillRowIdx), int(fillColIdx), - At.values(alphaIdx) * A.values(betaIdx) / diag_val); + "A[%d, %d] -= %f\n", static_cast(fillRowIdx), + static_cast(fillColIdx), + static_cast(At.values(alphaIdx) * + A.values(betaIdx) / diag_val)); } } } @@ -484,15 +622,19 @@ struct MDF_factorize_row { } } + factored(selected_row) = 1; + if 
(verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", A.values(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(A.values(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", At.values(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(At.values(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -500,6 +642,70 @@ struct MDF_factorize_row { }; // MDF_factorize_row +template +struct MDF_compute_list_length { + using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: + entries_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + + ordinal_type selected_row_idx; + crs_matrix_type A; + crs_matrix_type At; + col_ind_type permutation; + col_ind_type factored; + col_ind_type update_list_length; + col_ind_type update_list; + + MDF_compute_list_length(const ordinal_type rowIdx_, const crs_matrix_type& A_, + const crs_matrix_type& At_, + const col_ind_type& permutation_, + const col_ind_type factored_, + col_ind_type& update_list_length_, + col_ind_type& update_list_) + : selected_row_idx(rowIdx_), + A(A_), + At(At_), + permutation(permutation_), + factored(factored_), + update_list_length(update_list_length_), + update_list(update_list_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type /*idx*/) const { + const ordinal_type selected_row = permutation(selected_row_idx); + + size_type updateIdx = 0; + for (size_type entryIdx = A.graph.row_map(selected_row); + entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { + if ((A.graph.entries(entryIdx) != selected_row) && + (factored(A.graph.entries(entryIdx)) != 1)) { + update_list(updateIdx) = 
A.graph.entries(entryIdx); + ++updateIdx; + } + } + size_type update_rows = updateIdx; + for (size_type entryIdx = At.graph.row_map(selected_row); + entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { + if ((At.graph.entries(entryIdx) != selected_row) && + (factored(At.graph.entries(entryIdx)) != 1)) { + bool already_updated = false; + for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { + if (At.graph.entries(entryIdx) == update_list(checkIdx)) { + already_updated = true; + break; + } + } + if (already_updated == false) { + update_list(updateIdx) = At.graph.entries(entryIdx); + ++updateIdx; + } + } + } + update_list_length(0) = updateIdx; + } +}; + template struct MDF_reindex_matrix { col_ind_type permutation_inv; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 33229b6cdb..90fa3beeef 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -34,7 +34,7 @@ namespace KokkosSparse { namespace Experimental { template -void mdf_symbolic_phase(crs_matrix_type& A, MDF_handle& handle) { +void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; @@ -60,10 +60,10 @@ void mdf_symbolic_phase(crs_matrix_type& A, MDF_handle& handle) { } return; -} // mdf_symbolic_phase +} // mdf_symbolic template -void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { +void mdf_numeric(crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; using values_type = typename crs_matrix_type::values_type::non_const_type; @@ -78,13 +78,26 @@ void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { // compute discarded fill of each row // selected pivot based on MDF // factorize pivot row of A - crs_matrix_type Atmp = crs_matrix_type("A fill", A); + const int verbosity_level = handle.verbosity; +
crs_matrix_type Atmp = crs_matrix_type("A fill", A); crs_matrix_type At = KokkosSparse::Impl::transpose_matrix(A); KokkosSparse::sort_crs_matrix(At); values_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); + col_ind_type update_list_length("update list length", 1); + typename col_ind_type::HostMirror update_list_length_host = + Kokkos::create_mirror_view(update_list_length); + col_ind_type update_list("update list", A.numRows()); + col_ind_type factored("factored rows", A.numRows()); + Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); + Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); + + KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( + Atmp, At, 0, handle.permutation, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for("MDF: initial fill computation", + range_policy_type(0, Atmp.numRows()), MDF_df_norm); - const int verbosity_level = handle.verbosity; for (ordinal_type factorization_step = 0; factorization_step < A.numRows(); ++factorization_step) { if (verbosity_level > 0) { @@ -92,44 +105,58 @@ void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { static_cast(factorization_step)); } - range_policy_type stepPolicy(factorization_step, Atmp.numRows()); - Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); - Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); - KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( - Atmp, At, factorization_step, handle.permutation, discarded_fill, - deficiency, verbosity_level); - Kokkos::parallel_for(stepPolicy, MDF_df_norm); + Kokkos::deep_copy(update_list_length_host, update_list_length); + range_policy_type updatePolicy(0, update_list_length_host(0)); + KokkosSparse::Impl::MDF_selective_discarded_fill_norm + MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, + update_list, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for("MDF: updating fill norms", 
updatePolicy, + MDF_update_df_norm); + range_policy_type stepPolicy(factorization_step, Atmp.numRows()); ordinal_type selected_row_idx = 0; KokkosSparse::Impl::MDF_select_row MDF_row_selector( factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, handle.permutation); - Kokkos::parallel_reduce(stepPolicy, MDF_row_selector, selected_row_idx); + Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, + selected_row_idx); + + KokkosSparse::Impl::MDF_compute_list_length + compute_list_length(selected_row_idx, Atmp, At, handle.permutation, + factored, update_list_length, update_list); + Kokkos::parallel_for("MDF: compute update list", range_policy_type(0, 1), + compute_list_length); KokkosSparse::Impl::MDF_factorize_row factorize_row( Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, selected_row_idx, factorization_step, - verbosity_level); - Kokkos::parallel_for(range_policy_type(0, 1), factorize_row); + handle.permutation_inv, discarded_fill, factored, selected_row_idx, + factorization_step, verbosity_level); + Kokkos::parallel_for("MDF: factorize row", range_policy_type(0, 1), + factorize_row); if (verbosity_level > 0) { printf("\n"); } - } + } // Loop over factorization steps KokkosSparse::Impl::MDF_reindex_matrix reindex_U( handle.permutation_inv, handle.entriesU); - Kokkos::parallel_for(range_policy_type(0, handle.entriesU.extent(0)), + Kokkos::parallel_for("MDF: re-index U", + range_policy_type(0, handle.entriesU.extent(0)), reindex_U); KokkosSparse::Impl::MDF_reindex_matrix reindex_L( handle.permutation_inv, handle.entriesL); - Kokkos::parallel_for(range_policy_type(0, handle.entriesL.extent(0)), + Kokkos::parallel_for("MDF: re-index L", + range_policy_type(0, handle.entriesL.extent(0)), reindex_L); + handle.L = KokkosSparse::Impl::transpose_matrix(handle.L); + return; -} // mdf_numeric_phase +} // mdf_numeric } // namespace 
Experimental } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index eb44657337..6f6f2658be 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -60,6 +60,8 @@ struct MDF_handle { int verbosity; + crs_matrix_type L, U; + MDF_handle(const crs_matrix_type A) : numRows(A.numRows()), permutation(col_ind_type("row permutation", A.numRows())), @@ -74,31 +76,28 @@ struct MDF_handle { entriesL = col_ind_type("entries L", nnzL); valuesL = values_type("values L", nnzL); + L = crs_matrix_type("L", numRows, numRows, nnzL, valuesL, row_mapL, + entriesL); + // Allocate U row_mapU = row_map_type("row map U", numRows + 1); entriesU = col_ind_type("entries U", nnzU); valuesU = values_type("values U", nnzU); + + U = crs_matrix_type("U", numRows, numRows, nnzU, valuesU, row_mapU, + entriesU); } col_ind_type get_permutation() { return permutation; } void sort_factors() { - KokkosSparse::sort_crs_matrix(row_mapL, entriesL, valuesL); - KokkosSparse::sort_crs_matrix(row_mapU, entriesU, valuesU); + KokkosSparse::sort_crs_matrix(L); + KokkosSparse::sort_crs_matrix(U); } - crs_matrix_type getL() { - return KokkosSparse::Impl::transpose_matrix( - crs_matrix_type("L", numRows, numRows, entriesL.extent(0), valuesL, - row_mapL, entriesL)); - } + crs_matrix_type getL() { return L; } - crs_matrix_type getU() { - return crs_matrix_type("U", numRows, numRows, entriesU.extent(0), valuesU, - row_mapU, entriesU); - } + crs_matrix_type getU() { return U; } }; } // namespace Experimental diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index 3fcd827292..41204c9b4d 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -32,6 +32,8 @@ void run_test_mdf() { using values_type = typename crs_matrix_type::values_type::non_const_type; using value_type = typename crs_matrix_type::value_type; + const value_type four 
= static_cast(4.0); + constexpr ordinal_type numRows = 16; constexpr ordinal_type numCols = 16; constexpr size_type numNonZeros = 64; @@ -70,8 +72,8 @@ void run_test_mdf() { KokkosSparse::Experimental::MDF_handle handle(A); handle.set_verbosity(0); - mdf_symbolic_phase(A, handle); - mdf_numeric_phase(A, handle); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + KokkosSparse::Experimental::mdf_numeric(A, handle); col_ind_type permutation = handle.get_permutation(); @@ -83,20 +85,97 @@ void run_test_mdf() { 7, 11, 13, 14, 5, 6, 9, 10}; printf("MDF ordering: { "); for (ordinal_type idx = 0; idx < A.numRows(); ++idx) { - ; printf("%d ", static_cast(permutation_h(idx))); if (permutation_h(idx) != permutation_ref[idx]) { success = false; } } printf("}\n"); - EXPECT_TRUE(success) << "The permutation computed is different from the reference solution!"; + // Check the factors L and U handle.sort_factors(); crs_matrix_type U = handle.getU(); crs_matrix_type L = handle.getL(); + + EXPECT_TRUE(U.numRows() == 16); + EXPECT_TRUE(U.nnz() == 40); + + { + auto row_map_U = Kokkos::create_mirror(U.graph.row_map); + Kokkos::deep_copy(row_map_U, U.graph.row_map); + auto entries_U = Kokkos::create_mirror(U.graph.entries); + Kokkos::deep_copy(entries_U, U.graph.entries); + auto values_U = Kokkos::create_mirror(U.values); + Kokkos::deep_copy(values_U, U.values); + + const size_type row_map_U_ref[17] = {0, 3, 6, 9, 12, 15, 17, 20, 22, + 25, 27, 30, 32, 35, 37, 39, 40}; + const ordinal_type entries_U_ref[40] = { + 0, 4, 6, 1, 5, 8, 2, 7, 10, 3, 9, 11, 4, 5, + 12, 5, 13, 6, 7, 12, 7, 14, 8, 9, 13, 9, 15, 10, + 11, 14, 11, 15, 12, 13, 14, 13, 15, 14, 15, 15}; + + const scalar_type val0 = static_cast(15. 
/ 4.); + const scalar_type val1 = static_cast(val0 - 1 / val0); + const scalar_type val2 = static_cast(4 - 2 / val0); + const scalar_type val3 = + static_cast(4 - 1 / val0 - 1 / val1 - 1 / val2); + const scalar_type val4 = static_cast(4 - 2 / val1 - 2 / val3); + const scalar_type values_U_ref[40] = { + 4, -1, -1, 4, -1, -1, 4, -1, -1, 4, -1, -1, val0, -1, -1, + val1, -1, val0, -1, -1, val1, -1, val0, -1, -1, val1, -1, val0, -1, -1, + val1, -1, val2, -1, -1, val3, -1, val3, -1, val4}; + + for (int idx = 0; idx < 17; ++idx) { + EXPECT_TRUE(row_map_U_ref[idx] == row_map_U(idx)) + << "rowmap_U(" << idx << ") is wrong!"; + } + for (int idx = 0; idx < 40; ++idx) { + EXPECT_TRUE(entries_U_ref[idx] == entries_U(idx)) + << "entries_U(" << idx << ") is wrong!"; + EXPECT_NEAR_KK(values_U_ref[idx], values_U(idx), + 10 * Kokkos::ArithTraits::eps(), + "An entry in U.values is wrong!"); + } + + auto row_map_L = Kokkos::create_mirror(L.graph.row_map); + Kokkos::deep_copy(row_map_L, L.graph.row_map); + auto entries_L = Kokkos::create_mirror(L.graph.entries); + Kokkos::deep_copy(entries_L, L.graph.entries); + auto values_L = Kokkos::create_mirror(L.values); + Kokkos::deep_copy(values_L, L.values); + + const size_type row_map_L_ref[17] = {0, 1, 2, 3, 4, 6, 9, 11, 14, + 16, 19, 21, 24, 27, 31, 35, 40}; + const ordinal_type entries_L_ref[40] = { + 0, 1, 2, 3, 0, 4, 1, 4, 5, 0, 6, 2, 6, 7, + 1, 8, 3, 8, 9, 2, 10, 3, 10, 11, 4, 6, 12, 5, + 8, 12, 13, 7, 10, 12, 14, 9, 11, 13, 14, 15}; + const scalar_type values_L_ref[40] = { + 1, 1, 1, 1, -1 / four, 1, + -1 / four, -1 / val0, 1, -1 / four, 1, -1 / four, + -1 / val0, 1, -1 / four, 1, -1 / four, -1 / val0, + 1, -1 / four, 1, -1 / four, -1 / val0, 1, + -1 / val0, -1 / val0, 1, -1 / val1, -1 / val0, -1 / val2, + 1, -1 / val1, -1 / val0, -1 / val2, 1, -1 / val1, + -1 / val1, -1 / val3, -1 / val3, 1}; + + for (int idx = 0; idx < 17; ++idx) { + EXPECT_TRUE(row_map_L_ref[idx] == row_map_L(idx)) + << "rowmap_L(" << idx << ")=" << row_map_L(idx) 
<< " is wrong!"; + } + for (int idx = 0; idx < 40; ++idx) { + EXPECT_TRUE(entries_L_ref[idx] == entries_L(idx)) + << "entries_L(" << idx << ")=" << entries_L(idx) + << " is wrong, entries_L_ref[" << idx << "]=" << entries_L_ref[idx] + << "!"; + EXPECT_NEAR_KK(values_L_ref[idx], values_L(idx), + 10 * Kokkos::ArithTraits::eps(), + "An entry in L.values is wrong!"); + } + } } } // namespace Test From 9586dd948ecb49a110e1fb8823ee275e51cfc4eb Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 8 Feb 2023 16:23:05 -0700 Subject: [PATCH 044/442] Use sptrsv instead of blas::trsm --- sparse/src/KokkosSparse_LUPrec.hpp | 59 +++++++++++++---------- sparse/unit_test/Test_Sparse_par_ilut.hpp | 38 ++------------- 2 files changed, 38 insertions(+), 59 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 48ba45588b..c0e0bf4c07 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -22,9 +22,8 @@ #include #include -#include #include -#include +#include namespace KokkosSparse { namespace Experimental { @@ -40,32 +39,43 @@ namespace Experimental { /// - compute() Does nothing; members initialized upon object construction. /// - isComputed() returns true /// -template +template class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using karith = typename Kokkos::ArithTraits; - using View2dD = typename Kokkos::View; - using View2dH = typename View2dD::HostMirror; + using View1d = typename Kokkos::View; private: // trsm takes host views - View2dH _L, _U, _tmp; + CRS _L, _U; + View1d _tmp; + mutable KernelHandle _khL; + mutable KernelHandle _khU; public: //! 
Constructor: - template - LUPrec(const ViewArg &L, const ViewArg &U) - : _L("LUPrec::_L", L.extent(0), L.extent(1)), - _U("LUPrec::_U", U.extent(0), U.extent(1)), - _tmp("LUPrec::_tmp", L.extent(0), 1) { - Kokkos::deep_copy(_L, L); - Kokkos::deep_copy(_U, U); + template + LUPrec(const CRSArg &L, const CRSArg &U) : + _L(L), + _U(U), + _tmp("LUPrec::_tmp", L.numRows()), + _khL(), + _khU() + { + KK_REQUIRE_MSG(L.numRows() == U.numRows(), "LUPrec: L.numRows() != U.numRows()"); + + _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), true); + _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), false); } //! Destructor. - virtual ~LUPrec() {} + virtual ~LUPrec() + { + _khL.destroy_sptrsv_handle(); + _khU.destroy_sptrsv_handle(); + } ///// \brief Apply the preconditioner to X, putting the result in Y. ///// @@ -76,25 +86,22 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ///// for conjugate transpose. All characters after the first are ///// ignored. This works just like the BLAS routines. ///// \param alpha [in] Input coefficient of M*x - ///// \param beta [in] Input coefficient of Y + ///// \param beta [in] Not used. ///// - ///// If the result of applying this preconditioner to a vector X is - ///// \f$M \cdot X\f$, then this method computes \f$Y = \beta Y + \alpha M - ///\cdot X\f$. - ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. 
+ ///// It takes L and U and the stores U^inv L^inv X in Y // virtual void apply(const Kokkos::View &X, const Kokkos::View &Y, const char transM[] = "N", ScalarType alpha = karith::one(), ScalarType = karith::zero()) const { - // tmp = trsm(L, x); //Apply L^inv to x - // y = trsm(U, tmp); //Apply U^inv to tmp - auto tmpsv = Kokkos::subview(_tmp, Kokkos::ALL, 0); - Kokkos::deep_copy(tmpsv, X); - KokkosBlas::Impl::SerialTrsm_Invoke("L", "L", transM, "N", alpha, _L, _tmp); - KokkosBlas::Impl::SerialTrsm_Invoke("L", "U", transM, "N", alpha, _U, _tmp); - Kokkos::deep_copy(Y, tmpsv); + // tmp = trsv(L, x); //Apply L^inv to x + // y = trsv(U, tmp); //Apply U^inv to tmp + sptrsv_symbolic( &_khL, _L.graph.row_map, _L.graph.entries ); + sptrsv_solve( &_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp ); + + sptrsv_symbolic( &_khU, _U.graph.row_map, _U.graph.entries ); + sptrsv_solve( &_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, Y ); } //@} diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 6607b3314f..cf34234347 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -85,30 +85,6 @@ std::vector> decompress_matrix( return result; } -template -void decompress_matrix(Kokkos::View& row_map, - Kokkos::View& entries, - Kokkos::View& values, - Kokkos::View& output) { - using exe_space = typename device::execution_space; - - const size_type nrows = row_map.size() - 1; - - Kokkos::parallel_for( - Kokkos::RangePolicy(0, nrows), - KOKKOS_LAMBDA(const int& row_idx) { - const size_type row_nnz_begin = row_map(row_idx); - const size_type row_nnz_end = row_map(row_idx + 1); - for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; - ++row_nnz) { - const lno_t col_idx = entries(row_nnz); - const scalar_t value = values(row_nnz); - output(row_idx, col_idx) = value; - } - }); -} - template void check_matrix(const std::string& name, @@ -386,13 +362,10 @@ void 
run_test_par_ilut_precond() { #endif ); - // Convert L, U parILUT outputs to uncompressed 2d views as required - // by LUPrec - Kokkos::View L_uncompressed("L_uncompressed", numRows, - numRows), - U_uncompressed("U_uncompressed", numRows, numRows); - decompress_matrix(L_row_map, L_entries, L_values, L_uncompressed); - decompress_matrix(U_row_map, U_entries, U_values, U_uncompressed); + // Create CRSs + sp_matrix_type + L("L", numRows, numCols, L_values.extent(0), L_values, L_row_map, L_entries), + U("U", numRows, numCols, U_values.extent(0), U_values, U_row_map, U_entries); // Set initial vectors: ViewVectorType X("X", n); // Solution and initial guess @@ -431,8 +404,7 @@ void run_test_par_ilut_precond() { gmres_handle->set_verbose(verbose); // Make precond - KokkosSparse::Experimental::LUPrec myPrec(L_uncompressed, - U_uncompressed); + KokkosSparse::Experimental::LUPrec myPrec(L, U); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); From 6a4bf14cec2e3fd294e378e86076ae55df3cf8d1 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 8 Feb 2023 16:42:29 -0700 Subject: [PATCH 045/442] Address GH feedback --- common/src/KokkosKernels_Error.hpp | 14 +------------- sparse/src/KokkosSparse_LUPrec.hpp | 2 +- sparse/src/KokkosSparse_par_ilut.hpp | 3 +++ 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index ebcdf7ea81..5b0b6cc34f 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -77,25 +77,13 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, } while (0) // SYCL cannot printf like the other backends quite yet -#ifdef __SYCL_DEVICE_ONLY__ -#define IMPL_KERNEL_THROW(condition, msg) \ - do { \ - if (!(condition)) { \ - const __attribute__((opencl_constant)) char format[] = \ - "KERNEL CHECK FAILED:\n %s %s\n"; \ - sycl::ext::oneapi::experimental::printf(format, #condition, msg); \ - Kokkos::abort(""); \ - } \ - } while (0) 
-#else #define IMPL_KERNEL_THROW(condition, msg) \ do { \ if (!(condition)) { \ - printf("KERNEL CHECK FAILED:\n %s\n %s\n", #condition, msg); \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("KERNEL CHECK FAILED:\n %s\n %s\n", #condition, msg); \ Kokkos::abort(""); \ } \ } while (0) -#endif #ifndef NDEBUG #define KK_ASSERT(condition) IMPL_THROW(condition, "", std::logic_error) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index c0e0bf4c07..40d934b762 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -45,7 +45,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using karith = typename Kokkos::ArithTraits; - using View1d = typename Kokkos::View; + using View1d = typename Kokkos::View; private: // trsm takes host views diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index ca46976366..ee16e4c71e 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -348,6 +348,9 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, KokkosKernels::Impl::throw_runtime_exception(os.str()); } + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(L_rowmap, L_entries), "L is not sorted"); + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(U_rowmap, U_entries), "U is not sorted"); + using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; From 91222dba212b5c12ee68e54b70d7168a13360b2f Mon Sep 17 00:00:00 2001 From: James Foucar Date: Wed, 8 Feb 2023 16:46:08 -0700 Subject: [PATCH 046/442] formatting --- common/src/KokkosKernels_Error.hpp | 13 ++++----- sparse/src/KokkosSparse_LUPrec.hpp | 33 +++++++++++------------ sparse/src/KokkosSparse_par_ilut.hpp | 6 +++-- 
sparse/unit_test/Test_Sparse_par_ilut.hpp | 10 ++++--- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 5b0b6cc34f..9ebb104378 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -77,12 +77,13 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, } while (0) // SYCL cannot printf like the other backends quite yet -#define IMPL_KERNEL_THROW(condition, msg) \ - do { \ - if (!(condition)) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("KERNEL CHECK FAILED:\n %s\n %s\n", #condition, msg); \ - Kokkos::abort(""); \ - } \ +#define IMPL_KERNEL_THROW(condition, msg) \ + do { \ + if (!(condition)) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("KERNEL CHECK FAILED:\n %s\n %s\n", \ + #condition, msg); \ + Kokkos::abort(""); \ + } \ } while (0) #ifndef NDEBUG diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 40d934b762..0b87d58b3d 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -45,7 +45,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using karith = typename Kokkos::ArithTraits; - using View1d = typename Kokkos::View; + using View1d = typename Kokkos::View; private: // trsm takes host views @@ -57,22 +57,19 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: //! 
Constructor: template - LUPrec(const CRSArg &L, const CRSArg &U) : - _L(L), - _U(U), - _tmp("LUPrec::_tmp", L.numRows()), - _khL(), - _khU() - { - KK_REQUIRE_MSG(L.numRows() == U.numRows(), "LUPrec: L.numRows() != U.numRows()"); - - _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), true); - _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), false); + LUPrec(const CRSArg &L, const CRSArg &U) + : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numRows()), _khL(), _khU() { + KK_REQUIRE_MSG(L.numRows() == U.numRows(), + "LUPrec: L.numRows() != U.numRows()"); + + _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), + true); + _khU.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, U.numRows(), + false); } //! Destructor. - virtual ~LUPrec() - { + virtual ~LUPrec() { _khL.destroy_sptrsv_handle(); _khU.destroy_sptrsv_handle(); } @@ -97,11 +94,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ScalarType = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp - sptrsv_symbolic( &_khL, _L.graph.row_map, _L.graph.entries ); - sptrsv_solve( &_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp ); + sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); + sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); - sptrsv_symbolic( &_khU, _U.graph.row_map, _U.graph.entries ); - sptrsv_solve( &_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, Y ); + sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, Y); } //@} diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index ee16e4c71e..21371792c0 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -348,8 +348,10 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, 
KokkosKernels::Impl::throw_runtime_exception(os.str()); } - KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(L_rowmap, L_entries), "L is not sorted"); - KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(U_rowmap, U_entries), "U is not sorted"); + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(L_rowmap, L_entries), + "L is not sorted"); + KK_REQUIRE_MSG(KokkosSparse::Impl::isCrsGraphSorted(U_rowmap, U_entries), + "U is not sorted"); using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index cf34234347..377d8127ec 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -363,9 +363,10 @@ void run_test_par_ilut_precond() { ); // Create CRSs - sp_matrix_type - L("L", numRows, numCols, L_values.extent(0), L_values, L_row_map, L_entries), - U("U", numRows, numCols, U_values.extent(0), U_values, U_row_map, U_entries); + sp_matrix_type L("L", numRows, numCols, L_values.extent(0), L_values, + L_row_map, L_entries), + U("U", numRows, numCols, U_values.extent(0), U_values, U_row_map, + U_entries); // Set initial vectors: ViewVectorType X("X", n); // Solution and initial guess @@ -404,7 +405,8 @@ void run_test_par_ilut_precond() { gmres_handle->set_verbose(verbose); // Make precond - KokkosSparse::Experimental::LUPrec myPrec(L, U); + KokkosSparse::Experimental::LUPrec myPrec(L, + U); // reset X for next gmres call Kokkos::deep_copy(X, 0.0); From a05f21e3b096f9ab75397c969e3bf1f5faf528a0 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 8 Feb 2023 22:40:01 -0800 Subject: [PATCH 047/442] Prefer View::{R->r}ank --- .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 18 ++++----- .../dense/impl/KokkosBatched_Dot_Internal.hpp | 36 ++++++++--------- .../KokkosBatched_Gemv_TeamVector_Impl.hpp | 8 ++-- .../impl/KokkosBatched_Gemv_Team_Impl.hpp | 8 ++-- 
.../dense/impl/KokkosBatched_Gesv_Impl.hpp | 24 +++++------ .../KokkosBatched_HadamardProduct_Impl.hpp | 18 ++++----- .../dense/impl/KokkosBatched_Xpay_Impl.hpp | 12 +++--- .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 20 +++++----- .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 24 +++++------ .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 20 +++++----- blas/impl/KokkosBlas1_axpby_impl.hpp | 12 +++--- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 40 +++++++++---------- blas/impl/KokkosBlas1_axpby_spec.hpp | 18 ++++----- blas/impl/KokkosBlas2_team_gemv_spec.hpp | 18 ++++----- blas/src/KokkosBlas1_axpby.hpp | 8 ++-- blas/src/KokkosBlas1_nrm2.hpp | 2 +- .../tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 16 ++++---- blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 4 +- .../tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 2 +- .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 2 +- 24 files changed, 159 insertions(+), 159 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index ab77c30e83..232ef5278c 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -188,11 +188,11 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, "KokkosBatched::axpy: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: 
alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -235,11 +235,11 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( "KokkosBatched::axpy: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -283,11 +283,11 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( "KokkosBatched::axpy: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::axpy: alphaViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::axpy: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::axpy: YViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::axpy: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index 92d7e7c07f..c50da7a3d4 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -175,11 +175,11 @@ struct SerialDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -221,11 +221,11 @@ struct SerialDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -270,11 +270,11 @@ struct TeamDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -317,11 +317,11 @@ struct TeamDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -366,11 +366,11 @@ struct TeamVectorDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -413,11 +413,11 @@ struct TeamVectorDot { "KokkosBatched::dot: YViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::dot: NormViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::dot: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::dot: YViewType must have rank 2."); - static_assert(NormViewType::Rank == 1, + static_assert(NormViewType::rank == 1, "KokkosBatched::dot: NormViewType must have rank 1."); // Check compatibility of dimensions at run time. diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index 12b2d88250..26e22fb00c 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -45,7 +45,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); return TeamVectorGemvInternal::template invoke< @@ -67,7 +67,7 @@ struct TeamVectorGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); Kokkos::abort( @@ -87,7 +87,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, + 
static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); return TeamVectorGemvInternal::template invoke< @@ -109,7 +109,7 @@ struct TeamVectorGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); Kokkos::abort( diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 18cd78fd31..b86796f4ff 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -45,7 +45,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamGemv requires rank-3 A matrix (use " "KokkosBlas::TeamGemv for regular rank-2 matrix)"); return TeamGemvInternal::template invoke< @@ -67,7 +67,7 @@ struct TeamGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamGemv requires rank-3 A matrix (use " "KokkosBlas::TeamGemv for regular rank-2 matrix)"); Kokkos::abort( @@ -87,7 +87,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamGemv requires rank-3 A matrix (use " "KokkosBlas::TeamGemv for regular rank-2 matrix)"); return TeamGemvInternal::template invoke< @@ -109,7 +109,7 @@ struct 
TeamGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::Rank == 3, + static_assert(AViewType::rank == 3, "Batched TeamGemv requires rank-3 A matrix (use " "KokkosBlas::TeamGemv for regular rank-2 matrix)"); Kokkos::abort( diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index c98aa08788..a72c1a04ce 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -376,9 +376,9 @@ struct SerialGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -449,9 +449,9 @@ struct SerialGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -501,9 +501,9 @@ struct TeamGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -579,9 +579,9 @@ struct TeamGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. @@ -637,9 +637,9 @@ struct TeamVectorGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -716,9 +716,9 @@ struct TeamVectorGesv { "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, + static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, + static_assert(VectorType::rank == 1, "KokkosBatched::gesv: VectorType must have rank 1."); // Check compatibility of dimensions at run time. diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index 7dcdc78811..ebd789c2e8 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -101,11 +101,11 @@ KOKKOS_INLINE_FUNCTION int SerialHadamardProduct::invoke(const XViewType& X, static_assert( Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::Rank == 2, + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. 
@@ -152,11 +152,11 @@ KOKKOS_INLINE_FUNCTION int TeamHadamardProduct::invoke( static_assert( Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::Rank == 2, + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. @@ -205,11 +205,11 @@ KOKKOS_INLINE_FUNCTION int TeamVectorHadamardProduct::invoke( static_assert( Kokkos::is_view::value, "KokkosBatched::HadamardProduct: VViewType is not a Kokkos::View."); - static_assert(XViewType::Rank == 2, + static_assert(XViewType::rank == 2, "KokkosBatched::HadamardProduct: XViewType must have rank 2."); - static_assert(YViewType::Rank == 2, + static_assert(YViewType::rank == 2, "KokkosBatched::HadamardProduct: YViewType must have rank 2."); - static_assert(VViewType::Rank == 2, + static_assert(VViewType::rank == 2, "KokkosBatched::HadamardProduct: VViewType must have rank 2."); // Check compatibility of dimensions at run time. 
diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index ba144cc778..4f90c0be38 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -197,9 +197,9 @@ KOKKOS_INLINE_FUNCTION int SerialXpay::invoke(const alphaViewType& alpha, "KokkosBatched::xpay: ViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::Rank == 2, + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -240,9 +240,9 @@ KOKKOS_INLINE_FUNCTION int TeamXpay::invoke( "KokkosBatched::xpay: ViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::Rank == 2, + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -284,9 +284,9 @@ KOKKOS_INLINE_FUNCTION int TeamVectorXpay::invoke( "KokkosBatched::xpay: ViewType is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBatched::xpay: alphaViewType is not a Kokkos::View."); - static_assert(ViewType::Rank == 2, + static_assert(ViewType::rank == 2, "KokkosBatched::xpay: ViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::xpay: alphaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index d5a19cb56b..cafdc602a0 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -138,17 +138,17 @@ struct SerialSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. @@ -232,13 +232,13 @@ struct SerialSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. 
diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 6f04427924..5c0edbd390 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -321,21 +321,21 @@ struct TeamVectorSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -420,13 +420,13 @@ struct TeamVectorSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index bf2f0a82e7..fb9f44e8b0 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -176,17 +176,17 @@ struct TeamSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: betaViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); - static_assert(alphaViewType::Rank == 1, + static_assert(alphaViewType::rank == 1, "KokkosBatched::spmv: alphaViewType must have rank 1."); - static_assert(betaViewType::Rank == 1, + static_assert(betaViewType::rank == 1, "KokkosBatched::spmv: betaViewType must have rank 1."); // Check compatibility of dimensions at run time. 
@@ -271,13 +271,13 @@ struct TeamSpmv { static_assert(Kokkos::is_view::value, "KokkosBatched::spmv: yViewType is not a Kokkos::View."); - static_assert(ValuesViewType::Rank == 2, + static_assert(ValuesViewType::rank == 2, "KokkosBatched::spmv: ValuesViewType must have rank 2."); - static_assert(IntView::Rank == 1, + static_assert(IntView::rank == 1, "KokkosBatched::spmv: IntView must have rank 2."); - static_assert(xViewType::Rank == 2, + static_assert(xViewType::rank == 2, "KokkosBatched::spmv: xViewType must have rank 2."); - static_assert(yViewType::Rank == 2, + static_assert(yViewType::rank == 2, "KokkosBatched::spmv: yViewType must have rank 2."); // Check compatibility of dimensions at run time. diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index e159ea798e..f6242c1514 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -75,10 +75,10 @@ struct Axpby_Functor { "KokkosBlas::Impl::Axpby_Functor: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YV::Rank == (int)XV::Rank, + static_assert((int)YV::rank == (int)XV::rank, "KokkosBlas::Impl::" "Axpby_Functor: X and Y must have the same rank."); - static_assert(YV::Rank == 1, + static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby_Functor: " "XV and YV must have rank 1."); @@ -211,10 +211,10 @@ struct Axpby_Functor +template struct axpby_eti_spec_avail { enum : bool { value = false }; }; @@ -127,7 +127,7 @@ namespace Impl { /// Any scalar coefficient of zero has BLAS semantics of /// ignoring the corresponding (multi)vector entry. This does NOT /// apply to coefficients in av and bv vectors, if they are used. 
-template ::value, bool eti_spec_avail = axpby_eti_spec_avail::value> struct Axpby { @@ -138,7 +138,7 @@ template struct Axpby { static void axpby(const AV& /* av */, const XMV& /* X */, const BV& /* bv */, const YMV& /* Y */) { - static_assert(YMV::Rank == 0, "Oh My God"); + static_assert(YMV::rank == 0, "Oh My God"); } }; @@ -160,10 +160,10 @@ struct Axpby { "KokkosBlas::Impl::Axpby::axpby: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YMV::Rank == (int)XMV::Rank, + static_assert((int)YMV::rank == (int)XMV::rank, "KokkosBlas::Impl::Axpby::axpby (MV): " "X and Y must have the same rank."); - static_assert(YMV::Rank == 2, + static_assert(YMV::rank == 2, "KokkosBlas::Impl::Axpby::axpby: " "X and Y must have rank 2."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -241,10 +241,10 @@ struct Axpby::axpby: Y is const. " "It must be nonconst, because it is an output argument " "(we have to be able to write to its entries)."); - static_assert((int)YV::Rank == (int)XV::Rank, + static_assert((int)YV::rank == (int)XV::rank, "KokkosBlas::Impl::" "Axpby::axpby: X and Y must have the same rank."); - static_assert(YV::Rank == 1, + static_assert(YV::rank == 1, "KokkosBlas::Impl::Axpby::axpby: " "X and Y must have rank 1."); diff --git a/blas/impl/KokkosBlas2_team_gemv_spec.hpp b/blas/impl/KokkosBlas2_team_gemv_spec.hpp index 355c1ca6cf..d46fb7be6f 100644 --- a/blas/impl/KokkosBlas2_team_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_team_gemv_spec.hpp @@ -61,7 +61,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(0), A.extent(1), alpha, 
A.data(), A.stride_0(), @@ -76,7 +76,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "KokkosBlas::TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), @@ -95,7 +95,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), @@ -110,7 +110,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), @@ -129,7 +129,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), @@ -145,7 +145,7 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - 
static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "BLAS TeamGemv requires rank-2 A matrix"); return Impl::TeamGemvInternal::invoke( member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), @@ -165,7 +165,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), @@ -184,7 +184,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), @@ -203,7 +203,7 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const AViewType& A, const xViewType& x, const ScalarType beta, const yViewType& y) { - static_assert(AViewType::Rank == 2, + static_assert(AViewType::rank == 2, "Batched TeamVectorGemv requires rank-2 A matrix"); return Impl::TeamVectorGemvInternal::invoke( member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index e2ec1dde0c..ff037e59d1 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -43,10 +43,10 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { "KokkosBlas::axpby: Y is const. 
It must be nonconst, " "because it is an output argument " "(we must be able to write to its entries)."); - static_assert(int(YMV::Rank) == int(XMV::Rank), + static_assert(int(YMV::rank) == int(XMV::rank), "KokkosBlas::axpby: " "X and Y must have the same rank."); - static_assert(YMV::Rank == 1 || YMV::Rank == 2, + static_assert(YMV::rank == 1 || YMV::rank == 2, "KokkosBlas::axpby: " "XMV and YMV must either have rank 1 or rank 2."); @@ -107,10 +107,10 @@ KOKKOS_FUNCTION void serial_axpy(const scalar_type alpha, const XMV X, YMV Y) { "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); static_assert(Kokkos::is_view::value, "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); - static_assert(XMV::Rank == 1 || XMV::Rank == 2, + static_assert(XMV::rank == 1 || XMV::rank == 2, "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); static_assert( - XMV::Rank == YMV::Rank, + XMV::rank == YMV::rank, "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index d3e2f03138..a8e56d95cf 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -140,7 +140,7 @@ serial_nrm2(const XMV X) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_view::value, "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); - static_assert(XMV::Rank == 1, + static_assert(XMV::rank == 1, "KokkosBlas::serial_nrm2: XMV must have rank 1"); #endif // KOKKOSKERNELS_DEBUG_LEVEL diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 7ae8ef87b3..3d7952a578 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_tpl_spec_avail { enum : bool { value 
= false }; }; diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 74fa4265d8..f69fc618a0 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -69,7 +69,7 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ + Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ @@ -104,7 +104,7 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ + Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ @@ -146,7 +146,7 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else \ - Axpby::axpby( \ + Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ @@ -188,7 +188,7 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else \ - Axpby::axpby( \ + Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ @@ -255,7 +255,7 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ + Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ @@ -294,7 +294,7 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ + Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ @@ -339,7 +339,7 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(Y.data()), one); \ } else \ - Axpby::axpby( \ + Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ @@ -383,7 +383,7 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(Y.data()), one); \ } else \ - Axpby::axpby( \ + 
Axpby::axpby( \ alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 97f4be71da..500bd5f239 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -20,8 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct dot_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 37b61a2361..71d7c664aa 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct iamax_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index a2ce0d4390..529952c10c 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm1_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 8b5476fd40..0680a72d99 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp 
b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 2fd3da50ee..54a74cfcf7 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrminf_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index d340c6bc02..dccc20b9ac 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_tpl_spec_avail { enum : bool { value = false }; }; From 1d19eeabbd3a0dd1bea4450e50ab8a117eaebedc Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 9 Feb 2023 09:14:57 -0700 Subject: [PATCH 048/442] CMake: export version and subversion to config file The new logic exports CMake variables to KokkosKernels_config.h so that users can use them in their code more easily. 
It also allows us to print more cleanly version in print_config() --- CMakeLists.txt | 11 +++++++++-- cmake/KokkosKernels_config.h.in | 6 +++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d685e648e..d75a45499c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,7 +27,12 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) SET(KokkosKernels_VERSION_MINOR 0) SET(KokkosKernels_VERSION_PATCH 99) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + + #Set variables for config file MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") + MATH(EXPR KOKKOSKERNELS_VERSION_MAJOR "${KOKKOSKERNELS_VERSION} / 10000") + MATH(EXPR KOKKOSKERNELS_VERSION_MINOR "${KOKKOSKERNELS_VERSION} / 100 % 100") + MATH(EXPR KOKKOSKERNELS_VERSION_PATCH "${KOKKOSKERNELS_VERSION} % 100") ENDIF() INCLUDE(GNUInstallDirs) @@ -214,7 +219,9 @@ ELSE() # CMake Summary # ================================================================== MESSAGE("") - MESSAGE("=======================") + MESSAGE("================================") + MESSAGE("Kokkos Kernels version: ${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + MESSAGE("================================") MESSAGE("Kokkos Kernels ETI Types") MESSAGE(" Devices: ${DEVICE_LIST}") MESSAGE(" Scalars: ${SCALAR_LIST}") @@ -238,7 +245,7 @@ ELSE() ELSE() MESSAGE(" (None)") ENDIF() - MESSAGE("=======================") + MESSAGE("================================") MESSAGE("") diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 1fb6a31544..22a6cd9416 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -9,7 +9,11 @@ // clang-format on /* Define the current version of Kokkos Kernels */ -#cmakedefine KOKKOSKERNELS_VERSION @KOKKOSKERNELS_VERSION@ 
+#define KOKKOSKERNELS_VERSION @KOKKOSKERNELS_VERSION@ +#define KOKKOSKERNELS_VERSION_MAJOR @KOKKOSKERNELS_VERSION_MAJOR@ +#define KOKKOSKERNELS_VERSION_MINOR @KOKKOSKERNELS_VERSION_MINOR@ +#define KOKKOSKERNELS_VERSION_PATCH @KOKKOSKERNELS_VERSION_PATCH@ + /* Define if fortran blas 1 function can return complex type */ #cmakedefine KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX From 6a6a51045fe669a4c0937d4f87c8298ab370918a Mon Sep 17 00:00:00 2001 From: James Foucar Date: Thu, 9 Feb 2023 10:06:32 -0700 Subject: [PATCH 049/442] Fix warnings --- sparse/src/KokkosSparse_LUPrec.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 0b87d58b3d..d288f341e1 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -79,18 +79,16 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ///// \tparam XViewType Input vector, as a 1-D Kokkos::View ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View ///// - ///// \param transM [in] "N" for non-transpose, "T" for transpose, "C" - ///// for conjugate transpose. All characters after the first are - ///// ignored. This works just like the BLAS routines. - ///// \param alpha [in] Input coefficient of M*x + ///// \param transM [in] Not used. + ///// \param alpha [in] Not used ///// \param beta [in] Not used. 
///// ///// It takes L and U and the stores U^inv L^inv X in Y // virtual void apply(const Kokkos::View &X, const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), + const char[] = "N", + ScalarType = karith::one(), ScalarType = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp From c284ef4ac111bffe8079eab22c43f02ed44a433c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 9 Feb 2023 11:00:25 -0700 Subject: [PATCH 050/442] Version: adding unit-test to verify that version info is available Small unit-test that asserts that the pre-processor macros to access the version number of Kokkos Kernels are available and that the values are consistent with each other. --- common/unit_test/Test_Common.hpp | 1 + common/unit_test/Test_Common_Version.hpp | 52 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 common/unit_test/Test_Common_Version.hpp diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index 9cf686f513..dd368f009b 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -22,5 +22,6 @@ #include #include #include +#include #endif // TEST_COMMON_HPP diff --git a/common/unit_test/Test_Common_Version.hpp b/common/unit_test/Test_Common_Version.hpp new file mode 100644 index 0000000000..f09f1709d0 --- /dev/null +++ b/common/unit_test/Test_Common_Version.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Common_Version.hpp +/// \brief Tests that the version information that Kokkos Kernels +/// makes available in KokkosKernels_config.h is properly +/// accessible and correct. + +#ifndef TEST_COMMON_VERSION_HPP +#define TEST_COMMON_VERSION_HPP + +#include +#include + +void test_version_info() { +#ifndef KOKKOSKERNELS_VERSION + static_assert(false, "KOKKOSKERNELS_VERSION macro is not defined!"); +#endif + +#ifndef KOKKOSKERNELS_VERSION_MAJOR + static_assert(false, "KOKKOSKERNELS_VERSION_MAJOR macro is not defined!"); +#endif + +#ifndef KOKKOSKERNELS_VERSION_MINOR + static_assert(false, "KOKKOSKERNELS_VERSION_MINOR macro is not defined!"); +#endif + +#ifndef KOKKOSKERNELS_VERSION_PATCH + static_assert(false, "KOKKOSKERNELS_VERSION_PATCH macro is not defined!"); +#endif + + static_assert(KOKKOSKERNELS_VERSION == (KOKKOSKERNELS_VERSION_MAJOR*10000 + KOKKOSKERNELS_VERSION_MINOR*100 + KOKKOSKERNELS_VERSION_PATCH)); +} + +TEST_F(TestCategory, common_version) { + test_version_info(); +} + +#endif // TEST_COMMON_VERSION_HPP From c9e631b61f1d716653f9ad45b38c664922d72726 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 9 Feb 2023 11:12:40 -0700 Subject: [PATCH 051/442] Version: applying clang-format --- common/unit_test/Test_Common_Version.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common/unit_test/Test_Common_Version.hpp b/common/unit_test/Test_Common_Version.hpp index f09f1709d0..cb5265cfef 100644 --- a/common/unit_test/Test_Common_Version.hpp +++ b/common/unit_test/Test_Common_Version.hpp @@ -42,11 +42,11 @@ void test_version_info() { static_assert(false, "KOKKOSKERNELS_VERSION_PATCH macro is not defined!"); #endif - static_assert(KOKKOSKERNELS_VERSION == (KOKKOSKERNELS_VERSION_MAJOR*10000 + KOKKOSKERNELS_VERSION_MINOR*100 + KOKKOSKERNELS_VERSION_PATCH)); + static_assert(KOKKOSKERNELS_VERSION == (KOKKOSKERNELS_VERSION_MAJOR * 10000 + 
+ KOKKOSKERNELS_VERSION_MINOR * 100 + + KOKKOSKERNELS_VERSION_PATCH)); } -TEST_F(TestCategory, common_version) { - test_version_info(); -} +TEST_F(TestCategory, common_version) { test_version_info(); } -#endif // TEST_COMMON_VERSION_HPP +#endif // TEST_COMMON_VERSION_HPP From d3ffe8214f9f1b5287c093fc98ed1d917ef74e43 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 6 Feb 2023 17:13:01 -0700 Subject: [PATCH 052/442] Perf Tests: adding utilities and instantiation wrapper The goal of this work is to create a common core infrastructure for the performance tests in order to simplify maintenance. Here two ideas are introduced: 1. the instantiation wrapper 2. the common input parser Both are trying to capture some of the implementation of our performance tests in generic functions that can be called instead of duplicating logic around instantiation and command line input parsing. The new parsing routine checks the parameter name and that the associated value can be cast properly. It also adds some logic to remove the arguments from argv and argc once they are parsed properly. --- .../KokkosKernels_perf_test_instantiation.hpp | 133 +++++++++ .../KokkosKernels_perf_test_utilities.hpp | 146 ++++++++++ perf_test/sparse/KokkosSparse_mdf.cpp | 268 ++++++------------ 3 files changed, 364 insertions(+), 183 deletions(-) create mode 100644 perf_test/KokkosKernels_perf_test_instantiation.hpp create mode 100644 perf_test/KokkosKernels_perf_test_utilities.hpp diff --git a/perf_test/KokkosKernels_perf_test_instantiation.hpp b/perf_test/KokkosKernels_perf_test_instantiation.hpp new file mode 100644 index 0000000000..9ed5ec23bc --- /dev/null +++ b/perf_test/KokkosKernels_perf_test_instantiation.hpp @@ -0,0 +1,133 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +// +// Created by Berger-Vergiat, Luc on 2/6/23. +// + +#ifndef KOKKOSKERNELS_PERF_TEST_INSTANTIATION_HPP +#define KOKKOSKERNELS_PERF_TEST_INSTANTIATION_HPP + +#include "KokkosKernels_perf_test_utilities.hpp" + +#ifndef KOKKOSKERNELS_PERF_TEST_NAME +#error "The macro KOKKOSKERNELS_PERF_TEST_NAME was not defined" +#endif + +int main_instantiation(int argc, char** argv) { + perf_test::CommonInputParams params; + perf_test::parse_common_options(argc, argv, params); + + /* Assumption is that use_openmp/use_threads variables are */ + /* provided as numbers of threads */ + int num_threads = 1; + if (params.use_openmp) { + num_threads = params.use_openmp; + } else if (params.use_threads) { + num_threads = params.use_threads; + } + + int device_id = 0; + if (params.use_cuda) + device_id = params.use_cuda - 1; + else if (params.use_hip) + device_id = params.use_hip - 1; + else if (params.use_sycl) + device_id = params.use_sycl - 1; + + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); + Kokkos::print_configuration(std::cout); + std::cout << '\n'; + + bool ran = false; + + if (params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + std::cout << "Running on OpenMP backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + std::cout << "Running on Threads backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: Threads requested, 
but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + std::cout << "Running on Cuda backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + std::cout << "Running on HIP backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); + ran = true; +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + std::cout << "Running on SYCL backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, + params); + ran = true; +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + Kokkos::finalize(); + return 1; +#endif + } + if (!ran) { +#if defined(KOKKOS_ENABLE_SERIAL) + std::cout << "Running on Serial backend.\n"; + KOKKOSKERNELS_PERF_TEST_NAME(argc, argv, params); +#else + std::cout << "ERROR: Tried to run on Serial device (as no parallel" + " backends requested), but Serial is not enabled.\n"; + Kokkos::finalize(); + return 1; +#endif + } + Kokkos::finalize(); + return 0; +} + +// Undefine the macro to avoid potential bad interaction +// with other parts of the code... +#undef KOKKOSKERNELS_PERF_TEST_NAME + +#endif // KOKKOSKERNELS_PERF_TEST_INSTANTIATION_HPP diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp new file mode 100644 index 0000000000..b798d55a8e --- /dev/null +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -0,0 +1,146 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +// +// Created by Berger-Vergiat, Luc on 2/6/23. +// + +#ifndef KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP +#define KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP + +// Namespace that defines common utilities +// for performance tests +namespace perf_test { + +struct CommonInputParams { + int use_cuda = 0; + int use_hip = 0; + int use_sycl = 0; + int use_openmp = 0; + int use_threads = 0; +}; + +std::string list_common_options() { + std::ostringstream common_options; + common_options + << "\t[Required] BACKEND:\n" + << "\t\t'--threads [numThreads]' |\n" + << "\t\t'--openmp [numThreads]' |\n" + << "\t\t'--cuda [deviceIndex]' |\n" + << "\t\t'--hip [deviceIndex]' |\n" + << "\t\t'--sycl [deviceIndex]'\n\n" + << "\tIf no parallel backend is requested, Serial will be used " + "(if enabled)\n\n"; + + return common_options.str(); +} + +void process_arg_int(char const* str_val, int& val) { + errno = 0; + char* ptr_end; + val = std::strtol(str_val, &ptr_end, 10); + + if (str_val == ptr_end) { + std::stringstream ss; + ss << "Error: cannot convert command line argument '" << str_val + << "' to an integer.\n"; + throw std::invalid_argument(ss.str()); + } + + if (errno == ERANGE) { + std::stringstream ss; + ss << "Error: converted value for command line argument '" << str_val + << "' falls out of range.\n"; + throw std::invalid_argument(ss.str()); + } +} + +bool check_arg_int(int const i, int const argc, char** argv, char const* name, + int& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + + if (i < argc - 1) { + process_arg_int(argv[i + 1], val); + } else { + std::stringstream msg; + msg << name << " input argument needs to be followed by an int"; + throw std::invalid_argument(msg.str()); + } + 
return true; +} + +bool check_arg_bool(int const i, int const /*argc*/, char** argv, + char const* name, bool& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + val = true; + return true; +} + +bool check_arg_str(int const i, int const argc, char** argv, char const* name, + std::string& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + + if (i < argc - 1) { + val = std::string(argv[i + 1]); + } else { + std::stringstream msg; + msg << name << " input argument needs to be followed by a string"; + throw std::invalid_argument(msg.str()); + } + return true; +} + +void parse_common_options(int& argc, char** argv, CommonInputParams& params) { + // Skip the program name, start with argIdx=1 + int argIdx = 1; + while (argIdx < argc) { + bool remove_flag = false; + if (check_arg_int(argIdx, argc, argv, "--threads", params.use_threads)) { + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--openmp", + params.use_openmp)) { + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--cuda", params.use_cuda)) { + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--hip", params.use_hip)) { + remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--sycl", params.use_sycl)) { + remove_flag = true; + } + + if (remove_flag) { + // Shift the remainder of the argv list by one. Note that argv has + // (argc + 1) arguments, the last one always being nullptr. 
The following + // loop moves the trailing nullptr element as well + for (int k = argIdx; k < argc - 1; ++k) { + argv[k] = argv[k + 2]; + argv[k + 1] = argv[k + 3]; + } + argc = argc - 2; + } else { + ++argIdx; + } + } +} // parse_common_options() + +} // namespace perf_test + +#endif // KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP diff --git a/perf_test/sparse/KokkosSparse_mdf.cpp b/perf_test/sparse/KokkosSparse_mdf.cpp index ca48df8fd2..8f1ddc4e14 100644 --- a/perf_test/sparse/KokkosSparse_mdf.cpp +++ b/perf_test/sparse/KokkosSparse_mdf.cpp @@ -1,4 +1,4 @@ -//@HEADER +//@HEADERA // ************************************************************************ // // Kokkos v. 4.0 @@ -19,15 +19,14 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" -#include "KokkosSparse_mdf.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include "KokkosSparse_mdf.hpp" + +using perf_test::CommonInputParams; -struct Params { - int use_cuda = 0; - int use_hip = 0; - int use_sycl = 0; - int use_openmp = 0; - int use_threads = 0; +struct LocalParams { std::string amtx; int m = 10000; int n = 10000; @@ -54,8 +53,61 @@ struct diag_generator_functor { } }; -template -void run_experiment(const Params& params) { +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --amtx :: input matrix" << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + "MDF" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\nSettings for randomly generated A matrix" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; + std::cerr + << "\t[Optional] --nnz :: number of entries per row to generate" + << 
std::endl; + std::cerr << "\t[Optional] --diag :: generate a diagonal matrix" + << std::endl; +} // print_options + +int parse_inputs(LocalParams& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--nnz", + params.nnzPerRow)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--diag", + params.diag)) { + } else if (perf_test::check_arg_int(i, argc, argv, "--repeat", + params.repeat)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +template +void run_experiment(int argc, char** argv, CommonInputParams /*params*/) { + using crsMat_t = + KokkosSparse::CrsMatrix; using size_type = typename crsMat_t::size_type; using lno_t = typename crsMat_t::ordinal_type; using scalar_t = typename crsMat_t::value_type; @@ -67,19 +119,22 @@ void run_experiment(const Params& params) { using entries_t = typename graph_t::entries_type::non_const_type; using values_t = typename crsMat_t::values_type::non_const_type; + LocalParams localParams; + parse_inputs(localParams, argc, argv); + std::cout << "************************************* \n"; std::cout << "************************************* \n"; crsMat_t A; - lno_t m = params.m; - lno_t n = params.n; - if (params.amtx.length()) { - std::cout << "Loading A from " << params.amtx << '\n'; + lno_t m = localParams.m; + lno_t n = localParams.n; + if (localParams.amtx.length()) { + std::cout << "Loading A from " << localParams.amtx << '\n'; A = KokkosSparse::Impl::read_kokkos_crst_matrix( - 
params.amtx.c_str()); + localParams.amtx.c_str()); m = A.numRows(); n = A.numCols(); } else { - if (params.diag) { + if (localParams.diag) { std::cout << "Randomly generating diag matrix\n"; rowmap_t rowmapA("A row map", m + 1); entries_t entriesA("A entries", m); @@ -100,13 +155,13 @@ void run_experiment(const Params& params) { A = crsMat_t("A matrix", m, valuesA, graph); } else { std::cout << "Randomly generating matrix\n"; - size_type nnzUnused = m * params.nnzPerRow; + size_type nnzUnused = m * localParams.nnzPerRow; A = KokkosSparse::Impl::kk_generate_sparse_matrix( m, n, nnzUnused, 0, (n + 3) / 3); } } - if (params.verbose) { + if (localParams.verbose) { std::cout << "Matrix A" << std::endl; std::cout << " row_map A:" << std::endl; KokkosKernels::Impl::print_1Dview(A.graph.row_map); @@ -125,9 +180,12 @@ void run_experiment(const Params& params) { timer.reset(); KokkosSparse::Experimental::MDF_handle handle(A); handle.set_verbosity(0); + if (localParams.verbose) { + handle.set_verbosity(1); + } handleTime += timer.seconds(); - for (int sumRep = 0; sumRep < params.repeat; sumRep++) { + for (int sumRep = 0; sumRep < localParams.repeat; sumRep++) { timer.reset(); KokkosSparse::Experimental::mdf_symbolic(A, handle); Kokkos::fence(); @@ -140,16 +198,16 @@ void run_experiment(const Params& params) { } std::cout << "Mean total time: " - << handleTime + (symbolicTime / params.repeat) + - (numericTime / params.repeat) + << handleTime + (symbolicTime / localParams.repeat) + + (numericTime / localParams.repeat) << std::endl << "Handle time: " << handleTime << std::endl - << "Mean symbolic time: " << (symbolicTime / params.repeat) + << "Mean symbolic time: " << (symbolicTime / localParams.repeat) << std::endl - << "Mean numeric time: " << (numericTime / params.repeat) + << "Mean numeric time: " << (numericTime / localParams.repeat) << std::endl; - if (params.verbose) { + if (localParams.verbose) { entries_t permutation = handle.get_permutation(); std::cout << "MDF 
permutation:" << std::endl; @@ -157,164 +215,8 @@ void run_experiment(const Params& params) { } } // run_experiment -void print_options() { - std::cerr << "Options\n" << std::endl; - - std::cerr - << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp " - "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" - " | '--sycl [syclDeviceIndex]'" - << std::endl; - - std::cerr << "\t[Optional] --amtx :: input matrix" << std::endl; - std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " - "MDF" - << std::endl; - std::cerr << "\t[Optional] --verbose :: enable verbose output" - << std::endl; - std::cerr << "\nSettings for randomly generated A matrix" << std::endl; - std::cerr << "\t[Optional] --m :: number of rows to generate" - << std::endl; - std::cerr << "\t[Optional] --n :: number of cols to generate" - << std::endl; - std::cerr - << "\t[Optional] --nnz :: number of entries per row to generate" - << std::endl; - std::cerr << "\t[Optional] --diag :: generate a diagonal matrix" - << std::endl; -} // print_options - -int parse_inputs(Params& params, int argc, char** argv) { - for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { - params.use_sycl = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { - params.amtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { - params.m = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { - params.n = atoi(argv[++i]); - } 
else if (0 == Test::string_compare_no_case(argv[i], "--nnz")) { - params.nnzPerRow = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--diag")) { - params.diag = true; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { - params.repeat = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { - params.verbose = true; - } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; - print_options(); - return 1; - } - } - return 0; -} // parse_inputs - +#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment +#include "KokkosKernels_perf_test_instantiation.hpp" int main(int argc, char** argv) { - Params params; - - if (parse_inputs(params, argc, argv)) { - return 1; - } - const int num_threads = - std::max(params.use_openmp, - params.use_threads); // Assumption is that use_openmp variable - // is provided as number of threads - - // If cuda, hip or sycl is used, set device_id - int device_id = 0; - if (params.use_cuda > 0) { - device_id = params.use_cuda - 1; - } - if (params.use_hip > 0) { - device_id = params.use_hip - 1; - } - if (params.use_sycl > 0) { - device_id = params.use_sycl - 1; - } - - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - - bool useOMP = params.use_openmp != 0; - bool useThreads = params.use_threads != 0; - bool useCUDA = params.use_cuda != 0; - bool useHIP = params.use_hip != 0; - bool useSYCL = params.use_sycl != 0; - bool useSerial = !useOMP && !useCUDA && !useHIP && !useSYCL; - - if (useOMP) { -#if defined(KOKKOS_ENABLE_OPENMP) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: OpenMP requested, but not available.\n"; - return 1; -#endif - } - if (useThreads) { -#if defined(KOKKOS_ENABLE_THREADS) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: OpenMP 
requested, but not available.\n"; - return 1; -#endif - } - if (useCUDA) { -#if defined(KOKKOS_ENABLE_CUDA) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: CUDA requested, but not available.\n"; - return 1; -#endif - } - if (useHIP) { -#if defined(KOKKOS_ENABLE_HIP) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: HIP requested, but not available.\n"; - return 1; -#endif - } - if (useSYCL) { -#if defined(KOKKOS_ENABLE_SYCL) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: SYCL requested, but not available.\n"; - return 1; -#endif - } - if (useSerial) { -#if defined(KOKKOS_ENABLE_SERIAL) - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: Serial device requested, but not available.\n"; - return 1; -#endif - } - Kokkos::finalize(); - return 0; + return main_instantiation(argc, argv); } // main From b0965b7d41456447a6f9fa20b813cf747fa11be8 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Fri, 10 Feb 2023 12:12:30 -0700 Subject: [PATCH 053/442] Spgemm non-reuse: unification layer and TPLs (#1678) * Add unification layer, tests for non-reuse SpGEMM This version has a simpler interface (A, B) -> C, so the user doesn't have to manage a handle. The default algorithm (SPGEMM_KK) is always used. The native implementation just calls symbolic and then numeric. * Add cusparse 11.0+ spgemm noreuse wrapper * Fix unused local typedef warning/error * Add cusparse 10.x spgemm noreuse wrapper * Formatting * Remove pointless no-reuse spgemm wrapper for cusparse 10, rocsparse For these versions, the no-reuse wrapper would be identical to the symbolic wrapper plus the numeric wrapper, so just call those. 
* Add MKL non-reuse spgemm wrapper * Formatting * Don't try to call 10 spgemm noreuse from cusparse --- sparse/CMakeLists.txt | 7 + ...Sparse_spgemm_noreuse_eti_spec_inst.cpp.in | 26 ++ ...parse_spgemm_noreuse_eti_spec_avail.hpp.in | 24 ++ ...Sparse_spgemm_noreuse_eti_spec_decl.hpp.in | 24 ++ .../impl/KokkosSparse_spgemm_noreuse_spec.hpp | 163 ++++++++++ sparse/src/KokkosSparse_spgemm.hpp | 67 ++++- ...osSparse_spgemm_noreuse_tpl_spec_avail.hpp | 93 ++++++ ...kosSparse_spgemm_noreuse_tpl_spec_decl.hpp | 279 ++++++++++++++++++ .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 2 +- sparse/unit_test/Test_Sparse_spgemm.hpp | 102 +++++-- 10 files changed, 745 insertions(+), 42 deletions(-) create mode 100644 sparse/eti/generated_specializations_cpp/spgemm_noreuse/KokkosSparse_spgemm_noreuse_eti_spec_inst.cpp.in create mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_avail.hpp.in create mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in create mode 100644 sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp create mode 100644 sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp create mode 100644 sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp diff --git a/sparse/CMakeLists.txt b/sparse/CMakeLists.txt index cdfda0c614..97076655f7 100644 --- a/sparse/CMakeLists.txt +++ b/sparse/CMakeLists.txt @@ -81,6 +81,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_numeric spgemm_numeric TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_noreuse spgemm_noreuse + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_bspgemm_numeric bspgemm_numeric COMPONENTS sparse HEADER_LIST ETI_HEADERS diff --git a/sparse/eti/generated_specializations_cpp/spgemm_noreuse/KokkosSparse_spgemm_noreuse_eti_spec_inst.cpp.in 
b/sparse/eti/generated_specializations_cpp/spgemm_noreuse/KokkosSparse_spgemm_noreuse_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..85459ac139 --- /dev/null +++ b/sparse/eti/generated_specializations_cpp/spgemm_noreuse/KokkosSparse_spgemm_noreuse_eti_spec_inst.cpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosSparse_spgemm_noreuse_spec.hpp" +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NOREUSE_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_avail.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..a75af70608 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_avail.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_AVAIL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NOREUSE_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..2ca1ecf07b --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NOREUSE_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp new file mode 100644 index 0000000000..352e3384ac --- /dev/null +++ b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp @@ -0,0 +1,163 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSSPARSE_IMPL_SPGEMM_NOREUSE_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_SPGEMM_NOREUSE_SPEC_HPP_ + +#include + +#include +#include "KokkosSparse_CrsMatrix.hpp" +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosKernels_Handle.hpp" +#include "KokkosSparse_spgemm_symbolic.hpp" +#include "KokkosSparse_spgemm_numeric.hpp" +#endif + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct spgemm_noreuse_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct spgemm_noreuse_eti_spec_avail< \ + KokkosSparse::CrsMatrix, \ + void, OFFSET_TYPE>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>> { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosSparse { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosSparse::spgemm (sparse matrix - sparse +/// matrix multiply) for the non-reuse interface: takes A and B and +/// returns the product C, without a user-managed handle. 
+ +template ::value, + bool eti_spec_avail = + spgemm_noreuse_eti_spec_avail::value> +struct SPGEMM_NOREUSE { + static CMatrix spgemm_noreuse(const AMatrix& A, bool transA, const BMatrix& B, + bool transB); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +// Unification layer +template +struct SPGEMM_NOREUSE { + static CMatrix spgemm_noreuse(const AMatrix& A, bool transA, const BMatrix& B, + bool transB) { + using device_t = typename CMatrix::device_type; + using scalar_t = typename CMatrix::value_type; + using ordinal_t = typename CMatrix::ordinal_type; + using size_type = typename CMatrix::size_type; + using c_rowmap_t = typename CMatrix::row_map_type::non_const_type; + using c_entries_t = typename CMatrix::index_type::non_const_type; + using c_values_t = typename CMatrix::values_type::non_const_type; + KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, ordinal_t, scalar_t, typename device_t::execution_space, + typename device_t::memory_space, typename device_t::memory_space> + kh; + kh.create_spgemm_handle(); + // A is m*n, B is n*k, C is m*k + ordinal_t m = A.numRows(); + ordinal_t n = B.numRows(); + ordinal_t k = B.numCols(); + c_rowmap_t row_mapC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C rowmap"), m + 1); + KokkosSparse::Experimental::spgemm_symbolic( + &kh, m, n, k, A.graph.row_map, A.graph.entries, transA, B.graph.row_map, + B.graph.entries, transB, row_mapC); + size_type c_nnz = kh.get_spgemm_handle()->get_c_nnz(); + c_entries_t entriesC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C entries"), c_nnz); + c_values_t valuesC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C values"), c_nnz); + KokkosSparse::Experimental::spgemm_numeric( + &kh, m, n, k, A.graph.row_map, A.graph.entries, A.values, transA, + B.graph.row_map, B.graph.entries, B.values, transB, row_mapC, entriesC, + valuesC); + kh.destroy_spgemm_handle(); + return CMatrix("C", m, k, c_nnz, valuesC, row_mapC, entriesC); + } +}; 
+ +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SPGEMM_NOREUSE< \ + KokkosSparse::CrsMatrix, \ + void, OFFSET_TYPE>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + false, true>; + +#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SPGEMM_NOREUSE< \ + KokkosSparse::CrsMatrix, \ + void, OFFSET_TYPE>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const OFFSET_TYPE>, \ + false, true>; + +#include +#include + +#endif // KOKKOSSPARSE_IMPL_SPGEMM_NOREUSE_SPEC_HPP_ diff --git a/sparse/src/KokkosSparse_spgemm.hpp b/sparse/src/KokkosSparse_spgemm.hpp index 0cac811a4c..11cb58ed4a 100644 --- a/sparse/src/KokkosSparse_spgemm.hpp +++ b/sparse/src/KokkosSparse_spgemm.hpp @@ -19,6 +19,7 @@ #include "KokkosSparse_spgemm_numeric.hpp" #include "KokkosSparse_spgemm_symbolic.hpp" #include "KokkosSparse_spgemm_jacobi.hpp" +#include "KokkosSparse_spgemm_noreuse_spec.hpp" namespace KokkosSparse { @@ -125,20 +126,58 @@ void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, template CMatrix spgemm(const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode) { - using device_t = typename CMatrix::device_type; - using scalar_t = typename CMatrix::value_type; - using ordinal_t = typename CMatrix::ordinal_type; - using size_type = typename CMatrix::size_type; - using KKH = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, ordinal_t, scalar_t, typename device_t::execution_space, - typename device_t::memory_space, typename device_t::memory_space>; - KKH kh; - 
kh.create_spgemm_handle(); - CMatrix C; - spgemm_symbolic(kh, A, Amode, B, Bmode, C); - spgemm_numeric(kh, A, Amode, B, Bmode, C); - kh.destroy_spgemm_handle(); - return C; + // Canonicalize the matrix types: + // - Make A,B have const values and entries. + // - Make all views in A,B unmanaged, but otherwise default memory traits + // - C must have managed memory since its views are allocated in this + // function + using AMatrix_Internal = KokkosSparse::CrsMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type>; + using BMatrix_Internal = KokkosSparse::CrsMatrix< + typename BMatrix::const_value_type, typename BMatrix::const_ordinal_type, + typename BMatrix::device_type, Kokkos::MemoryTraits, + typename BMatrix::const_size_type>; + using CMatrix_Internal = + KokkosSparse::CrsMatrix; + // Check now that A, B dimensions are compatible to multiply + auto opACols = Amode ? A.numRows() : A.numCols(); + auto opBRows = Bmode ? B.numCols() : B.numRows(); + if (Amode || Bmode) + throw std::invalid_argument( + "KokkosSparse::spgemm: transposing A and/or B is not yet supported"); + if (opACols != opBRows) + throw std::invalid_argument( + "KokkosSparse::spgemm: op(A) and op(B) have incompatible dimensions " + "for multiplication"); + // Make sure C has managed memory. If its memory traits are void (default), + // then that also means it's managed. + if constexpr (!std::is_same::value) { + if (CMatrix::memory_traits::is_unmanaged) + throw std::invalid_argument( + "KokkosSparse::spgemm: C must not have the Unmanaged memory trait, " + "because spgemm needs to allocate its Views"); + } + AMatrix_Internal A_internal(A); + BMatrix_Internal B_internal(B); + // Intercept empty C case here so that TPL wrappers don't have to deal with it + if (!A.numRows() || !A.numCols() || !B.numCols() || !A.nnz() || !B.nnz()) { + auto Crows = Amode ? 
A.numCols() : A.numRows(); + auto Ccols = Bmode ? B.numRows() : B.numCols(); + typename CMatrix::row_map_type::non_const_type row_mapC("C rowmap", + Crows + 1); + typename CMatrix::index_type entriesC; + typename CMatrix::values_type valuesC; + return CMatrix("C", Crows, Ccols, 0, valuesC, row_mapC, entriesC); + } + return CMatrix(KokkosSparse::Impl::SPGEMM_NOREUSE< + CMatrix_Internal, AMatrix_Internal, + BMatrix_Internal>::spgemm_noreuse(A_internal, Amode, + B_internal, Bmode)); } } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp new file mode 100644 index 0000000000..81d3273e17 --- /dev/null +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp @@ -0,0 +1,93 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +// Specialization struct which defines whether a specialization exists +template +struct spgemm_noreuse_tpl_spec_avail { + enum : bool { value = false }; +}; + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUDA_VERSION >= 11000) +// For cuSparse 11 and up, use the non-reuse generic interface. +// But for cuSparse 10, there is only one interface +// so just let KokkosSparse::spgemm call the symbolic and numeric wrappers. 
+ +#define SPGEMM_NOREUSE_AVAIL_CUSPARSE(SCALAR, MEMSPACE) \ + template <> \ + struct spgemm_noreuse_tpl_spec_avail< \ + KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int>, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>> { \ + enum : bool { value = true }; \ + }; + +#define SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(SCALAR) \ + SPGEMM_NOREUSE_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \ + SPGEMM_NOREUSE_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace) + +SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(float) +SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(double) +SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(Kokkos::complex) +SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(Kokkos::complex) + +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#define SPGEMM_NOREUSE_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_noreuse_tpl_spec_avail< \ + KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int>, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>> { \ + enum : bool { value = true }; \ + }; + +#define SPGEMM_NOREUSE_AVAIL_MKL_E(EXEC) \ + SPGEMM_NOREUSE_AVAIL_MKL(float, EXEC) \ + SPGEMM_NOREUSE_AVAIL_MKL(double, EXEC) \ + SPGEMM_NOREUSE_AVAIL_MKL(Kokkos::complex, EXEC) \ + SPGEMM_NOREUSE_AVAIL_MKL(Kokkos::complex, EXEC) + +#ifdef KOKKOS_ENABLE_SERIAL +SPGEMM_NOREUSE_AVAIL_MKL_E(Kokkos::Serial) +#endif +#ifdef KOKKOS_ENABLE_OPENMP +SPGEMM_NOREUSE_AVAIL_MKL_E(Kokkos::OpenMP) +#endif +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp new file mode 100644 index 0000000000..f3d32a01fb --- /dev/null +++ 
b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp @@ -0,0 +1,279 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_DECL_HPP_ + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#include "cusparse.h" +#include "KokkosSparse_Utils_cusparse.hpp" +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "KokkosSparse_Utils_mkl.hpp" +#include "mkl_spblas.h" +#endif + +namespace KokkosSparse { +namespace Impl { + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUDA_VERSION >= 11000) + +template +Matrix spgemm_noreuse_cusparse(const MatrixConst &A, const MatrixConst &B) { + using Scalar = typename Matrix::value_type; + cudaDataType cudaScalarType = Impl::cuda_data_type_from(); + KokkosKernels::Experimental::Controls kkControls; + cusparseHandle_t cusparseHandle = kkControls.getCusparseHandle(); + cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMatDescr_t descr_A, descr_B, descr_C; + cusparseSpGEMMDescr_t spgemmDescr; + cusparseSpGEMMAlg_t alg = CUSPARSE_SPGEMM_DEFAULT; + size_t bufferSize1 = 0, bufferSize2 = 0; + void *buffer1 = nullptr, *buffer2 = nullptr; + // A is m*n, B is n*k, C is m*k + int m = A.numRows(); + int n = B.numRows(); + int k = B.numCols(); + const auto alpha = Kokkos::ArithTraits::one(); + const auto beta = Kokkos::ArithTraits::zero(); + typename Matrix::row_map_type::non_const_type row_mapC( + 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "C rowmap"), m + 1); + + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_createDescr(&spgemmDescr)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &descr_A, m, n, A.graph.entries.extent(0), (void *)A.graph.row_map.data(), + (void *)A.graph.entries.data(), (void *)A.values.data(), + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + cudaScalarType)); + + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &descr_B, n, k, B.graph.entries.extent(0), (void *)B.graph.row_map.data(), + (void *)B.graph.entries.data(), (void *)B.values.data(), + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + cudaScalarType)); + + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCreateCsr(&descr_C, m, k, 0, (void *)row_mapC.data(), nullptr, + nullptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, cudaScalarType)); + + //---------------------------------------------------------------------- + // query workEstimation buffer size, allocate, then call again with buffer. + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_workEstimation( + cusparseHandle, op, op, &alpha, descr_A, descr_B, &beta, descr_C, + cudaScalarType, alg, spgemmDescr, &bufferSize1, nullptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&buffer1, bufferSize1)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_workEstimation( + cusparseHandle, op, op, &alpha, descr_A, descr_B, &beta, descr_C, + cudaScalarType, alg, spgemmDescr, &bufferSize1, buffer1)); + + //---------------------------------------------------------------------- + // query compute buffer size, allocate, then call again with buffer. 
+ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute( + cusparseHandle, op, op, &alpha, descr_A, descr_B, &beta, descr_C, + cudaScalarType, alg, spgemmDescr, &bufferSize2, nullptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&buffer2, bufferSize2)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute( + cusparseHandle, op, op, &alpha, descr_A, descr_B, &beta, descr_C, + cudaScalarType, alg, spgemmDescr, &bufferSize2, buffer2)); + int64_t unused1, unused2, c_nnz; + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSpMatGetSize(descr_C, &unused1, &unused2, &c_nnz)); + + typename Matrix::index_type entriesC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C entries"), c_nnz); + typename Matrix::values_type valuesC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C values"), c_nnz); + + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCsrSetPointers(descr_C, (void *)row_mapC.data(), + (void *)entriesC.data(), (void *)valuesC.data())); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_compute( + cusparseHandle, op, op, &alpha, descr_A, descr_B, &beta, descr_C, + cudaScalarType, alg, spgemmDescr, &bufferSize2, buffer2)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSpGEMM_copy(cusparseHandle, op, op, &alpha, descr_A, descr_B, + &beta, descr_C, cudaScalarType, alg, spgemmDescr)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(descr_A)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(descr_B)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(descr_C)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpGEMM_destroyDescr(spgemmDescr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer1)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer2)); + return Matrix("C", m, k, c_nnz, valuesC, row_mapC, entriesC); +} + +#define SPGEMM_NOREUSE_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL) \ + template <> \ + struct SPGEMM_NOREUSE< \ + KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int>, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + 
KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + true, TPL_AVAIL> { \ + using Matrix = KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int>; \ + using ConstMatrix = KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>; \ + static KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int> \ + spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ + std::string label = "KokkosSparse::spgemm_noreuse[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + Matrix C = spgemm_noreuse_cusparse(A, B); \ + Kokkos::Profiling::popRegion(); \ + return C; \ + } \ + }; + +#define SPGEMM_NOREUSE_DECL_CUSPARSE_S(SCALAR, TPL_AVAIL) \ + SPGEMM_NOREUSE_DECL_CUSPARSE(SCALAR, Kokkos::CudaSpace, TPL_AVAIL) \ + SPGEMM_NOREUSE_DECL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace, TPL_AVAIL) + +SPGEMM_NOREUSE_DECL_CUSPARSE_S(float, true) +SPGEMM_NOREUSE_DECL_CUSPARSE_S(double, true) +SPGEMM_NOREUSE_DECL_CUSPARSE_S(Kokkos::complex, true) +SPGEMM_NOREUSE_DECL_CUSPARSE_S(Kokkos::complex, true) + +SPGEMM_NOREUSE_DECL_CUSPARSE_S(float, false) +SPGEMM_NOREUSE_DECL_CUSPARSE_S(double, false) +SPGEMM_NOREUSE_DECL_CUSPARSE_S(Kokkos::complex, false) +SPGEMM_NOREUSE_DECL_CUSPARSE_S(Kokkos::complex, false) + +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +template +Matrix spgemm_noreuse_mkl(const MatrixConst &A, const MatrixConst &B) { + using size_type = typename Matrix::non_const_size_type; + using index_type = typename Matrix::non_const_ordinal_type; + using scalar_type = typename Matrix::non_const_value_type; + using ExecSpace = typename Matrix::execution_space; + using MKLMatrix = MKLSparseMatrix; + auto m = A.numRows(); + auto n = A.numCols(); + auto k = B.numCols(); + MKLMatrix Amkl(m, n, const_cast(A.graph.row_map.data()), + const_cast(A.graph.entries.data()), + const_cast(A.values.data())); + 
MKLMatrix Bmkl(n, k, const_cast(B.graph.row_map.data()), + const_cast(B.graph.entries.data()), + const_cast(B.values.data())); + sparse_matrix_t C; + matrix_descr generalDescr; + generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; + generalDescr.mode = SPARSE_FILL_MODE_FULL; + generalDescr.diag = SPARSE_DIAG_NON_UNIT; + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_spmm(SPARSE_OPERATION_NON_TRANSPOSE, Amkl, Bmkl, &C)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_order(C)); + MKLMatrix wrappedC(C); + MKL_INT nrows = 0, ncols = 0; + MKL_INT *rowmapRaw = nullptr; + MKL_INT *entriesRaw = nullptr; + scalar_type *valuesRaw = nullptr; + wrappedC.export_data(nrows, ncols, rowmapRaw, entriesRaw, valuesRaw); + if (nrows != m || ncols != k) + throw std::runtime_error( + "KokkosSparse::spgemm: matrix returned by MKL has incorrect " + "dimensions"); + MKL_INT c_nnz = rowmapRaw[m]; + Kokkos::View> + rowmapRawView(rowmapRaw, m + 1); + Kokkos::View> + entriesRawView(entriesRaw, c_nnz); + Kokkos::View> + valuesRawView(valuesRaw, c_nnz); + + typename Matrix::row_map_type::non_const_type row_mapC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C rowmap"), m + 1); + typename Matrix::index_type entriesC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C entries"), c_nnz); + typename Matrix::values_type valuesC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C values"), c_nnz); + + Kokkos::deep_copy(ExecSpace(), row_mapC, rowmapRawView); + Kokkos::deep_copy(ExecSpace(), entriesC, entriesRawView); + Kokkos::deep_copy(ExecSpace(), valuesC, valuesRawView); + // Now, done with the copy of C owned by MKL + wrappedC.destroy(); + return Matrix("C", m, k, c_nnz, valuesC, row_mapC, entriesC); +} + +#define SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ + template <> \ + struct SPGEMM_NOREUSE< \ + KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int>, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + 
KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + true, TPL_AVAIL> { \ + using Matrix = KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int>; \ + using ConstMatrix = KokkosSparse::CrsMatrix< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int>; \ + static KokkosSparse::CrsMatrix< \ + SCALAR, int, Kokkos::Device, void, int> \ + spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ + std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + Matrix C = spgemm_noreuse_mkl(A, B); \ + Kokkos::Profiling::popRegion(); \ + return C; \ + } \ + }; + +#define SPGEMM_NOREUSE_DECL_MKL_SE(SCALAR, EXEC) \ + SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, true) \ + SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, false) + +#define SPGEMM_NOREUSE_DECL_MKL_E(EXEC) \ + SPGEMM_NOREUSE_DECL_MKL_SE(float, EXEC) \ + SPGEMM_NOREUSE_DECL_MKL_SE(double, EXEC) \ + SPGEMM_NOREUSE_DECL_MKL_SE(Kokkos::complex, EXEC) \ + SPGEMM_NOREUSE_DECL_MKL_SE(Kokkos::complex, EXEC) + +#ifdef KOKKOS_ENABLE_SERIAL +SPGEMM_NOREUSE_DECL_MKL_E(Kokkos::Serial) +#endif +#ifdef KOKKOS_ENABLE_OPENMP +SPGEMM_NOREUSE_DECL_MKL_E(Kokkos::OpenMP) +#endif +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 3c68195f97..f223ed0e5a 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -35,7 +35,6 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, typename YVector::non_const_value_type const& beta, const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; - using entry_type = typename AMatrix::non_const_ordinal_type; using value_type = typename AMatrix::non_const_value_type; /* initialize 
cusparse library */ @@ -57,6 +56,7 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + using entry_type = typename AMatrix::non_const_ordinal_type; /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ const cusparseIndexType_t myCusparseOffsetType = cusparse_index_type_t_from(); diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index b6480f4f16..4cb225b97d 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -49,6 +49,16 @@ typedef Kokkos::complex kokkos_complex_float; namespace Test { +// 3 ways to call SpGEMM: +// - symbolic/numeric with Views +// - symbolic/numeric with CrsMatrices +// - non-reuse with CrsMatrices +enum spgemm_call_mode { + spgemm_reuse_view, + spgemm_reuse_matrix, + spgemm_noreuse +}; + // Randomize matrix values again from the same uniform distribution as // kk_generate_sparse_matrix uses. 
template @@ -60,6 +70,11 @@ void randomize_matrix_values(const Values &v) { Kokkos::fill_random(v, pool, randStart, randEnd); } +template +void run_spgemm_noreuse(crsMat_t A, crsMat_t B, crsMat_t &C) { + C = KokkosSparse::spgemm(A, false, B, false); +} + template int run_spgemm(crsMat_t A, crsMat_t B, KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &C, @@ -275,7 +290,7 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { template void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, bool oldInterface = false, + lno_t row_size_variance, Test::spgemm_call_mode callMode, bool testReuse = false) { #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) { @@ -313,10 +328,17 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, run_spgemm(A, B, SPGEMM_DEBUG, output_mat2, false); } - std::vector algorithms = { - SPGEMM_KK, SPGEMM_KK_LP, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, - SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ - }; + std::vector algorithms; + if (callMode == spgemm_noreuse) { + // No-reuse interface always uses the default algorithm + algorithms = {SPGEMM_KK}; + } else { + algorithms = { + SPGEMM_KK, SPGEMM_KK_LP, + SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, + SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ + }; + } for (auto spgemm_algorithm : algorithms) { std::string algo = "UNKNOWN"; @@ -337,12 +359,17 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, bool failed = false; int res = 0; try { - if (oldInterface) - res = run_spgemm_old_interface(A, B, spgemm_algorithm, - output_mat, testReuse); - else - res = run_spgemm(A, B, spgemm_algorithm, output_mat, - testReuse); + switch (callMode) { + case spgemm_reuse_view: + res = run_spgemm_old_interface( + A, B, spgemm_algorithm, output_mat, testReuse); + break; + case spgemm_reuse_matrix: + res = run_spgemm(A, B, spgemm_algorithm, output_mat, + testReuse); + break; + case 
spgemm_noreuse: run_spgemm_noreuse(A, B, output_mat); break; + } } catch (const char *message) { EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message; failed = true; @@ -522,22 +549,43 @@ void test_issue402() { #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_spgemm(10000, 8000, 6000, 8000 * 20, \ - 500, 10, false); \ - test_spgemm(10000, 8000, 6000, 8000 * 20, \ - 500, 10, true); \ - test_spgemm(1000, 500, 1600, 1000 * 20, \ - 500, 10, false, true); \ - test_spgemm(1000, 500, 1600, 1000 * 20, \ - 500, 10, true, true); \ - test_spgemm(0, 0, 0, 0, 10, 10, false); \ - test_spgemm(0, 0, 0, 0, 10, 10, true); \ - test_spgemm(0, 12, 5, 0, 10, 0, false); \ - test_spgemm(0, 12, 5, 0, 10, 0, true); \ - test_spgemm(10, 10, 0, 0, 10, 10, false); \ - test_spgemm(10, 10, 0, 0, 10, 10, true); \ - test_spgemm(10, 10, 10, 0, 0, 0, false); \ - test_spgemm(10, 10, 10, 0, 0, 0, true); \ + test_spgemm( \ + 10000, 8000, 6000, 8000 * 20, 500, 10, ::Test::spgemm_reuse_matrix); \ + test_spgemm( \ + 10000, 8000, 6000, 8000 * 20, 500, 10, ::Test::spgemm_reuse_view); \ + test_spgemm( \ + 1000, 500, 1600, 1000 * 20, 500, 10, ::Test::spgemm_reuse_matrix, \ + true); \ + test_spgemm( \ + 1000, 500, 1600, 1000 * 20, 500, 10, ::Test::spgemm_reuse_view, true); \ + test_spgemm(0, 0, 0, 0, 10, 10, \ + ::Test::spgemm_reuse_matrix); \ + test_spgemm(0, 0, 0, 0, 10, 10, \ + ::Test::spgemm_reuse_view); \ + test_spgemm(0, 12, 5, 0, 10, 0, \ + ::Test::spgemm_reuse_matrix); \ + test_spgemm(0, 12, 5, 0, 10, 0, \ + ::Test::spgemm_reuse_view); \ + test_spgemm(10, 10, 0, 0, 10, 10, \ + ::Test::spgemm_reuse_matrix); \ + test_spgemm(10, 10, 0, 0, 10, 10, \ + ::Test::spgemm_reuse_view); \ + test_spgemm(10, 10, 10, 0, 0, 0, \ + ::Test::spgemm_reuse_matrix); \ + test_spgemm(10, 10, 10, 0, 0, 0, \ + ::Test::spgemm_reuse_view); \ + test_spgemm( \ + 10000, 8000, 6000, 8000 * 20, 500, 10, 
::Test::spgemm_noreuse); \ + test_spgemm( \ + 1000, 500, 1600, 1000 * 20, 500, 10, ::Test::spgemm_noreuse); \ + test_spgemm(0, 0, 0, 0, 10, 10, \ + ::Test::spgemm_noreuse); \ + test_spgemm(0, 12, 5, 0, 10, 0, \ + ::Test::spgemm_noreuse); \ + test_spgemm(10, 10, 0, 0, 10, 10, \ + ::Test::spgemm_noreuse); \ + test_spgemm(10, 10, 10, 0, 0, 0, \ + ::Test::spgemm_noreuse); \ test_spgemm_symbolic(true, true); \ test_spgemm_symbolic(false, true); \ test_spgemm_symbolic(true, false); \ From fd8bf8ae4b2ec5d852a6bfec3e0f2e983cd032f7 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 10 Feb 2023 22:16:19 +0000 Subject: [PATCH 054/442] Update perf_test/sparse/KokkosSparse_mdf.cpp Co-authored-by: brian-kelley --- perf_test/sparse/KokkosSparse_mdf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_mdf.cpp b/perf_test/sparse/KokkosSparse_mdf.cpp index 8f1ddc4e14..319a43ae11 100644 --- a/perf_test/sparse/KokkosSparse_mdf.cpp +++ b/perf_test/sparse/KokkosSparse_mdf.cpp @@ -1,4 +1,4 @@ -//@HEADERA +//@HEADER // ************************************************************************ // // Kokkos v. 4.0 From 7f78fceb1b8c0d11cd797463d9654a716cae3679 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 10 Feb 2023 16:05:04 -0700 Subject: [PATCH 055/442] Support alpha and beta in LUPrec::apply --- sparse/src/KokkosSparse_LUPrec.hpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index d288f341e1..4f701ed1dc 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -50,7 +50,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { private: // trsm takes host views CRS _L, _U; - View1d _tmp; + View1d _tmp, _tmp2; mutable KernelHandle _khL; mutable KernelHandle _khU; @@ -58,7 +58,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { //! 
Constructor: template LUPrec(const CRSArg &L, const CRSArg &U) - : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numRows()), _khL(), _khU() { + : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numRows()), _tmp2("LUPrec::_tmp", L.numRows()), _khL(), _khU() { KK_REQUIRE_MSG(L.numRows() == U.numRows(), "LUPrec: L.numRows() != U.numRows()"); @@ -87,16 +87,21 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { // virtual void apply(const Kokkos::View &X, const Kokkos::View &Y, - const char[] = "N", - ScalarType = karith::one(), - ScalarType = karith::zero()) const { + const char transM[] = "N", + ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp + + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], "LUPrec::apply only supports 'N' for transM"); + sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); - sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, Y); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, _tmp2); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); } //@} From 76d9ed4abac968e9900ed9c1fa30296ddad4cd31 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 10 Feb 2023 16:05:28 -0700 Subject: [PATCH 056/442] formatting --- sparse/src/KokkosSparse_LUPrec.hpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index 4f701ed1dc..fddb1f0d68 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -58,7 +58,12 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { //! 
Constructor: template LUPrec(const CRSArg &L, const CRSArg &U) - : _L(L), _U(U), _tmp("LUPrec::_tmp", L.numRows()), _tmp2("LUPrec::_tmp", L.numRows()), _khL(), _khU() { + : _L(L), + _U(U), + _tmp("LUPrec::_tmp", L.numRows()), + _tmp2("LUPrec::_tmp", L.numRows()), + _khL(), + _khU() { KK_REQUIRE_MSG(L.numRows() == U.numRows(), "LUPrec: L.numRows() != U.numRows()"); @@ -93,13 +98,15 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp - KK_REQUIRE_MSG(transM[0] == NoTranspose[0], "LUPrec::apply only supports 'N' for transM"); + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], + "LUPrec::apply only supports 'N' for transM"); sptrsv_symbolic(&_khL, _L.graph.row_map, _L.graph.entries); sptrsv_solve(&_khL, _L.graph.row_map, _L.graph.entries, _L.values, X, _tmp); sptrsv_symbolic(&_khU, _U.graph.row_map, _U.graph.entries); - sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, _tmp2); + sptrsv_solve(&_khU, _U.graph.row_map, _U.graph.entries, _U.values, _tmp, + _tmp2); KokkosBlas::axpby(alpha, _tmp2, beta, Y); } From 0b5bc7a61b10e0d163e6d49cecb75f9a9ccd06f7 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 10 Feb 2023 16:36:32 -0800 Subject: [PATCH 057/442] Fix race condition when read and write L_values at the same k --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 42 ++++------------ .../KokkosSparse_spiluk_symbolic_impl.hpp | 50 ++----------------- sparse/src/KokkosSparse_spiluk_handle.hpp | 12 ----- 3 files changed, 13 insertions(+), 91 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 38704c2fb0..069d6d866d 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -235,7 +235,7 @@ struct ILUKLvlSchedTP1NumericFunctor { iw(my_team, col) = static_cast(k); }); #endif - +team.team_barrier(); #ifdef KEEP_DIAG // if 
(my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); Kokkos::single(Kokkos::PerTeam(team), @@ -280,15 +280,16 @@ struct ILUKLvlSchedTP1NumericFunctor { #endif { nnz_lno_t prev_row = L_entries(k); + + scalar_t fact; + Kokkos::single(Kokkos::PerTeam(team), [&](scalar_t& tmp_fact) { #ifdef KEEP_DIAG - scalar_t fact = L_values(k) / U_values(U_row_map(prev_row)); + tmp_fact = L_values(k) / U_values(U_row_map(prev_row)); #else - scalar_t fact = L_values(k) * U_values(U_row_map(prev_row)); + tmp_fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - // if (my_thread == 0) L_values(k) = fact; - Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); - - team.team_barrier(); + L_values(k) = tmp_fact; + }, fact); Kokkos::parallel_for( Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, @@ -299,9 +300,9 @@ struct ILUKLvlSchedTP1NumericFunctor { auto lxu = -U_values(kk) * fact; if (ipos != -1) { if (col < rowid) - Kokkos::atomic_add(&L_values(ipos), lxu); + L_values(ipos) += lxu; else - Kokkos::atomic_add(&U_values(ipos), lxu); + U_values(ipos) += lxu; } }); // end for kk @@ -373,24 +374,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, size_type nlevels = thandle.get_num_levels(); int team_size = thandle.get_team_size(); -#ifdef KOKKOS_ARCH_VOLTA - size_type maxnnzperrow = thandle.get_level_maxnnzperrow(); - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - if (team_size == -1) { - // Round up maxnnzperrow to closest power of 2 - size_type power_maxnnzperrow = 1; - while (power_maxnnzperrow < maxnnzperrow) power_maxnnzperrow *= 2; - if (power_maxnnzperrow > 1024) - team_size = 1024; - else if (power_maxnnzperrow >= 128) - team_size = 768; - else - team_size = 32; - } - } -#endif - // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); HandleDeviceEntriesType level_idx = thandle.get_level_idx(); @@ -454,10 
+437,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, L_values, U_row_map, U_entries, U_values, level_idx, iw, lev_start + lvl_rowid_start); -#ifdef KOKKOS_ARCH_VOLTA - Kokkos::parallel_for("parfor_tp1", - policy_type(lvl_nrows_chunk, team_size), tstf); -#else if (team_size == -1) Kokkos::parallel_for("parfor_tp1", policy_type(lvl_nrows_chunk, Kokkos::AUTO), @@ -465,7 +444,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, else Kokkos::parallel_for("parfor_tp1", policy_type(lvl_nrows_chunk, team_size), tstf); -#endif Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; } diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 67d79a6186..85839561a1 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -95,11 +95,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, template void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, -#ifdef KOKKOS_ARCH_VOLTA - const RowMapType U_row_map, -#endif - LevelType1& level_list, + const EntriesType entries, LevelType1& level_list, LevelType2& level_ptr, LevelType2& level_idx, size_type& nlevels) { // Scheduling currently compute on host @@ -162,9 +158,6 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, size_type maxrows = 0; size_type maxrowsperchunk = 0; -#ifdef KOKKOS_ARCH_VOLTA - size_type maxnnzperrow = 0; -#endif for (size_type i = 0; i < nlevels; ++i) { size_type lnrows = level_ptr(i + 1) - level_ptr(i); if (maxrows < lnrows) { @@ -178,42 +171,11 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) ? 
(lnrows / lnchunks(i)) : (lnrows / lnchunks(i) + 1); -#ifdef KOKKOS_ARCH_VOLTA - nnz_lno_t lvl_rowid_start = 0; - nnz_lno_t lvl_nrows_chunk; - for (nnz_lno_t chunkid = 0; chunkid < lnchunks(i); chunkid++) { - if ((lvl_rowid_start + lnrowsperchunk(i)) > static_cast(lnrows)) - lvl_nrows_chunk = static_cast(lnrows) - lvl_rowid_start; - else - lvl_nrows_chunk = lnrowsperchunk(i); - // Determine the number of non-zeros in each level - for (nnz_lno_t r = 0; r < lvl_nrows_chunk; r++) { // Look at each row in the chunk - auto rid = level_idx(r + level_ptr(i) + lvl_rowid_start);// get actual rowid - nnz_lno_t rnnzU = U_row_map(rid + 1) - U_row_map(rid); // count the number of non-zeros in the current row of U - //nnz_lno_t rnnzL = row_map(rid + 1) - row_map(rid); // count the number of non-zeros in the current row of L - if (maxnnzperrow < static_cast(rnnzU)) { - maxnnzperrow = static_cast(rnnzU); - } - } - lvl_rowid_start += lvl_nrows_chunk; - } -#endif } else #endif { lnchunks(i) = 1; lnrowsperchunk(i) = lnrows; -#ifdef KOKKOS_ARCH_VOLTA - // Determine the number of non-zeros in each level - for (nnz_lno_t r = 0; r < lnrows; r++) { // Look at each row in the chunk - auto rid = level_idx(r + level_ptr(i));// get actual rowid - nnz_lno_t rnnzU = U_row_map(rid + 1) - U_row_map(rid); // count the number of non-zeros in the current row of U - //nnz_lno_t rnnzL = row_map(rid + 1) - row_map(rid); // count the number of non-zeros in the current row of L - if (maxnnzperrow < static_cast(rnnzU)) { - maxnnzperrow = static_cast(rnnzU); - } - } -#endif } if (maxrowsperchunk < static_cast(lnrowsperchunk(i))) { maxrowsperchunk = lnrowsperchunk(i); @@ -223,9 +185,6 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); -#ifdef KOKKOS_ARCH_VOLTA - thandle.set_level_maxnnzperrow(maxnnzperrow); -#endif } // Linear Search for the smallest row index @@ 
-474,11 +433,8 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_sched_tp(thandle, L_row_map, L_entries, -#ifdef KOKKOS_ARCH_VOLTA - U_row_map, -#endif - level_list, level_ptr, level_idx, nlev); + level_sched_tp(thandle, L_row_map, L_entries, level_list, + level_ptr, level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, diff --git a/sparse/src/KokkosSparse_spiluk_handle.hpp b/sparse/src/KokkosSparse_spiluk_handle.hpp index 48909c5b3c..78c072e72a 100644 --- a/sparse/src/KokkosSparse_spiluk_handle.hpp +++ b/sparse/src/KokkosSparse_spiluk_handle.hpp @@ -95,8 +95,6 @@ class SPILUKHandle { size_type level_maxrows; // max. number of rows among levels size_type level_maxrowsperchunk; // max.number of rows among chunks among levels - size_type - level_maxnnzperrow; // max.number of nnz per row among levels bool symbolic_complete; @@ -121,7 +119,6 @@ class SPILUKHandle { nnzU(nnzU_), level_maxrows(0), level_maxrowsperchunk(0), - level_maxnnzperrow(0), symbolic_complete(symbolic_complete_), algm(choice), team_size(-1), @@ -135,7 +132,6 @@ class SPILUKHandle { set_nnzU(nnzU_); set_level_maxrows(0); set_level_maxrowsperchunk(0); - set_level_maxnnzperrow(0); level_list = nnz_row_view_t("level_list", nrows_), level_idx = nnz_lno_view_t("level_idx", nrows_), level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), @@ -218,14 +214,6 @@ class SPILUKHandle { this->level_maxrowsperchunk = level_maxrowsperchunk_; } - KOKKOS_INLINE_FUNCTION - size_type get_level_maxnnzperrow() const { return level_maxnnzperrow; } - - KOKKOS_INLINE_FUNCTION - void set_level_maxnnzperrow(const size_type level_maxnnzperrow_) { - this->level_maxnnzperrow = level_maxnnzperrow_; - } - bool is_symbolic_complete() const { return symbolic_complete; } size_type get_num_levels() const { return 
nlevels; } From 6d02704ad97ca0f12e942c2700db02a748097852 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 10 Feb 2023 16:44:33 -0800 Subject: [PATCH 058/442] Remove one unnecessary barrier --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 2 +- sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 069d6d866d..d2b92c2e6e 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -235,7 +235,7 @@ struct ILUKLvlSchedTP1NumericFunctor { iw(my_team, col) = static_cast(k); }); #endif -team.team_barrier(); + #ifdef KEEP_DIAG // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); Kokkos::single(Kokkos::PerTeam(team), diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 85839561a1..5926aa872e 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -433,8 +433,8 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_sched_tp(thandle, L_row_map, L_entries, level_list, - level_ptr, level_idx, nlev); + level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, From e2b8df3fd744900f116ae0d72093f9611c94cc78 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 10 Feb 2023 22:05:44 -0800 Subject: [PATCH 059/442] Make hlevel_ptr a separate allocation --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 14 ++------------ sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 13 +++++++------ sparse/src/KokkosSparse_spiluk_handle.hpp | 8 
++++++++ 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index d2b92c2e6e..787ac606b3 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -374,22 +374,12 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, size_type nlevels = thandle.get_num_levels(); int team_size = thandle.get_team_size(); - // Keep these as host View, create device version and copy back to host - HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - // Make level_ptr_h a separate allocation, since it will be accessed on host - // between kernel launches. If a mirror were used and level_ptr is in UVM - // space, a fence would be required before each access since UVM views can - // share pages. 
- LevelHostViewType level_ptr_h, level_nchunks_h, level_nrowsperchunk_h; + LevelHostViewType level_nchunks_h, level_nrowsperchunk_h; WorkViewType iw; - level_ptr_h = LevelHostViewType( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level pointers"), - level_ptr.extent(0)); - Kokkos::deep_copy(level_ptr_h, level_ptr); - //{ if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 5926aa872e..b9c9ea8a9c 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -33,10 +33,11 @@ namespace Impl { namespace Experimental { template + class LevelType1, class LevelType2, class LevelType3, + class size_type> void level_sched(IlukHandle& thandle, const RowMapType row_map, const EntriesType entries, LevelType1& level_list, - LevelType2& level_ptr, LevelType2& level_idx, + LevelType2& level_ptr, LevelType3& level_idx, size_type& nlevels) { // Scheduling currently compute on host @@ -93,10 +94,11 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, // SEQLVLSCHD_TP1 algorithm (chunks) template + class LevelType1, class LevelType2, class LevelType3, + class size_type> void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, const EntriesType entries, LevelType1& level_list, - LevelType2& level_ptr, LevelType2& level_idx, + LevelType2& level_ptr, LevelType3& level_idx, size_type& nlevels) { // Scheduling currently compute on host @@ -264,8 +266,7 @@ void iluk_symbolic(IlukHandle& thandle, Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dlevel_list); HandleDeviceEntriesType dlevel_ptr = thandle.get_level_ptr(); - auto level_ptr = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dlevel_ptr); + auto level_ptr = thandle.get_host_level_ptr(); HandleDeviceEntriesType dlevel_idx = thandle.get_level_idx(); auto 
level_idx = diff --git a/sparse/src/KokkosSparse_spiluk_handle.hpp b/sparse/src/KokkosSparse_spiluk_handle.hpp index 78c072e72a..4d163c205b 100644 --- a/sparse/src/KokkosSparse_spiluk_handle.hpp +++ b/sparse/src/KokkosSparse_spiluk_handle.hpp @@ -83,6 +83,9 @@ class SPILUKHandle { nnz_lno_view_t level_idx; // the list of rows in each level nnz_lno_view_t level_ptr; // the starting index (into the view level_idx) of each level + // Make hlevel_ptr a separate allocation, since it will be accessed on host + // between kernel launches. + nnz_lno_view_host_t hlevel_ptr; nnz_lno_view_host_t level_nchunks; // number of chunks of rows at each level nnz_lno_view_host_t level_nrowsperchunk; // maximum number of rows among chunks at each level @@ -110,6 +113,7 @@ class SPILUKHandle { : level_list(), level_idx(), level_ptr(), + hlevel_ptr(), level_nchunks(), level_nrowsperchunk(), iw(), @@ -135,6 +139,7 @@ class SPILUKHandle { level_list = nnz_row_view_t("level_list", nrows_), level_idx = nnz_lno_view_t("level_idx", nrows_), level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + hlevel_ptr = nnz_lno_view_host_t("hlevel_ptr", nrows_ + 1), level_nchunks = nnz_lno_view_host_t(), level_nrowsperchunk = nnz_lno_view_host_t(), reset_symbolic_complete(), iw = work_view_t(); @@ -155,6 +160,9 @@ class SPILUKHandle { KOKKOS_INLINE_FUNCTION nnz_lno_view_t get_level_ptr() const { return level_ptr; } + inline + nnz_lno_view_host_t get_host_level_ptr() const { return hlevel_ptr; } + KOKKOS_INLINE_FUNCTION nnz_lno_view_host_t get_level_nchunks() const { return level_nchunks; } From a67bc42ce3740e7b50cd691e23130576c6606920 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Fri, 10 Feb 2023 23:20:37 -0700 Subject: [PATCH 060/442] Apply clang format --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 22 ++++++++++--------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 8 +++---- sparse/src/KokkosSparse_spiluk_handle.hpp | 3 +-- 3 files changed, 16 insertions(+), 17 deletions(-) diff 
--git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 787ac606b3..94483da47e 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -282,14 +282,17 @@ struct ILUKLvlSchedTP1NumericFunctor { nnz_lno_t prev_row = L_entries(k); scalar_t fact; - Kokkos::single(Kokkos::PerTeam(team), [&](scalar_t& tmp_fact) { + Kokkos::single( + Kokkos::PerTeam(team), + [&](scalar_t &tmp_fact) { #ifdef KEEP_DIAG - tmp_fact = L_values(k) / U_values(U_row_map(prev_row)); + tmp_fact = L_values(k) / U_values(U_row_map(prev_row)); #else - tmp_fact = L_values(k) * U_values(U_row_map(prev_row)); + tmp_fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - L_values(k) = tmp_fact; - }, fact); + L_values(k) = tmp_fact; + }, + fact); Kokkos::parallel_for( Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, @@ -372,9 +375,9 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; size_type nlevels = thandle.get_num_levels(); - int team_size = thandle.get_team_size(); + int team_size = thandle.get_team_size(); - LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); + LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); HandleDeviceEntriesType level_idx = thandle.get_level_idx(); LevelHostViewType level_nchunks_h, level_nrowsperchunk_h; @@ -428,9 +431,8 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, lev_start + lvl_rowid_start); if (team_size == -1) - Kokkos::parallel_for("parfor_tp1", - policy_type(lvl_nrows_chunk, Kokkos::AUTO), - tstf); + Kokkos::parallel_for( + "parfor_tp1", policy_type(lvl_nrows_chunk, Kokkos::AUTO), tstf); else Kokkos::parallel_for("parfor_tp1", policy_type(lvl_nrows_chunk, team_size), tstf); diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 
b9c9ea8a9c..616e87f154 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -33,8 +33,7 @@ namespace Impl { namespace Experimental { template + class LevelType1, class LevelType2, class LevelType3, class size_type> void level_sched(IlukHandle& thandle, const RowMapType row_map, const EntriesType entries, LevelType1& level_list, LevelType2& level_ptr, LevelType3& level_idx, @@ -94,8 +93,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, // SEQLVLSCHD_TP1 algorithm (chunks) template + class LevelType1, class LevelType2, class LevelType3, class size_type> void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, const EntriesType entries, LevelType1& level_list, LevelType2& level_ptr, LevelType3& level_idx, @@ -266,7 +264,7 @@ void iluk_symbolic(IlukHandle& thandle, Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dlevel_list); HandleDeviceEntriesType dlevel_ptr = thandle.get_level_ptr(); - auto level_ptr = thandle.get_host_level_ptr(); + auto level_ptr = thandle.get_host_level_ptr(); HandleDeviceEntriesType dlevel_idx = thandle.get_level_idx(); auto level_idx = diff --git a/sparse/src/KokkosSparse_spiluk_handle.hpp b/sparse/src/KokkosSparse_spiluk_handle.hpp index 4d163c205b..6ccd42709a 100644 --- a/sparse/src/KokkosSparse_spiluk_handle.hpp +++ b/sparse/src/KokkosSparse_spiluk_handle.hpp @@ -160,8 +160,7 @@ class SPILUKHandle { KOKKOS_INLINE_FUNCTION nnz_lno_view_t get_level_ptr() const { return level_ptr; } - inline - nnz_lno_view_host_t get_host_level_ptr() const { return hlevel_ptr; } + inline nnz_lno_view_host_t get_host_level_ptr() const { return hlevel_ptr; } KOKKOS_INLINE_FUNCTION nnz_lno_view_host_t get_level_nchunks() const { return level_nchunks; } From a66a5d6d63734c2355e2ede46dc3e2bdb9ff9586 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Sun, 12 Feb 2023 00:07:14 -0800 Subject: [PATCH 061/442] Fix uninitialized error --- 
sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 94483da47e..9436b67029 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -281,7 +281,7 @@ struct ILUKLvlSchedTP1NumericFunctor { { nnz_lno_t prev_row = L_entries(k); - scalar_t fact; + scalar_t fact = scalar_t(0.0); Kokkos::single( Kokkos::PerTeam(team), [&](scalar_t &tmp_fact) { From 9ff35198d66f97b1042134ccfa0382cb87b029b4 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 13 Feb 2023 11:01:38 -0700 Subject: [PATCH 062/442] Add utility KokkosSparse::removeCrsMatrixZeros(A, tol) (#1681) * Add utility KokkosSparse::removeCrsMatrixZeros(A, tol) And testing for it. It removes explicit zeros (or, entries where |Aij| < tol) from a matrix and returns a new matrix. If A has no entries to remove, A is returned unchanged. * Add missing Kokkos_Bitset.hpp include * Incorporate feedback on removeCrsMatrixZeros test - Simplify rowptr filling loop in reference impl - Hardcode the reference outputs for all hardcoded test cases, and check the reference impl itself against them. * Add fence Make sure all kernels finish running before temporary views go out of scope. 
--- sparse/src/KokkosSparse_Utils.hpp | 199 ++++++++++++++ sparse/unit_test/Test_Sparse.hpp | 1 + sparse/unit_test/Test_Sparse_Utils.hpp | 98 +++++++ .../Test_Sparse_removeCrsMatrixZeros.hpp | 258 ++++++++++++++++++ sparse/unit_test/Test_Sparse_spgemm.hpp | 82 +----- 5 files changed, 558 insertions(+), 80 deletions(-) create mode 100644 sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index de01b3e2c1..4039b6f5a7 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -24,6 +24,7 @@ #include "KokkosKernels_PrintUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_BsrMatrix.hpp" +#include "Kokkos_Bitset.hpp" #ifdef KOKKOSKERNELS_HAVE_PARALLEL_GNUSORT #include @@ -2108,6 +2109,200 @@ bool isCrsGraphSorted(const Rowmap &rowmap, const Entries &entries) { return totalFallingEdges == rowBoundaryFallingEdges; } +template +struct CountDroppedEntriesFunctor { + using Scalar = typename Values::non_const_value_type; + CountDroppedEntriesFunctor(const Values &values_, Mag tol_) + : values(values_), tol(tol_) {} + + KOKKOS_INLINE_FUNCTION void operator()(int64_t i, Offset &lcount) const { + if (Kokkos::ArithTraits::abs(values(i)) <= tol) lcount++; + } + + Values values; + Mag tol; +}; + +template +struct MarkFinalRowEntries { + MarkFinalRowEntries(const Bitset &rowEndMarkers_, const Rowmap &rowmap_) + : rowEndMarkers(rowEndMarkers_), rowmap(rowmap_) {} + + KOKKOS_INLINE_FUNCTION void operator()(int64_t i) const { + auto index = rowmap(i); + if (index) rowEndMarkers.set(index - 1); + } + + Bitset rowEndMarkers; + Rowmap rowmap; +}; + +template +struct DropEntriesScanner { + KOKKOS_DEFAULTED_FUNCTION DropEntriesScanner() = default; + KOKKOS_INLINE_FUNCTION DropEntriesScanner(Offset i_out_, Offset row_) + : i_out(i_out_), row(row_) {} + + KOKKOS_INLINE_FUNCTION void operator+=( + const DropEntriesScanner &rhs) { + i_out += rhs.i_out; + row += 
rhs.row; + } + + Offset i_out; // The index to write in output entries/values + Offset row; // The row index (ignoring rows which were empty in input) +}; + +template +struct DropEntriesFunctor { + using Offset = typename RowmapIn::non_const_value_type; + using Scalar = typename ValuesIn::non_const_value_type; + + DropEntriesFunctor(const Bitset &rowEndMarkers_, const RowmapIn &rowmapIn_, + const EntriesIn &entriesIn_, const ValuesIn &valuesIn_, + const RowmapOut &compactRowmapOut_, + const EntriesOut &entriesOut_, const ValuesOut &valuesOut_, + Mag tol_) + : rowEndMarkers(rowEndMarkers_), + rowmapIn(rowmapIn_), + entriesIn(entriesIn_), + valuesIn(valuesIn_), + compactRowmapOut(compactRowmapOut_), + entriesOut(entriesOut_), + valuesOut(valuesOut_), + tol(tol_) {} + + KOKKOS_INLINE_FUNCTION void operator()(int64_t i_in, + DropEntriesScanner &scanval, + bool finalPass) const { + // i_in is the index of the input entry being processed + // i_out (if finalPass == true) is the index of where that same entry goes + // in the filtered matrix + bool filter = Kokkos::ArithTraits::abs(valuesIn(i_in)) <= tol; + bool isRowEnd = rowEndMarkers.test(i_in); + if (finalPass) { + if (!filter) { + // Keeping this entry, so copy it to the output. + entriesOut(scanval.i_out) = entriesIn(i_in); + valuesOut(scanval.i_out) = valuesIn(i_in); + } + if (isRowEnd) { + // Entry i_in was the last in its row of the input matrix. + // We now know where that filtered row ends, so mark it in + // compactRowmapOut. + compactRowmapOut(scanval.row + 1) = scanval.i_out + (filter ? 
0 : 1); + } + // Also, make one thread responsible for initializing first compact rowmap + // entry + if (i_in == 0) compactRowmapOut(0) = 0; + } + if (!filter) scanval.i_out++; + if (isRowEnd) scanval.row++; + } + + Bitset rowEndMarkers; + RowmapIn rowmapIn; + EntriesIn entriesIn; + ValuesIn valuesIn; + RowmapOut compactRowmapOut; + EntriesOut entriesOut; + ValuesOut valuesOut; + Mag tol; +}; + +template +struct ExpandRowmapFunctor { + using Offset = typename RowmapIn::non_const_value_type; + + ExpandRowmapFunctor(const RowmapIn &rowmapIn_, + const RowmapOut &compactRowmapOut_, + const RowmapOut &rowmapOut_) + : rowmapIn(rowmapIn_), + compactRowmapOut(compactRowmapOut_), + rowmapOut(rowmapOut_) {} + + KOKKOS_INLINE_FUNCTION void operator()(Ordinal row, Ordinal &compactRow, + bool finalPass) const { + if (finalPass) { + rowmapOut(row) = compactRowmapOut(compactRow); + } + if (row + 1 < rowmapIn.extent_int(0) && rowmapIn(row + 1) != rowmapIn(row)) + compactRow++; + } + + RowmapIn rowmapIn; + RowmapOut compactRowmapOut; + RowmapOut rowmapOut; +}; + +// Given a CrsMatrix A, filter out all entries Aij where |Aij| <= tol. +// If there are no entries to remove, A is returned. +// Otherwise a new matrix is returned. 
+template +Matrix removeCrsMatrixZeros( + const Matrix &A, + typename Kokkos::ArithTraits::mag_type tol = + 0) { + using Ordinal = typename Matrix::non_const_ordinal_type; + using Offset = typename Matrix::non_const_size_type; + using Device = typename Matrix::device_type; + using ExecSpace = typename Device::execution_space; + using Mag = decltype(tol); + using RangePol = Kokkos::RangePolicy; + // First, count the number of entries to remove + Offset entriesToRemove; + Kokkos::parallel_reduce( + RangePol(0, A.nnz()), + CountDroppedEntriesFunctor( + A.values, tol), + entriesToRemove); + if (entriesToRemove == Offset(0)) { + // The matrix has no zeros to remove, so just return it as-is + return A; + } + // Actually have to make the new matrix with (near-)zeros removed. + // To help construct the new rowmap, for each original entry record whether + // it's at the end of its row. + Kokkos::Bitset rowEndMarkersNonconst(A.nnz()); + Kokkos::parallel_for( + RangePol(0, A.graph.row_map.extent(0)), + MarkFinalRowEntries(rowEndMarkersNonconst, A.graph.row_map)); + Offset filteredNNZ = A.nnz() - entriesToRemove; + typename Matrix::values_type::non_const_type filteredValues( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Afiltered values"), + filteredNNZ); + typename Matrix::index_type::non_const_type filteredEntries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Afiltered entries"), + filteredNNZ); + typename Matrix::row_map_type::non_const_type compactFilteredRowmap( + Kokkos::view_alloc(Kokkos::WithoutInitializing, + "Afiltered rowmap (compact)"), + A.numRows() + 1); + typename Matrix::row_map_type::non_const_type filteredRowmap( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Afiltered rowmap"), + A.numRows() + 1); + // Using a parallel scan, compact the non-filtered entries and partially fill + // in the rowmap (only marking row begins for rows which were originally + // non-empty) The rest can be filled in with a max-scan. 
+ Kokkos::ConstBitset rowEndMarkers(rowEndMarkersNonconst); + Kokkos::parallel_scan( + RangePol(0, A.nnz()), + DropEntriesFunctor(rowEndMarkers, A.graph.row_map, A.graph.entries, + A.values, compactFilteredRowmap, filteredEntries, + filteredValues, tol)); + Kokkos::parallel_scan( + RangePol(0, A.numRows() + 1), + ExpandRowmapFunctor(A.graph.row_map, compactFilteredRowmap, + filteredRowmap)); + ExecSpace().fence(); + return Matrix("A filtered", A.numRows(), A.numCols(), filteredNNZ, + filteredValues, filteredRowmap, filteredEntries); +} + template void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, const Entries &entriesIn, const Values &valuesIn) { @@ -2136,6 +2331,10 @@ void validateCrsMatrix(int m, int n, const Rowmap &rowmapIn, } } // namespace Impl + +using Impl::isCrsGraphSorted; +using Impl::removeCrsMatrixZeros; + } // namespace KokkosSparse #endif diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index d5fb879c64..647fff4c18 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -41,6 +41,7 @@ #include "Test_Sparse_TestUtils_RandCsMat.hpp" #include "Test_Sparse_ccs2crs.hpp" #include "Test_Sparse_crs2ccs.hpp" +#include "Test_Sparse_removeCrsMatrixZeros.hpp" // TPL specific tests, these require // particular pairs of backend and TPL diff --git a/sparse/unit_test/Test_Sparse_Utils.hpp b/sparse/unit_test/Test_Sparse_Utils.hpp index 1bfd33fc0d..73320e9358 100644 --- a/sparse/unit_test/Test_Sparse_Utils.hpp +++ b/sparse/unit_test/Test_Sparse_Utils.hpp @@ -18,6 +18,7 @@ #define TEST_SPARSE_UTILS_HPP #include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_SortCrs.hpp" namespace Test { @@ -37,6 +38,103 @@ vector_t create_random_y_vector_mv(crsMat_t crsMat, vector_t x_vector) { return y_vector; } +template +bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename 
graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + + size_t nrows_actual = output_mat_actual.numRows(); + size_t ncols_actual = output_mat_actual.numCols(); + size_t nentries_actual = output_mat_actual.graph.entries.extent(0); + size_t nvals_actual = output_mat_actual.values.extent(0); + + size_t nrows_reference = output_mat_reference.numRows(); + size_t ncols_reference = output_mat_reference.numCols(); + size_t nentries_reference = output_mat_reference.graph.entries.extent(0); + size_t nvals_reference = output_mat_reference.values.extent(0); + + if (nrows_actual != nrows_reference || ncols_actual != ncols_reference) { + std::cout << "dimensions (actual):" << nrows_actual << 'x' << ncols_actual + << ", dimensions (reference): " << nrows_reference << 'x' + << ncols_reference << '\n'; + return false; + } + if (nentries_actual != nentries_reference) { + std::cout << "nentries_actual:" << nentries_actual + << " nentries_reference:" << nentries_reference << std::endl; + return false; + } + if (nvals_actual != nvals_reference) { + std::cout << "nvals_actual:" << nvals_actual + << " nvals_reference:" << nvals_reference << std::endl; + return false; + } + + bool is_identical = true; + // Special case: a matrix with 0 rows can have a rowmap of length 0 or 1. + // Treat these as equivalent. 
+ bool zero_row_equivalent = false; + if (nrows_reference == 0) { + auto rm1 = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), output_mat_actual.graph.row_map); + auto rm2 = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), output_mat_reference.graph.row_map); + if (rm1.extent_int(0) == 0 && rm2.extent_int(0) == 1) { + // Make sure the one element of rm2 is 0 + zero_row_equivalent = !rm2(0); + } else if (rm1.extent_int(0) == 1 && rm2.extent_int(0) == 0) { + // Make sure the one element of rm1 is 0 + zero_row_equivalent = !rm1(0); + } + } + if (!zero_row_equivalent) { + is_identical = KokkosKernels::Impl::kk_is_identical_view< + typename graph_t::row_map_type, typename graph_t::row_map_type, + typename lno_view_t::value_type, typename device::execution_space>( + output_mat_actual.graph.row_map, output_mat_reference.graph.row_map, 0); + } + + if (!is_identical) { + std::cout << "rowmaps are different." << std::endl; + std::cout << "Actual rowmap:\n"; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.row_map, true); + std::cout << "Correct rowmap:\n"; + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.row_map, + true); + return false; + } + + is_identical = KokkosKernels::Impl::kk_is_identical_view< + lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, + typename device::execution_space>(output_mat_actual.graph.entries, + output_mat_reference.graph.entries, 0); + + if (!is_identical) { + std::cout << "entries are different." << std::endl; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.entries); + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.entries); + return false; + } + + typedef typename Kokkos::Details::ArithTraits< + typename scalar_view_t::non_const_value_type>::mag_type eps_type; + eps_type eps = std::is_same::value ? 
3.7e-3 : 1e-7; + + is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view< + scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>( + output_mat_actual.values, output_mat_reference.values, eps); + + if (!is_identical) { + std::cout << "values are different." << std::endl; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.values); + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.values); + + return false; + } + return true; +} } // namespace Test #endif // TEST_SPARSE_UTILS_HPP diff --git a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp new file mode 100644 index 0000000000..b5c57dbe49 --- /dev/null +++ b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp @@ -0,0 +1,258 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Sparse_SortCrs.hpp +/// \brief Tests for sort_crs_matrix and sort_crs_graph in +/// KokkosSparse_SortCrs.hpp + +#ifndef KOKKOSSPARSE_REMOVECRSZEROS_HPP +#define KOKKOSSPARSE_REMOVECRSZEROS_HPP + +#include +#include +#include +#include + +namespace TestRemoveCrsMatrixZeros { + +// Simple, sequential implementation of zero-removal to compare against +template +Matrix removeMatrixZerosReference(const Matrix& A) { + using Offset = typename Matrix::non_const_size_type; + using Ordinal = typename Matrix::ordinal_type; + using Scalar = typename Matrix::value_type; + using KAT = Kokkos::ArithTraits; + auto rowmapHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto entriesHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto valuesHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + // First, create the filtered rowmap (the CrsMatrix constructor taking host + // pointers does expect rowmap to be in Ordinal) + Ordinal filteredNNZ = 0; + std::vector filteredRowmap = {0}; // first row begins at 0 + for (Ordinal i = 0; i < A.numRows(); i++) { + for (Offset j = rowmapHost(i); j < rowmapHost(i + 1); j++) { + if (valuesHost(j) != KAT::zero()) { + filteredNNZ++; + } + } + filteredRowmap.push_back(filteredNNZ); + } + // Then allocate and fill in the filtered entries and values + std::vector filteredEntries; + std::vector filteredValues; + for (Offset i = 0; i < A.nnz(); i++) { + if (valuesHost(i) != KAT::zero()) { + filteredEntries.push_back(entriesHost(i)); + filteredValues.push_back(valuesHost(i)); + } + } + // Copy all the views back to device and construct matrix + return Matrix("A filtered", A.numRows(), A.numCols(), filteredNNZ, + filteredValues.data(), filteredRowmap.data(), + filteredEntries.data()); +} + +template +Matrix loadMatrixFromVectors(int numRows, int numCols, + const 
std::vector& rowmapRawInt, + const std::vector& entriesRawInt, + const std::vector& valuesRawDouble) { + using Offset = typename Matrix::non_const_size_type; + using Ordinal = typename Matrix::ordinal_type; + using Scalar = typename Matrix::value_type; + // The CrsMatrix constructor taking host pointers expects rowmap to be in + // Ordinal + std::vector rowmapRaw; + std::vector entriesRaw; + std::vector valuesRaw; + for (auto val : rowmapRawInt) rowmapRaw.push_back(val); + for (auto val : entriesRawInt) entriesRaw.push_back(val); + for (auto val : valuesRawDouble) valuesRaw.push_back(Scalar(val)); + Offset nnz = rowmapRaw.size() ? rowmapRaw[numRows] : 0; + return Matrix("A", numRows, numCols, nnz, valuesRaw.data(), rowmapRaw.data(), + entriesRaw.data()); +} + +template +void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { + using Offset = typename Matrix::size_type; + using Device = + Kokkos::Device; + bool haveHardcodedReference = true; + switch (test) { + case 0: { + // No entries, but nonzero dimensions. 
+ std::vector rowmap = {0, 0, 0, 0, 0}; + std::vector entries; + std::vector values; + A = loadMatrixFromVectors(4, 4, rowmap, entries, values); + Afiltered_ref = + loadMatrixFromVectors(4, 4, rowmap, entries, values); + break; + } + case 1: { + // Some empty rows, and some zero values + std::vector rowmap = {0, 0, 3, 3, 5}; + std::vector entries = {0, 1, 3, 1, 2}; + std::vector values = {1, 3, 0, 0, 2}; + A = loadMatrixFromVectors(4, 4, rowmap, entries, values); + std::vector rowmapFilt = {0, 0, 2, 2, 3}; + std::vector entriesFilt = {0, 1, 2}; + std::vector valuesFilt = {1, 3, 2}; + Afiltered_ref = loadMatrixFromVectors(4, 4, rowmapFilt, + entriesFilt, valuesFilt); + break; + } + case 2: { + // Zero-row matrix, length-0 rowmap + typename Matrix::row_map_type rowmap; + typename Matrix::index_type entries; + typename Matrix::values_type values; + A = Matrix("A empty", 0, 0, 0, values, rowmap, entries); + Afiltered_ref = A; + break; + } + case 3: { + // Zero-row matrix, length-1 rowmap + std::vector rowmap = {0}; + std::vector entries; + std::vector values; + A = loadMatrixFromVectors(0, 0, rowmap, entries, values); + Afiltered_ref = A; + break; + } + case 4: { + // A row of all zeros that will be filtered + std::vector rowmap = {0, 3, 6}; + std::vector entries = {0, 1, 2, 3, 4, 5}; + std::vector values = {0, 0, 0, 1, 1, 1}; + A = loadMatrixFromVectors(2, 6, rowmap, entries, values); + std::vector rowmapFilt = {0, 0, 3}; + std::vector entriesFilt = {3, 4, 5}; + std::vector valuesFilt = {1, 1, 1}; + Afiltered_ref = loadMatrixFromVectors(2, 6, rowmapFilt, + entriesFilt, valuesFilt); + break; + } + case 5: { + // One zero in each row that will be filtered + std::vector rowmap = {0, 2, 4, 7}; + std::vector entries = {0, 1, 1, 2, 0, 1, 2}; + std::vector values = {0, 1, 1, 0, 0, 3, -3}; + A = loadMatrixFromVectors(3, 3, rowmap, entries, values); + std::vector rowmapFilt = {0, 1, 2, 4}; + std::vector entriesFilt = {1, 1, 1, 2}; + std::vector valuesFilt = {1, 1, 3, -3}; + 
Afiltered_ref = loadMatrixFromVectors(3, 3, rowmapFilt, + entriesFilt, valuesFilt); + break; + } + case 6: { + // First and last rows empty + std::vector rowmap = {0, 0, 2, 2}; + std::vector entries = {0, 1}; + std::vector values = {0, 3.14}; + A = loadMatrixFromVectors(3, 2, rowmap, entries, values); + std::vector rowmapFilt = {0, 0, 1, 1}; + std::vector entriesFilt = {1}; + std::vector valuesFilt = {3.14}; + Afiltered_ref = loadMatrixFromVectors(3, 2, rowmapFilt, + entriesFilt, valuesFilt); + break; + } + case 7: { + // First and last rows nonempty, but will be empty after filtering + std::vector rowmap = {0, 2, 4, 6}; + std::vector entries = {0, 1, 1, 2, 0, 3}; + std::vector values = {0, 0, 1, -1, 0, 0}; + A = loadMatrixFromVectors(3, 4, rowmap, entries, values); + std::vector rowmapFilt = {0, 0, 2, 2}; + std::vector entriesFilt = {1, 2}; + std::vector valuesFilt = {1, -1}; + Afiltered_ref = loadMatrixFromVectors(3, 4, rowmapFilt, + entriesFilt, valuesFilt); + break; + } + case 8: { + // Large, random matrix with 30% of values converted to zero + Offset nnz = 40 * 10000; + A = KokkosSparse::Impl::kk_generate_sparse_matrix(10000, 10000, + nnz, 10, 5000); + auto valuesHost = Kokkos::create_mirror_view(A.values); + Kokkos::deep_copy(valuesHost, A.values); + for (Offset i = 0; i < A.nnz(); i++) { + if (rand() % 10 < 3) valuesHost(i) = 0.0; + } + Kokkos::deep_copy(A.values, valuesHost); + Afiltered_ref = removeMatrixZerosReference(A); + haveHardcodedReference = false; + break; + } + case 9: { + // Large, sparser random matrix with 99% of values converted to zero + Offset nnz = 10 * 40000; + A = KokkosSparse::Impl::kk_generate_sparse_matrix(40000, 40000, + nnz, 10, 10000); + auto valuesHost = Kokkos::create_mirror_view(A.values); + Kokkos::deep_copy(valuesHost, A.values); + for (Offset i = 0; i < A.nnz(); i++) { + if (rand() % 100 != 99) valuesHost(i) = 0.0; + } + Kokkos::deep_copy(A.values, valuesHost); + Afiltered_ref = removeMatrixZerosReference(A); + 
haveHardcodedReference = false; + break; + } + default: throw std::invalid_argument("Test case number of out bounds"); + } + // If we have a hardcoded reference, check that the reference impl is correct + // on this case + if (haveHardcodedReference) { + Matrix Afiltered_refimpl = removeMatrixZerosReference(A); + bool referenceImplMatchesHardcoded = + Test::is_same_matrix(Afiltered_ref, Afiltered_refimpl); + ASSERT_TRUE(referenceImplMatchesHardcoded) + << "Test case " << test << ": reference impl gave wrong answer!"; + } +} + +} // namespace TestRemoveCrsMatrixZeros + +void testRemoveCrsMatrixZeros(int testCase) { + using namespace TestRemoveCrsMatrixZeros; + using Device = + Kokkos::Device; + using Matrix = KokkosSparse::CrsMatrix; + Matrix A, Afiltered_ref; + getTestInput(testCase, A, Afiltered_ref); + Matrix Afiltered_actual = KokkosSparse::removeCrsMatrixZeros(A); + bool matches = + Test::is_same_matrix(Afiltered_actual, Afiltered_ref); + EXPECT_TRUE(matches) + << "Test case " << testCase + << ": matrix with zeros filtered out does not match reference."; +} + +TEST_F(TestCategory, sparse_remove_crs_zeros) { + for (int testCase = 0; testCase < 10; testCase++) + testRemoveCrsMatrixZeros(testCase); +} + +#endif // KOKKOSSPARSE_REMOVECRSZEROS_HPP diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index 4cb225b97d..4d53b1e126 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -19,6 +19,8 @@ #include "KokkosSparse_Utils.hpp" #include "KokkosSparse_SortCrs.hpp" +// For Test::is_same_matrix +#include "Test_Sparse_Utils.hpp" #include #include @@ -203,86 +205,6 @@ int run_spgemm_old_interface(crsMat_t A, crsMat_t B, kh.destroy_spgemm_handle(); return 0; } - -template -bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - 
typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - - size_t nrows_actual = output_mat_actual.numRows(); - size_t nentries_actual = output_mat_actual.graph.entries.extent(0); - size_t nvals_actual = output_mat_actual.values.extent(0); - - size_t nrows_reference = output_mat_reference.numRows(); - size_t nentries_reference = output_mat_reference.graph.entries.extent(0); - size_t nvals_reference = output_mat_reference.values.extent(0); - - if (nrows_actual != nrows_reference) { - std::cout << "nrows_actual:" << nrows_actual - << " nrows_reference:" << nrows_reference << std::endl; - return false; - } - if (nentries_actual != nentries_reference) { - std::cout << "nentries_actual:" << nentries_actual - << " nentries_reference:" << nentries_reference << std::endl; - return false; - } - if (nvals_actual != nvals_reference) { - std::cout << "nvals_actual:" << nvals_actual - << " nvals_reference:" << nvals_reference << std::endl; - return false; - } - - // Do not sort the actual product matrix - test that it's already sorted - KokkosSparse::sort_crs_matrix(output_mat_reference); - - bool is_identical = true; - is_identical = KokkosKernels::Impl::kk_is_identical_view< - typename graph_t::row_map_type, typename graph_t::row_map_type, - typename lno_view_t::value_type, typename device::execution_space>( - output_mat_actual.graph.row_map, output_mat_reference.graph.row_map, 0); - - if (!is_identical) { - std::cout << "rowmaps are different." 
<< std::endl; - std::cout << "Actual rowmap:\n"; - KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.row_map, true); - std::cout << "Correct rowmap (SPGEMM_DEBUG):\n"; - KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.row_map, - true); - return false; - } - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, - typename device::execution_space>(output_mat_actual.graph.entries, - output_mat_reference.graph.entries, 0); - - if (!is_identical) { - std::cout << "entries are different." << std::endl; - KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.entries); - KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.entries); - return false; - } - - typedef typename Kokkos::Details::ArithTraits< - typename scalar_view_t::non_const_value_type>::mag_type eps_type; - eps_type eps = std::is_same::value ? 3.7e-3 : 1e-7; - - is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view< - scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>( - output_mat_actual.values, output_mat_reference.values, eps); - - if (!is_identical) { - std::cout << "values are different." << std::endl; - KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.values); - KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.values); - - return false; - } - return true; -} } // namespace Test // Generate matrices and test all supported spgemm algorithms. From 9455f6505de92b62ebd2b93b8712594baf48a72b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 1 Dec 2022 17:52:47 -0700 Subject: [PATCH 063/442] BLAS: fix build with KokkosKernels_TEST_ETI_ONLY=OFF Mostly commenting all the tests that are templated on LayoutStride These mostly do not build because the view constructor called is not valid. 
Also changing some unification layer logic to not have return views templated on LayoutStride when it only makes sense for the input view to have such a layout... --- blas/src/KokkosBlas1_dot.hpp | 4 +-- blas/src/KokkosBlas1_nrm1.hpp | 20 +++++------ blas/src/KokkosBlas1_nrm2_squared.hpp | 4 +-- blas/src/KokkosBlas1_nrm2w_squared.hpp | 2 +- blas/unit_test/Test_Blas1_axpby.hpp | 9 ++--- blas/unit_test/Test_Blas1_axpy.hpp | 9 ++--- blas/unit_test/Test_Blas1_dot.hpp | 8 +++++ blas/unit_test/Test_Blas1_iamax.hpp | 6 +++- blas/unit_test/Test_Blas1_mult.hpp | 4 +++ blas/unit_test/Test_Blas1_nrm1.hpp | 2 ++ blas/unit_test/Test_Blas1_nrm2.hpp | 4 +++ blas/unit_test/Test_Blas1_nrm2w.hpp | 4 +++ blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 4 +++ blas/unit_test/Test_Blas1_nrminf.hpp | 4 +++ blas/unit_test/Test_Blas1_reciprocal.hpp | 4 +++ blas/unit_test/Test_Blas1_scal.hpp | 4 +++ blas/unit_test/Test_Blas1_sum.hpp | 4 +++ blas/unit_test/Test_Blas1_update.hpp | 4 +++ blas/unit_test/Test_Blas2_gemv.hpp | 2 ++ blas/unit_test/Test_Blas2_gemv_util.hpp | 2 +- blas/unit_test/Test_Blas3_gemm.hpp | 38 +++++++++++++++++++++ 21 files changed, 116 insertions(+), 26 deletions(-) diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index 6d87a70a08..aec666cd13 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -75,10 +75,10 @@ dot(const XVector& x, const YVector& y) { using result_type = typename KokkosBlas::Impl::DotAccumulatingScalar::type; using RVector_Internal = - Kokkos::View>; using RVector_Result = - Kokkos::View>; result_type result{}; diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index 9481cd9472..2377224b5f 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -39,19 +39,17 @@ nrm1(const XVector& x) { static_assert(XVector::rank == 1, "KokkosBlas::nrm1: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename 
XVector::non_const_value_type>::mag_type mag_type; + using mag_type= typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type; - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > - XVector_Internal; + using XVector_Internal = Kokkos::View::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; - typedef Kokkos::View > - RVector_Internal; + using RVector_Internal = Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result); diff --git a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index 8f053fad47..2bd0fe15c6 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -49,7 +49,7 @@ nrm2_squared(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -111,7 +111,7 @@ void nrm2_squared( typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, UnifiedXLayout>::array_layout; + RV, Kokkos::LayoutRight>::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index 62fa263ab0..e7333dc173 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -50,7 +50,7 @@ nrm2w_squared(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 5ba19c7ce5..b81926b3cb 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -136,8 +136,9 @@ void impl_test_axpby_mv(int N, int K) { } Kokkos::deep_copy(b_org_y, b_y); - auto h_b_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); + ViewTypeB org_y = vfB_type::view(b_org_y); + auto h_org_y = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); Kokkos::deep_copy(h_b_x, b_x); Kokkos::deep_copy(h_b_y, b_y); @@ -155,7 +156,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + b * h_b_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(a * h_x(i, j) + b * h_org_y(i, j), h_y(i, j), eps); } } @@ -165,7 +166,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + b * h_b_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(a * h_x(i, j) + b * h_org_y(i, j), h_y(i, j), eps); } } } diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 35293652b5..91395b60fb 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -138,8 +138,9 @@ void impl_test_axpy_mv(int N, int K) { } Kokkos::deep_copy(b_org_y, b_y); - auto h_b_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); + ViewTypeB org_y = vfB_type::view(b_org_y); + auto h_org_y = + 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); Kokkos::deep_copy(h_b_x, b_x); Kokkos::deep_copy(h_b_y, b_y); @@ -153,7 +154,7 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + h_b_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(a * h_x(i, j) + h_org_y(i, j), h_y(i, j), eps); } } @@ -162,7 +163,7 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + h_b_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(a * h_x(i, j) + h_org_y(i, j), h_y(i, j), eps); } } } diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index 044a9765d2..7b49ccc7de 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -185,6 +185,9 @@ int test_dot() { // Test::impl_test_dot(132231); #endif + // Removing the layout stride test as ViewTypeA a("a", N); + // is invalid since the view constructor needs a stride object! +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -201,6 +204,7 @@ int test_dot() { Test::impl_test_dot(1024); Test::impl_test_dot(1024); #endif +*/ return 1; } @@ -231,6 +235,9 @@ int test_dot_mv() { // Test::impl_test_dot_mv(132231,5); #endif + // Removing the layout stride test as ViewTypeA a("a", N); + // is invalid since the view constructor needs a stride object! 
+/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -248,6 +255,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(1024, 5); Test::impl_test_dot_mv(1024, 5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index 1619512ceb..efa5b3da67 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -27,7 +27,7 @@ void impl_test_iamax(int N) { typedef typename AT::mag_type mag_type; using size_type = typename ViewTypeA::size_type; - ViewTypeA a("a", N); + ViewTypeA a("A", N); typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); @@ -240,6 +240,7 @@ int test_iamax() { // Test::impl_test_iamax(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -249,6 +250,7 @@ int test_iamax() { Test::impl_test_iamax(1024); // Test::impl_test_iamax(132231); #endif +*/ return 1; } @@ -275,6 +277,7 @@ int test_iamax_mv() { // Test::impl_test_iamax_mv(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -284,6 +287,7 @@ int test_iamax_mv() { Test::impl_test_iamax_mv(1024, 5); // Test::impl_test_iamax_mv(132231,5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index c922cb295f..e438ae370a 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -208,6 +208,7 @@ int test_mult() { // Device>(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -231,6 +232,7 @@ int test_mult() { Test::impl_test_mult( 1024); #endif +*/ return 1; } @@ -269,6 +271,7 @@ int test_mult_mv() { // Device>(132231,5); 
#endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -292,6 +295,7 @@ int test_mult_mv() { Test::impl_test_mult_mv(1024, 5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index 7b56dc94b3..ca43988937 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -143,6 +143,7 @@ int test_nrm1() { Test::impl_test_nrm1(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -152,6 +153,7 @@ int test_nrm1() { Test::impl_test_nrm1(1024); Test::impl_test_nrm1(132231); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index 5cbc89488e..3d6d419e91 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -139,6 +139,7 @@ int test_nrm2() { // Test::impl_test_nrm2(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -148,6 +149,7 @@ int test_nrm2() { Test::impl_test_nrm2(1024); // Test::impl_test_nrm2(132231); #endif +*/ return 1; } @@ -176,6 +178,7 @@ int test_nrm2_mv() { // Test::impl_test_nrm2_mv(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -186,6 +189,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index b87c5ac48d..aade8b6bcd 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -137,6 +137,7 @@ int test_nrm2w() { // Test::impl_test_nrm2(132231); #endif +/* #if 
defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -146,6 +147,7 @@ int test_nrm2w() { Test::impl_test_nrm2w(1024); // Test::impl_test_nrm2(132231); #endif +*/ return 1; } @@ -174,6 +176,7 @@ int test_nrm2w_mv() { // Test::impl_test_nrm2w_mv(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -184,6 +187,7 @@ int test_nrm2w_mv() { Test::impl_test_nrm2w_mv(789, 1); // Test::impl_test_nrm2w_mv(132231,5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 3f76c84f30..387e313443 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -133,6 +133,7 @@ int test_nrm2w_squared() { // Test::impl_test_nrm2(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -142,6 +143,7 @@ int test_nrm2w_squared() { Test::impl_test_nrm2w_squared(1024); // Test::impl_test_nrm2(132231); #endif +*/ return 1; } @@ -170,6 +172,7 @@ int test_nrm2w_squared_mv() { // Test::impl_test_nrm2w_squared_mv(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -180,6 +183,7 @@ int test_nrm2w_squared_mv() { Test::impl_test_nrm2w_squared_mv(789, 1); // Test::impl_test_nrm2w_squared_mv(132231,5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index b827dfa26e..6d42ef1486 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -137,6 +137,7 @@ int test_nrminf() { // Test::impl_test_nrminf(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -146,6 +147,7 @@ int test_nrminf() { Test::impl_test_nrminf(1024); // Test::impl_test_nrminf(132231); #endif +*/ return 1; } @@ -172,6 +174,7 @@ int test_nrminf_mv() { // Test::impl_test_nrminf_mv(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -181,6 +184,7 @@ int test_nrminf_mv() { Test::impl_test_nrminf_mv(1024, 5); // Test::impl_test_nrminf_mv(132231,5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index fdec530ee6..1a2aebf782 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -212,6 +212,7 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -228,6 +229,7 @@ int test_reciprocal() { Test::impl_test_reciprocal(1024); Test::impl_test_reciprocal(1024); #endif +*/ return 1; } @@ -260,6 +262,7 @@ int test_reciprocal_mv() { // Device>(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -280,6 +283,7 @@ int test_reciprocal_mv() { Test::impl_test_reciprocal_mv(1024, 5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 5fac67417f..2f3fce5d03 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -195,6 +195,7 @@ int test_scal() { // Test::impl_test_scal(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -211,6 +212,7 @@ int test_scal() { Test::impl_test_scal(1024); 
Test::impl_test_scal(1024); #endif +*/ return 1; } @@ -239,6 +241,7 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -255,6 +258,7 @@ int test_scal_mv() { Test::impl_test_scal_mv(1024, 5); Test::impl_test_scal_mv(1024, 5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index d2ccd4bf3d..0cb3626987 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -128,6 +128,7 @@ int test_sum() { // Test::impl_test_sum(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -137,6 +138,7 @@ int test_sum() { Test::impl_test_sum(1024); // Test::impl_test_sum(132231); #endif +*/ return 1; } @@ -165,6 +167,7 @@ int test_sum_mv() { // Test::impl_test_sum_mv(132231,5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -175,6 +178,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index 5a0d27cf42..c2b5cef0bc 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -243,6 +243,7 @@ int test_update() { // Device>(132231); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -266,6 +267,7 @@ int test_update() { Test::impl_test_update(1024); #endif +*/ return 1; } @@ -304,6 +306,7 @@ int test_update_mv() { Device>(132231, 5); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -327,6 +330,7 @@ int test_update_mv() { Test::impl_test_update_mv(1024, 5); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index 4e8c53e7e6..8d3ce380fb 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -203,6 +203,7 @@ int test_gemv(const char* mode) { // Device>(mode,132231,1024); #endif +/* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -234,6 +235,7 @@ int test_gemv(const char* mode) { Test::impl_test_gemv( mode, 1024, 1024); #endif +*/ return 1; } diff --git a/blas/unit_test/Test_Blas2_gemv_util.hpp b/blas/unit_test/Test_Blas2_gemv_util.hpp index ba0392b2e1..99b4516cff 100644 --- a/blas/unit_test/Test_Blas2_gemv_util.hpp +++ b/blas/unit_test/Test_Blas2_gemv_util.hpp @@ -276,7 +276,7 @@ struct GEMVTest { // fetch GEMV functor from the factory using op_type = typename GemvFunc::template functor_type; + ViewTypeY, Device, ScalarType>; op_type gemv_op(trans, alpha, A, x, beta, y); Kokkos::parallel_for(Kokkos::TeamPolicy(1, 1), gemv_op); diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index 8ab92e25b1..d3d34d60e3 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -394,6 +394,23 @@ void test_gemm_enabled_layouts() { #endif } +template +void test_gemm_mixed_scalars() { + using CMatrix = Kokkos::View; + using BMatrix = Kokkos::View; + using AMatrix = Kokkos::View; + + AMatrix A("A", 10, 10); + BMatrix B("B", 10, 10); + CMatrix C("C", 10, 10); + + Kokkos::deep_copy(A, Kokkos::ArithTraits::one()); + Kokkos::deep_copy(B, Kokkos::ArithTraits::one()); + Kokkos::deep_copy(C, Kokkos::ArithTraits::one()); + + KokkosBlas::gemm(TestExecSpace(), "N", "N", 1.0, C, A, 0.0, B); +} + #if defined(KOKKOSKERNELS_INST_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -433,3 +450,24 @@ TEST_F(TestCategory, gemm_complex_float) { Kokkos::Profiling::popRegion(); } #endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, gemm_mixed_scalars_complex_double_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_double_double"); + test_gemm_mixed_scalars, double>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, gemm_mixed_scalar_complex_float_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_float_float"); + test_gemm_mixed_scalars, float>(); + Kokkos::Profiling::popRegion(); +} +#endif + From cc11c6d7a2b47627ce24be5c56e95e5c2f334981 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 2 Feb 2023 15:19:31 +0100 Subject: [PATCH 064/442] #5: Added basis for print_configuration method --- common/CMakeLists.txt | 2 + .../src/kokkoskernels_print_configuration.cpp | 81 +++++++++++++++++++ .../src/kokkoskernels_print_configuration.hpp | 28 +++++++ perf_test/Benchmark_Context.hpp | 8 +- perf_test/CMakeLists.txt | 2 +- 5 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 common/src/kokkoskernels_print_configuration.cpp create mode 100644 common/src/kokkoskernels_print_configuration.hpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 72972b5cd7..3acc28a7ba 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -7,3 +7,5 @@ LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) # Adding unit-tests KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/common) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/common) + +LIST(APPEND SOURCES 
${CMAKE_CURRENT_SOURCE_DIR}/common/src/kokkoskernels_print_configuration.cpp) diff --git a/common/src/kokkoskernels_print_configuration.cpp b/common/src/kokkoskernels_print_configuration.cpp new file mode 100644 index 0000000000..5d2d0523c8 --- /dev/null +++ b/common/src/kokkoskernels_print_configuration.cpp @@ -0,0 +1,81 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "kokkoskernels_print_configuration.hpp" +#include "KokkosKernels_config.h" + +#include + +namespace { +void print_enabled_tpls(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK + os << "LAPACK" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + os << "BLAS" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CBLAS + os << "CBLAS" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACKE + os << "LAPACKE" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_SUPERLU + os << "SUPERLU" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CHOLMOD + os << "CHOLMOD" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + os << "MKL" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + os << "CUBLAS" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + os << "CUSPARSE" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + os << "ROCBLAS" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCPARSE + os << "ROCPARSE" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_METIS + os << "METIS" << ";"; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ARMPL + tpls << "ARMPL" << ";"; +#endif +#ifdef 
KOKKOSKERNELS_ENABLE_TPL_MAGMA + tpls << "MAGMA" << ";"; +#endif +} + +void print_version(std::ostream& os) { + os << "Kernels Version: "<< KOKKOSKERNELS_VERSION <<'\n'; +} + +} // namespace + +void KokkosKernels::print_configuration(std::ostream& os) { + print_version(os); + + os << "Enabled TPLs names:\n"; + print_enabled_tpls(os); + +} + diff --git a/common/src/kokkoskernels_print_configuration.hpp b/common/src/kokkoskernels_print_configuration.hpp new file mode 100644 index 0000000000..f98fb9278d --- /dev/null +++ b/common/src/kokkoskernels_print_configuration.hpp @@ -0,0 +1,28 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP +#define _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP + +#include + +namespace KokkosKernels { + +/** \brief Print "Bill of Materials" */ +void print_configuration(std::ostream& os); + +} // namespace KokkosKernels +#endif // _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 0ef4910cc5..073de9e161 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -24,6 +24,7 @@ #include #include +#include namespace KokkosKernelsBenchmark { @@ -43,9 +44,10 @@ std::string remove_unwanted_characters(std::string str) { /// \brief Extract all key:value pairs from kokkos configuration and add it to /// the benchmark context -void add_kokkos_configuration(bool verbose) { +void add_kernels_configuration(bool verbose) { std::ostringstream msg; Kokkos::print_configuration(msg, verbose); + KokkosKernels::print_configuration(msg); // Iterate over lines returned from kokkos and extract key:value pairs std::stringstream ss{msg.str()}; @@ -64,8 +66,8 @@ void add_kokkos_configuration(bool verbose) { /// \brief Gather all context information and add it to benchmark context data void add_benchmark_context(bool verbose = false) { - // Add Kokkos configuration to benchmark context data - add_kokkos_configuration(verbose); + // Add Kokkos and kernerls configuration to benchmark context data + add_kernels_configuration(verbose); } } // namespace KokkosKernelsBenchmark diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 28752e9c6c..e9f5a9c88c 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -59,7 +59,7 @@ IF(KokkosKernels_ENABLE_BENCHMARK) message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") ENDIF() - find_package(benchmark QUIET) +# find_package(benchmark QUIET) IF(benchmark_FOUND) MESSAGE(STATUS "Using 
google benchmark found in ${benchmark_DIR}") From b60e9913fb3276bf3f46bc3a5ecd45c93d720419 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Mon, 6 Feb 2023 15:13:47 +0100 Subject: [PATCH 065/442] #5: moved print_configuration to header only file and added its test --- common/CMakeLists.txt | 2 - .../src/kokkoskernels_print_configuration.cpp | 81 ------------------- .../src/kokkoskernels_print_configuration.hpp | 28 ------- common/unit_test/Test_Common.hpp | 1 + perf_test/Benchmark_Context.hpp | 2 +- 5 files changed, 2 insertions(+), 112 deletions(-) delete mode 100644 common/src/kokkoskernels_print_configuration.cpp delete mode 100644 common/src/kokkoskernels_print_configuration.hpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3acc28a7ba..72972b5cd7 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -7,5 +7,3 @@ LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) # Adding unit-tests KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/common) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/common) - -LIST(APPEND SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/common/src/kokkoskernels_print_configuration.cpp) diff --git a/common/src/kokkoskernels_print_configuration.cpp b/common/src/kokkoskernels_print_configuration.cpp deleted file mode 100644 index 5d2d0523c8..0000000000 --- a/common/src/kokkoskernels_print_configuration.cpp +++ /dev/null @@ -1,81 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "kokkoskernels_print_configuration.hpp" -#include "KokkosKernels_config.h" - -#include - -namespace { -void print_enabled_tpls(std::ostream& os) { -#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK - os << "LAPACK" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - os << "BLAS" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_CBLAS - os << "CBLAS" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACKE - os << "LAPACKE" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_SUPERLU - os << "SUPERLU" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_CHOLMOD - os << "CHOLMOD" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - os << "MKL" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS - os << "CUBLAS" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - os << "CUSPARSE" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS - os << "ROCBLAS" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCPARSE - os << "ROCPARSE" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_METIS - os << "METIS" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_ARMPL - tpls << "ARMPL" << ";"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - tpls << "MAGMA" << ";"; -#endif -} - -void print_version(std::ostream& os) { - os << "Kernels Version: "<< KOKKOSKERNELS_VERSION <<'\n'; -} - -} // namespace - -void KokkosKernels::print_configuration(std::ostream& os) { - print_version(os); - - os << "Enabled TPLs names:\n"; - print_enabled_tpls(os); - -} - diff --git a/common/src/kokkoskernels_print_configuration.hpp b/common/src/kokkoskernels_print_configuration.hpp deleted file mode 100644 index f98fb9278d..0000000000 --- a/common/src/kokkoskernels_print_configuration.hpp +++ /dev/null @@ -1,28 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP -#define _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP - -#include - -namespace KokkosKernels { - -/** \brief Print "Bill of Materials" */ -void print_configuration(std::ostream& os); - -} // namespace KokkosKernels -#endif // _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index dd368f009b..36bc4bcf35 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -23,5 +23,6 @@ #include #include #include +#include #endif // TEST_COMMON_HPP diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 073de9e161..f9ce7bb744 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -42,7 +42,7 @@ std::string remove_unwanted_characters(std::string str) { return str.substr(from, to + 1); } -/// \brief Extract all key:value pairs from kokkos configuration and add it to +/// \brief Extract all key:value pairs from kernels configuration and add it to /// the benchmark context void add_kernels_configuration(bool verbose) { std::ostringstream msg; From 634b2cad7ac8b7e1efa6a95711c4c0f0a95d8cd6 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Mon, 6 Feb 2023 15:17:08 +0100 Subject: [PATCH 066/442] #5: added print_configuration file and its test --- .../src/KokkosKernels_PrintConfguration.hpp | 99 +++++++++++++++++++ .../Test_Common_PrintConfiguration.hpp | 61 ++++++++++++ 2 files changed, 160 insertions(+) create mode 100644 common/src/KokkosKernels_PrintConfguration.hpp create mode 100644 common/unit_test/Test_Common_PrintConfiguration.hpp diff --git 
a/common/src/KokkosKernels_PrintConfguration.hpp b/common/src/KokkosKernels_PrintConfguration.hpp new file mode 100644 index 0000000000..f786d937b3 --- /dev/null +++ b/common/src/KokkosKernels_PrintConfguration.hpp @@ -0,0 +1,99 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP +#define _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP + +#include "KokkosKernels_config.h" + +#include +#include + +namespace KokkosKernels { +constexpr std::string_view KernelsVersionKey= "Kernels Version"; +constexpr std::string_view EnabledTPLsNamesKey= "Enabled TPLs names"; + +namespace { +void print_enabled_tpls(std::ostream& os) { + std::list tpls; +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK + tpls.emplace_back("LAPACK"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + tpls.emplace_back("BLAS"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CBLAS + tpls.emplace_back("CBLAS"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACKE + tpls.emplace_back("LAPACKE"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_SUPERLU + tpls.emplace_back("SUPERLU"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CHOLMOD + tpls.emplace_back("CHOLMOD"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + tpls.emplace_back("MKL"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + tpls.emplace_back("CUBLAS"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + tpls.emplace_back("CUSPARSE"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + tpls.emplace_back("ROCBLAS"); +#endif +#ifdef 
KOKKOSKERNELS_ENABLE_TPL_ROCPARSE + tpls.emplace_back("ROCPARSE"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_METIS + tpls.emplace_back("METIS"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ARMPL + tpls.emplace_back("ARMPL"); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + tpls.emplace_back("MAGMA"); +#endif + if(!tpls.empty()){ + auto tplsIte = tpls.cbegin(); + os << *tplsIte; + ++tplsIte; + for(; tplsIte != tpls.cend(); ++tplsIte) { + os << ";" << *tplsIte ; + } + } +} + +void print_version(std::ostream& os) { + os << KernelsVersionKey<< ": "<< KOKKOSKERNELS_VERSION <<'\n'; +} + +} // namespace + +void print_configuration(std::ostream& os) { + print_version(os); + + os << EnabledTPLsNamesKey << ": "; + print_enabled_tpls(os); + os << "\n"; +} + +} // namespace KokkosKernels +#endif // _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp new file mode 100644 index 0000000000..c8ba2cc7de --- /dev/null +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -0,0 +1,61 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Common_PrintConfiguration.hpp +/// \brief Tests for print configuration + +#ifndef KOKKOSKERNELS_PRINTCONFIGURATION_HPP +#define KOKKOSKERNELS_PRINTCONFIGURATION_HPP + +#include "KokkosKernels_PrintConfguration.hpp" + +/// \brief Verify that all keys from kernels configuration and check their value +void check_print_configuration(std::ostream& os) { + std::ostringstream msg; + KokkosKernels::print_configuration(msg); + KokkosKernels::print_configuration(std::cout); + + bool kernelsVersionKeyFound = false; + bool enabledTPLsNamesKeyFound = false; + // Iterate over lines returned from kokkos and extract key:value pairs + std::stringstream ss{msg.str()}; + for (std::string line; std::getline(ss, line, '\n');) { + auto found = line.find_first_of(':'); + if (found != std::string::npos) { + auto currentKey = line.substr(0, found); + if (currentKey == KokkosKernels::KernelsVersionKey) { + kernelsVersionKeyFound = true; + } + else if (currentKey == KokkosKernels::EnabledTPLsNamesKey) { + enabledTPLsNamesKeyFound = true; + } + } + } + EXPECT_TRUE(kernelsVersionKeyFound && enabledTPLsNamesKeyFound); + +} + +/// \brief Verify that print_configuration print the expected keys from kernels configuration +template +void testPrintConfiguration() { + std::ostringstream out; + KokkosKernels::print_configuration(out); + check_print_configuration(out); +} + +TEST_F(TestCategory, common_print_configuration) { testPrintConfiguration(); } + +#endif // KOKKOSKERNELS_PRINTCONFIGURATION_HPP From 32d58f6c3daee20fdef4e2481ec46c8875c62c2a Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Mon, 6 Feb 2023 16:35:33 +0100 Subject: [PATCH 067/442] #5: fixed previous commit mistake --- perf_test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index e9f5a9c88c..28752e9c6c 100644 --- a/perf_test/CMakeLists.txt +++ 
b/perf_test/CMakeLists.txt @@ -59,7 +59,7 @@ IF(KokkosKernels_ENABLE_BENCHMARK) message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") ENDIF() -# find_package(benchmark QUIET) + find_package(benchmark QUIET) IF(benchmark_FOUND) MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") From 95b9ddcb5f91123403255dc22af7b7a73b978305 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Mon, 6 Feb 2023 20:56:11 +0100 Subject: [PATCH 068/442] #5 Updated print_configuration content format --- .../src/KokkosKernels_PrintConfguration.hpp | 85 +++++++++++-------- .../Test_Common_PrintConfiguration.hpp | 4 +- 2 files changed, 53 insertions(+), 36 deletions(-) diff --git a/common/src/KokkosKernels_PrintConfguration.hpp b/common/src/KokkosKernels_PrintConfguration.hpp index f786d937b3..3c5686d63f 100644 --- a/common/src/KokkosKernels_PrintConfguration.hpp +++ b/common/src/KokkosKernels_PrintConfguration.hpp @@ -20,79 +20,96 @@ #include "KokkosKernels_config.h" #include -#include namespace KokkosKernels { -constexpr std::string_view KernelsVersionKey= "Kernels Version"; -constexpr std::string_view EnabledTPLsNamesKey= "Enabled TPLs names"; - -namespace { +namespace Impl { void print_enabled_tpls(std::ostream& os) { - std::list tpls; #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK - tpls.emplace_back("LAPACK"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - tpls.emplace_back("BLAS"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_BLAS: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_BLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CBLAS - tpls.emplace_back("CBLAS"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACKE - tpls.emplace_back("LAPACKE"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: yes\n"; +#else + 
os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_SUPERLU - tpls.emplace_back("SUPERLU"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CHOLMOD - tpls.emplace_back("CHOLMOD"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - tpls.emplace_back("MKL"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_MKL: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_MKL: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS - tpls.emplace_back("CUBLAS"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - tpls.emplace_back("CUSPARSE"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS - tpls.emplace_back("ROCBLAS"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCPARSE - tpls.emplace_back("ROCPARSE"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_METIS - tpls.emplace_back("METIS"); + os << "KOKKOSKERNELS_ENABLE_TPL_METIS: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_METIS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ARMPL - tpls.emplace_back("ARMPL"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: yes\n"; +#else + os << " " << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - tpls.emplace_back("MAGMA"); + os << " " << "KOKKOSKERNELS_ENABLE_TPL_MAGMA: yes\n"; +#else + os << " " << 
"KOKKOSKERNELS_ENABLE_TPL_MAGMA: no\n"; #endif - if(!tpls.empty()){ - auto tplsIte = tpls.cbegin(); - os << *tplsIte; - ++tplsIte; - for(; tplsIte != tpls.cend(); ++tplsIte) { - os << ";" << *tplsIte ; - } - } + } void print_version(std::ostream& os) { - os << KernelsVersionKey<< ": "<< KOKKOSKERNELS_VERSION <<'\n'; + // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros + // are not available in Kernels + os << " "<<"Kernels Version: "<< KOKKOSKERNELS_VERSION <<'\n'; } -} // namespace +} // namespace Impl void print_configuration(std::ostream& os) { - print_version(os); + Impl::print_version(os); - os << EnabledTPLsNamesKey << ": "; - print_enabled_tpls(os); - os << "\n"; + os << "TPLs: \n"; + Impl::print_enabled_tpls(os); } } // namespace KokkosKernels diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp index c8ba2cc7de..26277a9dab 100644 --- a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -36,10 +36,10 @@ void check_print_configuration(std::ostream& os) { auto found = line.find_first_of(':'); if (found != std::string::npos) { auto currentKey = line.substr(0, found); - if (currentKey == KokkosKernels::KernelsVersionKey) { + if (currentKey == "Kernels Version") { kernelsVersionKeyFound = true; } - else if (currentKey == KokkosKernels::EnabledTPLsNamesKey) { + else if (currentKey == "TPLs") { enabledTPLsNamesKeyFound = true; } } From e3c311bd72ae778f532a7d36c972e8833df30800 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Mon, 6 Feb 2023 21:04:59 +0100 Subject: [PATCH 069/442] #5: updated key verification --- common/unit_test/Test_Common_PrintConfiguration.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp index 26277a9dab..8cec466217 100644 --- 
a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -26,7 +26,6 @@ void check_print_configuration(std::ostream& os) { std::ostringstream msg; KokkosKernels::print_configuration(msg); - KokkosKernels::print_configuration(std::cout); bool kernelsVersionKeyFound = false; bool enabledTPLsNamesKeyFound = false; @@ -36,7 +35,7 @@ void check_print_configuration(std::ostream& os) { auto found = line.find_first_of(':'); if (found != std::string::npos) { auto currentKey = line.substr(0, found); - if (currentKey == "Kernels Version") { + if (currentKey == " Kernels Version") { kernelsVersionKeyFound = true; } else if (currentKey == "TPLs") { From 8c1a89e0ebd4c6070484f82b8e890662570e07b4 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Mon, 6 Feb 2023 21:38:32 +0100 Subject: [PATCH 070/442] #5: Added inline to avoid multiple definition problem --- common/src/KokkosKernels_PrintConfguration.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/src/KokkosKernels_PrintConfguration.hpp b/common/src/KokkosKernels_PrintConfguration.hpp index 3c5686d63f..ee428dd658 100644 --- a/common/src/KokkosKernels_PrintConfguration.hpp +++ b/common/src/KokkosKernels_PrintConfguration.hpp @@ -23,7 +23,7 @@ namespace KokkosKernels { namespace Impl { -void print_enabled_tpls(std::ostream& os) { +inline void print_enabled_tpls(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: yes\n"; #else @@ -97,7 +97,7 @@ void print_enabled_tpls(std::ostream& os) { } -void print_version(std::ostream& os) { +inline void print_version(std::ostream& os) { // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros // are not available in Kernels os << " "<<"Kernels Version: "<< KOKKOSKERNELS_VERSION <<'\n'; @@ -105,7 +105,7 @@ void print_version(std::ostream& os) { } // namespace Impl -void print_configuration(std::ostream& os) { +inline void
print_configuration(std::ostream& os) { Impl::print_version(os); os << "TPLs: \n"; From 3ddf1dea066da98e6a84ac83a84fe372d547874c Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 7 Feb 2023 10:40:42 +0100 Subject: [PATCH 071/442] #5: Fixed clang format and removed from this PR benchmark modification benchmark modification will come through another PR --- .../src/KokkosKernels_PrintConfguration.hpp | 97 ++++++++++++------- .../Test_Common_PrintConfiguration.hpp | 17 ++-- perf_test/Benchmark_Context.hpp | 10 +- 3 files changed, 75 insertions(+), 49 deletions(-) diff --git a/common/src/KokkosKernels_PrintConfguration.hpp b/common/src/KokkosKernels_PrintConfguration.hpp index ee428dd658..c28a109225 100644 --- a/common/src/KokkosKernels_PrintConfguration.hpp +++ b/common/src/KokkosKernels_PrintConfguration.hpp @@ -25,92 +25,119 @@ namespace KokkosKernels { namespace Impl { inline void print_enabled_tpls(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK - os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACK: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - os << " " << "KOKKOSKERNELS_ENABLE_TPL_BLAS: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_BLAS: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_BLAS: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_BLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CBLAS - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CBLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACKE - os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: no\n"; + os
<< " " + << "KOKKOSKERNELS_ENABLE_TPL_LAPACKE: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_SUPERLU - os << " " << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_SUPERLU: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CHOLMOD - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CHOLMOD: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - os << " " << "KOKKOSKERNELS_ENABLE_TPL_MKL: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_MKL: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_MKL: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_MKL: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS - os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCPARSE - os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: no\n"; + os << " " 
+ << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_METIS os << "KOKKOSKERNELS_ENABLE_TPL_METIS: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_METIS: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_METIS: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ARMPL - os << " " << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ARMPL: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA - os << " " << "KOKKOSKERNELS_ENABLE_TPL_MAGMA: yes\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_MAGMA: yes\n"; #else - os << " " << "KOKKOSKERNELS_ENABLE_TPL_MAGMA: no\n"; + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_MAGMA: no\n"; #endif - } inline void print_version(std::ostream& os) { - // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros - // are not available in Kernels - os << " "<<"Kernels Version: "<< KOKKOSKERNELS_VERSION <<'\n'; + // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros + // are not available in Kernels + os << " " + << "Kernels Version: " << KOKKOSKERNELS_VERSION << '\n'; } } // namespace Impl inline void print_configuration(std::ostream& os) { - Impl::print_version(os); + Impl::print_version(os); - os << "TPLs: \n"; - Impl::print_enabled_tpls(os); + os << "TPLs: \n"; + Impl::print_enabled_tpls(os); } } // namespace KokkosKernels -#endif // _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP +#endif // _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp index 8cec466217..ac85734145 100644 --- a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -27,7 +27,7 @@ void check_print_configuration(std::ostream& os) { std::ostringstream msg; KokkosKernels::print_configuration(msg); - bool 
kernelsVersionKeyFound = false; + bool kernelsVersionKeyFound = false; bool enabledTPLsNamesKeyFound = false; // Iterate over lines returned from kokkos and extract key:value pairs std::stringstream ss{msg.str()}; @@ -36,18 +36,17 @@ void check_print_configuration(std::ostream& os) { if (found != std::string::npos) { auto currentKey = line.substr(0, found); if (currentKey == " Kernels Version") { - kernelsVersionKeyFound = true; - } - else if (currentKey == "TPLs") { - enabledTPLsNamesKeyFound = true; + kernelsVersionKeyFound = true; + } else if (currentKey == "TPLs") { + enabledTPLsNamesKeyFound = true; } } } EXPECT_TRUE(kernelsVersionKeyFound && enabledTPLsNamesKeyFound); - } -/// \brief Verify that print_configuration print the expected keys from kernels configuration +/// \brief Verify that print_configuration print the expected keys from kernels +/// configuration template void testPrintConfiguration() { std::ostringstream out; @@ -55,6 +54,8 @@ void testPrintConfiguration() { check_print_configuration(out); } -TEST_F(TestCategory, common_print_configuration) { testPrintConfiguration(); } +TEST_F(TestCategory, common_print_configuration) { + testPrintConfiguration(); +} #endif // KOKKOSKERNELS_PRINTCONFIGURATION_HPP diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index f9ce7bb744..0ef4910cc5 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -24,7 +24,6 @@ #include #include -#include namespace KokkosKernelsBenchmark { @@ -42,12 +41,11 @@ std::string remove_unwanted_characters(std::string str) { return str.substr(from, to + 1); } -/// \brief Extract all key:value pairs from kernels configuration and add it to +/// \brief Extract all key:value pairs from kokkos configuration and add it to /// the benchmark context -void add_kernels_configuration(bool verbose) { +void add_kokkos_configuration(bool verbose) { std::ostringstream msg; Kokkos::print_configuration(msg, verbose); - 
KokkosKernels::print_configuration(msg); // Iterate over lines returned from kokkos and extract key:value pairs std::stringstream ss{msg.str()}; @@ -66,8 +64,8 @@ void add_kernels_configuration(bool verbose) { /// \brief Gather all context information and add it to benchmark context data void add_benchmark_context(bool verbose = false) { - // Add Kokkos and kernerls configuration to benchmark context data - add_kernels_configuration(verbose); + // Add Kokkos configuration to benchmark context data + add_kokkos_configuration(verbose); } } // namespace KokkosKernelsBenchmark From d12158be65d641dfa6eaf4957bb14140fbf28211 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 7 Feb 2023 15:32:41 +0100 Subject: [PATCH 072/442] #5: Fixed mistake in filename and updated Kernels version key --- ...p => KokkosKernels_PrintConfiguration.hpp} | 2 +- .../Test_Common_PrintConfiguration.hpp | 24 ++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) rename common/src/{KokkosKernels_PrintConfguration.hpp => KokkosKernels_PrintConfiguration.hpp} (98%) diff --git a/common/src/KokkosKernels_PrintConfguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp similarity index 98% rename from common/src/KokkosKernels_PrintConfguration.hpp rename to common/src/KokkosKernels_PrintConfiguration.hpp index c28a109225..56a8129ab3 100644 --- a/common/src/KokkosKernels_PrintConfguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -127,7 +127,7 @@ inline void print_version(std::ostream& os) { // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros // are not available in Kernels os << " " - << "Kernels Version: " << KOKKOSKERNELS_VERSION << '\n'; + << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION << '\n'; } } // namespace Impl diff --git a/common/unit_test/Test_Common_PrintConfiguration.hpp b/common/unit_test/Test_Common_PrintConfiguration.hpp index ac85734145..07a55e152b 100644 --- 
a/common/unit_test/Test_Common_PrintConfiguration.hpp +++ b/common/unit_test/Test_Common_PrintConfiguration.hpp @@ -17,16 +17,14 @@ /// \file Test_Common_PrintConfiguration.hpp /// \brief Tests for print configuration -#ifndef KOKKOSKERNELS_PRINTCONFIGURATION_HPP -#define KOKKOSKERNELS_PRINTCONFIGURATION_HPP +#ifndef KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP +#define KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP -#include "KokkosKernels_PrintConfguration.hpp" - -/// \brief Verify that all keys from kernels configuration and check their value -void check_print_configuration(std::ostream& os) { - std::ostringstream msg; - KokkosKernels::print_configuration(msg); +#include "KokkosKernels_PrintConfiguration.hpp" +/// \brief Verify that all keys from kernels configuration and check their +/// values +void check_print_configuration(const std::ostringstream& msg) { bool kernelsVersionKeyFound = false; bool enabledTPLsNamesKeyFound = false; // Iterate over lines returned from kokkos and extract key:value pairs @@ -35,7 +33,7 @@ void check_print_configuration(std::ostream& os) { auto found = line.find_first_of(':'); if (found != std::string::npos) { auto currentKey = line.substr(0, found); - if (currentKey == " Kernels Version") { + if (currentKey == " KokkosKernels Version") { kernelsVersionKeyFound = true; } else if (currentKey == "TPLs") { enabledTPLsNamesKeyFound = true; @@ -45,10 +43,14 @@ void check_print_configuration(std::ostream& os) { EXPECT_TRUE(kernelsVersionKeyFound && enabledTPLsNamesKeyFound); } -/// \brief Verify that print_configuration print the expected keys from kernels +/// \brief Verify that print_configuration prints the expected keys from Kernels /// configuration template void testPrintConfiguration() { + // First, print this to cout in order to see what it looks like + KokkosKernels::print_configuration(std::cout); + // Then, run the actual test which prints the string to "out" and verifies + // that out has meet some expected behavior std::ostringstream 
out; KokkosKernels::print_configuration(out); check_print_configuration(out); @@ -58,4 +60,4 @@ TEST_F(TestCategory, common_print_configuration) { testPrintConfiguration(); } -#endif // KOKKOSKERNELS_PRINTCONFIGURATION_HPP +#endif // KOKKOSKERNELS_PRINTCONFIGURATIONTEST_HPP From 7f579fb5ce1c21912cf4f6ed94d1d4aa4ef7907a Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 14 Feb 2023 09:43:52 +0100 Subject: [PATCH 073/442] #5 rebased on develop and updated print_version method for kernels --- common/src/KokkosKernels_PrintConfiguration.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index 56a8129ab3..a7ebc01bce 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -127,7 +127,9 @@ inline void print_version(std::ostream& os) { // KOKKOSKERNELS_VERSION is used because MAJOR, MINOR and PATCH macros // are not available in Kernels os << " " - << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION << '\n'; + << "KokkosKernels Version: " << KOKKOSKERNELS_VERSION_MAJOR << "." + << KOKKOSKERNELS_VERSION_MINOR << "." 
<< KOKKOSKERNELS_VERSION_PATCH + << '\n'; } } // namespace Impl From 2140e99b098be7830b07242e356f21ce2388a132 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 14 Feb 2023 18:05:58 +0100 Subject: [PATCH 074/442] #5 Fixed typo --- common/src/KokkosKernels_PrintConfiguration.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index a7ebc01bce..99775f388c 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -94,12 +94,12 @@ inline void print_enabled_tpls(std::ostream& os) { os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: no\n"; #endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE os << " " - << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: yes\n"; + << "KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE: yes\n"; #else os << " " - << "KOKKOSKERNELS_ENABLE_TPL_ROCPARSE: no\n"; + << "KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE: no\n"; #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_METIS os << "KOKKOSKERNELS_ENABLE_TPL_METIS: yes\n"; From d76e8e18a4d35adba52e50bce1123bed084457e5 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 1 Dec 2022 18:00:34 -0700 Subject: [PATCH 075/442] BLAS: mixed gemm Adding a reproducer for an issue observed by Sierra developers when gemm is called on matrices with complex and real values in the same multiplication. In quite a few place the check performed in unit-tests are not computing results in the expected type leading to issues with round-off errors when compared to internal implementation. 
--- blas/src/KokkosBlas1_dot.hpp | 8 +- blas/src/KokkosBlas1_nrm1.hpp | 16 +-- blas/src/KokkosBlas1_nrm2_squared.hpp | 5 +- blas/src/KokkosBlas1_nrm2w_squared.hpp | 3 +- blas/unit_test/Test_Blas1_axpby.hpp | 37 +++--- blas/unit_test/Test_Blas1_axpy.hpp | 6 +- blas/unit_test/Test_Blas1_dot.hpp | 75 +++++------ blas/unit_test/Test_Blas1_iamax.hpp | 44 +++---- blas/unit_test/Test_Blas1_mult.hpp | 114 ++++++++--------- blas/unit_test/Test_Blas1_nrm1.hpp | 22 ++-- blas/unit_test/Test_Blas1_nrm2.hpp | 46 +++---- blas/unit_test/Test_Blas1_nrm2w.hpp | 46 +++---- blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 46 +++---- blas/unit_test/Test_Blas1_nrminf.hpp | 44 +++---- blas/unit_test/Test_Blas1_reciprocal.hpp | 80 ++++++------ blas/unit_test/Test_Blas1_scal.hpp | 87 ++++++------- blas/unit_test/Test_Blas1_sum.hpp | 46 +++---- blas/unit_test/Test_Blas1_team_update.hpp | 4 +- blas/unit_test/Test_Blas1_update.hpp | 117 ++++++++++-------- blas/unit_test/Test_Blas2_gemv.hpp | 60 ++++----- blas/unit_test/Test_Blas3_gemm.hpp | 34 ++--- .../KokkosSparse_par_ilut_numeric_impl.hpp | 8 +- sparse/unit_test/Test_Sparse_spmv.hpp | 4 +- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 18 ++- 24 files changed, 490 insertions(+), 480 deletions(-) diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index aec666cd13..4a5a18b976 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -75,11 +75,11 @@ dot(const XVector& x, const YVector& y) { using result_type = typename KokkosBlas::Impl::DotAccumulatingScalar::type; using RVector_Internal = - Kokkos::View>; + Kokkos::View>; using RVector_Result = - Kokkos::View>; + Kokkos::View>; result_type result{}; RVector_Result R = RVector_Result(&result); diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index 2377224b5f..62f373d7b8 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -39,17 +39,17 @@ nrm1(const XVector& x) { static_assert(XVector::rank == 1, 
"KokkosBlas::nrm1: " "Both Vector inputs must have rank 1."); - using mag_type= typename Kokkos::Details::InnerProductSpaceTraits< + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type; - using XVector_Internal = Kokkos::View::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits >; + using XVector_Internal = Kokkos::View< + typename XVector::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, Kokkos::MemoryTraits >; - using RVector_Internal = Kokkos::View >; + using RVector_Internal = + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result); diff --git a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index 2bd0fe15c6..3a584c8a99 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -49,8 +49,7 @@ nrm2_squared(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -111,7 +110,7 @@ void nrm2_squared( typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; using UnifiedRVLayout = typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - RV, Kokkos::LayoutRight>::array_layout; + RV, UnifiedXLayout>::array_layout; // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index e7333dc173..a65dad9b0f 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -50,8 +50,7 @@ nrm2w_squared(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index b81926b3cb..ac053366da 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -1,4 +1,4 @@ -//@HEADER +//@HEADERA // ************************************************************************ // // Kokkos v. 4.0 @@ -23,27 +23,28 @@ namespace Test { template void impl_test_axpby(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; - typedef Kokkos::View< + using BaseTypeA = Kokkos::View< ScalarA * [2], typename std::conditional::value, Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< + Device>; + using BaseTypeB = Kokkos::View< ScalarB * [2], typename std::conditional::value, Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + Device>; - ScalarA a = 3; - ScalarB b = 5; - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + ScalarA a = 3; + ScalarB b = 5; + // eps should probably be based on ScalarB since that is the type + // in which the result is computed. + const double eps = Kokkos::ArithTraits:: + epsilon(); // std::is_same::value ? 
2 * 1e-5 : 1e-7; BaseTypeA b_x("X", N); BaseTypeB b_y("Y", N); @@ -85,7 +86,8 @@ void impl_test_axpby(int N) { KokkosBlas::axpby(a, x, b, y); Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) + b * h_b_org_y(i, 0), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), + h_y(i), eps); } Kokkos::deep_copy(b_y, b_org_y); @@ -93,7 +95,8 @@ void impl_test_axpby(int N) { KokkosBlas::axpby(a, c_x, b, y); Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) + b * h_b_org_y(i, 0), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), + h_y(i), eps); } } @@ -156,7 +159,8 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + b * h_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), + h_y(i, j), eps); } } @@ -166,7 +170,8 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + b * h_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), + h_y(i, j), eps); } } } diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 91395b60fb..24a2886ce5 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -154,7 +154,8 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + h_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + h_org_y(i, j)), + h_y(i, j), eps); } } @@ -163,7 +164,8 @@ void impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + h_org_y(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + 
h_org_y(i, j)), + h_y(i, j), eps); } } } diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index 7b49ccc7de..b2dfc1bd41 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -187,24 +187,24 @@ int test_dot() { // Removing the layout stride test as ViewTypeA a("a", N); // is invalid since the view constructor needs a stride object! -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_dot(0); - Test::impl_test_dot(13); - Test::impl_test_dot(1024); - // Test::impl_test_dot(132231); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_dot(1024); - Test::impl_test_dot(1024); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_dot(0); + Test::impl_test_dot(13); + Test::impl_test_dot(1024); + // Test::impl_test_dot(132231); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_dot(1024); + Test::impl_test_dot(1024); + #endif + */ return 1; } @@ -237,25 +237,26 @@ int test_dot_mv() { // Removing the layout stride test as ViewTypeA a("a", N); // is invalid since the view constructor needs a stride object! 
-/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_dot_mv(0, 5); - Test::impl_test_dot_mv(13, 5); - Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(789, 1); - // Test::impl_test_dot_mv(132231,5); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(1024, 5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; typedef Kokkos::View + view_type_b_ls; Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); Test::impl_test_dot_mv(789, 1); + // Test::impl_test_dot_mv(132231,5); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(1024, 5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index efa5b3da67..ced1759301 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -240,17 +240,17 @@ int test_iamax() { // Test::impl_test_iamax(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_iamax(0); - Test::impl_test_iamax(13); - Test::impl_test_iamax(1024); - // Test::impl_test_iamax(132231); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_iamax(0); + 
Test::impl_test_iamax(13); + Test::impl_test_iamax(1024); + // Test::impl_test_iamax(132231); + #endif + */ return 1; } @@ -277,17 +277,17 @@ int test_iamax_mv() { // Test::impl_test_iamax_mv(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_iamax_mv(0, 5); - Test::impl_test_iamax_mv(13, 5); - Test::impl_test_iamax_mv(1024, 5); - // Test::impl_test_iamax_mv(132231,5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; Test::impl_test_iamax_mv(0, 5); + Test::impl_test_iamax_mv(13, 5); + Test::impl_test_iamax_mv(1024, 5); + // Test::impl_test_iamax_mv(132231,5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index e438ae370a..3c027f26e7 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -74,21 +74,24 @@ void impl_test_mult(int N) { KokkosBlas::mult(b, z, a, x, y); Kokkos::deep_copy(h_z, z); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) * h_y(i) + b * h_b_org_z(i), h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), + h_z(i), eps); } Kokkos::deep_copy(z, b_org_z); KokkosBlas::mult(b, z, a, x, c_y); Kokkos::deep_copy(h_z, z); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) * h_y(i) + b * h_b_org_z(i), h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), + h_z(i), eps); } Kokkos::deep_copy(z, b_org_z); KokkosBlas::mult(b, z, a, c_x, c_y); Kokkos::deep_copy(h_z, z); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) * h_y(i) + b * h_b_org_z(i), h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), + h_z(i), eps); } } @@ -157,8 +160,9 @@ 
void impl_test_mult_mv(int N, int K) { Kokkos::deep_copy(h_b_z, b_z); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j), h_z(i, j), - eps); + EXPECT_NEAR_KK( + static_cast(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j)), + h_z(i, j), eps); } } @@ -167,8 +171,9 @@ void impl_test_mult_mv(int N, int K) { Kokkos::deep_copy(h_b_z, b_z); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j), h_z(i, j), - eps); + EXPECT_NEAR_KK( + static_cast(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j)), + h_z(i, j), eps); } } } @@ -208,31 +213,27 @@ int test_mult() { // Device>(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult( - 0); - Test::impl_test_mult( - 13); - Test::impl_test_mult( - 1024); - // Test::impl_test_mult(132231); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult( - 1024); - Test::impl_test_mult( - 1024); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_mult( 0); Test::impl_test_mult( 13); Test::impl_test_mult( 1024); + // Test::impl_test_mult(132231); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult( 1024); Test::impl_test_mult( 1024); #endif + */ return 1; } @@ -271,31 +272,30 @@ int test_mult_mv() { // Device>(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult_mv(0, 5); - Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); - // Test::impl_test_mult_mv(132231,5); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult_mv(1024, 5); - Test::impl_test_mult_mv(1024, 5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View + view_type_b_ls; typedef Kokkos::View + view_type_c_ls; Test::impl_test_mult_mv(0, 5); Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); + // Test::impl_test_mult_mv(132231,5); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(1024, 5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index ca43988937..b64aab9c3c 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -143,17 +143,17 @@ int test_nrm1() { Test::impl_test_nrm1(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm1(0); - Test::impl_test_nrm1(13); - Test::impl_test_nrm1(1024); - Test::impl_test_nrm1(132231); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm1(0); + Test::impl_test_nrm1(13); + Test::impl_test_nrm1(1024); + 
Test::impl_test_nrm1(132231); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index 3d6d419e91..d17c9af505 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -139,17 +139,17 @@ int test_nrm2() { // Test::impl_test_nrm2(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2(0); - Test::impl_test_nrm2(13); - Test::impl_test_nrm2(1024); - // Test::impl_test_nrm2(132231); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2(0); + Test::impl_test_nrm2(13); + Test::impl_test_nrm2(1024); + // Test::impl_test_nrm2(132231); + #endif + */ return 1; } @@ -178,18 +178,18 @@ int test_nrm2_mv() { // Test::impl_test_nrm2_mv(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2_mv(0, 5); - Test::impl_test_nrm2_mv(13, 5); - Test::impl_test_nrm2_mv(1024, 5); - Test::impl_test_nrm2_mv(789, 1); - // Test::impl_test_nrm2_mv(132231,5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; Test::impl_test_nrm2_mv(0, 5); + Test::impl_test_nrm2_mv(13, 5); + Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); + // Test::impl_test_nrm2_mv(132231,5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index aade8b6bcd..b91c5fbf78 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp 
+++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -137,17 +137,17 @@ int test_nrm2w() { // Test::impl_test_nrm2(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w(0); - Test::impl_test_nrm2w(13); - Test::impl_test_nrm2w(1024); - // Test::impl_test_nrm2(132231); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); + #endif + */ return 1; } @@ -176,18 +176,18 @@ int test_nrm2w_mv() { // Test::impl_test_nrm2w_mv(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w_mv(0, 5); - Test::impl_test_nrm2w_mv(13, 5); - Test::impl_test_nrm2w_mv(1024, 5); - Test::impl_test_nrm2w_mv(789, 1); - // Test::impl_test_nrm2w_mv(132231,5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 387e313443..59661cc7e5 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -133,17 +133,17 @@ int test_nrm2w_squared() { // Test::impl_test_nrm2(132231); #endif -/* -#if 
defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w_squared(0); - Test::impl_test_nrm2w_squared(13); - Test::impl_test_nrm2w_squared(1024); - // Test::impl_test_nrm2(132231); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); + #endif + */ return 1; } @@ -172,18 +172,18 @@ int test_nrm2w_squared_mv() { // Test::impl_test_nrm2w_squared_mv(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w_squared_mv(0, 5); - Test::impl_test_nrm2w_squared_mv(13, 5); - Test::impl_test_nrm2w_squared_mv(1024, 5); - Test::impl_test_nrm2w_squared_mv(789, 1); - // Test::impl_test_nrm2w_squared_mv(132231,5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; Test::impl_test_nrm2w_squared_mv(0, + 5); Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 6d42ef1486..8da5550afa 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -137,17 +137,17 @@ int test_nrminf() { // Test::impl_test_nrminf(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ 
- (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrminf(0); - Test::impl_test_nrminf(13); - Test::impl_test_nrminf(1024); - // Test::impl_test_nrminf(132231); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrminf(0); + Test::impl_test_nrminf(13); + Test::impl_test_nrminf(1024); + // Test::impl_test_nrminf(132231); + #endif + */ return 1; } @@ -174,17 +174,17 @@ int test_nrminf_mv() { // Test::impl_test_nrminf_mv(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrminf_mv(0, 5); - Test::impl_test_nrminf_mv(13, 5); - Test::impl_test_nrminf_mv(1024, 5); - // Test::impl_test_nrminf_mv(132231,5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; Test::impl_test_nrminf_mv(0, 5); + Test::impl_test_nrminf_mv(13, 5); + Test::impl_test_nrminf_mv(1024, 5); + // Test::impl_test_nrminf_mv(132231,5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index 1a2aebf782..257429ac0d 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -212,24 +212,24 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_reciprocal(0); - 
Test::impl_test_reciprocal(13); - Test::impl_test_reciprocal(1024); - // Test::impl_test_reciprocal(132231); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_reciprocal(1024); - Test::impl_test_reciprocal(1024); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_reciprocal(0); + Test::impl_test_reciprocal(13); + Test::impl_test_reciprocal(1024); + // Test::impl_test_reciprocal(132231); #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal(1024); + Test::impl_test_reciprocal(1024); + #endif + */ return 1; } @@ -262,28 +262,28 @@ int test_reciprocal_mv() { // Device>(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_reciprocal_mv(0, 5); - Test::impl_test_reciprocal_mv(13, 5); - Test::impl_test_reciprocal_mv(1024, - 5); - // Test::impl_test_reciprocal_mv(132231,5); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_reciprocal_mv(1024, - 5); - Test::impl_test_reciprocal_mv(1024, - 5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; typedef Kokkos::View + view_type_b_ls; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); Test::impl_test_reciprocal_mv(1024, 5); + // Test::impl_test_reciprocal_mv(132231,5); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal_mv(1024, + 5); + Test::impl_test_reciprocal_mv(1024, + 5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 2f3fce5d03..1c572073a5 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -61,14 +61,14 @@ void impl_test_scal(int N) { KokkosBlas::scal(y, a, x); Kokkos::deep_copy(h_y, y); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i)), h_y(i), eps); } Kokkos::deep_copy(y, org_y); KokkosBlas::scal(y, a, c_x); Kokkos::deep_copy(h_y, y); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i)), h_y(i), eps); } } @@ -128,7 +128,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i, j)), h_y(i, j), eps); } } @@ -137,7 +137,7 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * h_x(i, j)), h_y(i, j), eps); } } @@ -156,7 +156,8 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(h_params(j) * h_x(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * h_x(i, j)), h_y(i, j), + eps); } } @@ -165,7 +166,8 @@ void impl_test_scal_mv(int N, int K) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(h_params(j) * h_x(i, j), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * h_x(i, j)), h_y(i, j), + eps); } } } @@ -195,24 +197,24 @@ int test_scal() { // Test::impl_test_scal(132231); #endif 
-/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_scal(0); - Test::impl_test_scal(13); - Test::impl_test_scal(1024); - // Test::impl_test_scal(132231); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_scal(1024); - Test::impl_test_scal(1024); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_scal(0); + Test::impl_test_scal(13); + Test::impl_test_scal(1024); + // Test::impl_test_scal(132231); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_scal(1024); + Test::impl_test_scal(1024); + #endif + */ return 1; } @@ -241,24 +243,25 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_scal_mv(0, 5); - Test::impl_test_scal_mv(13, 5); - Test::impl_test_scal_mv(1024, 5); - // Test::impl_test_scal_mv(132231,5); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_scal_mv(1024, 5); - Test::impl_test_scal_mv(1024, 5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; typedef Kokkos::View + view_type_b_ls; Test::impl_test_scal_mv(0, 5); Test::impl_test_scal_mv(13, 5); 
Test::impl_test_scal_mv(1024, 5); + // Test::impl_test_scal_mv(132231,5); #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_scal_mv(1024, 5); + Test::impl_test_scal_mv(1024, 5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index 0cb3626987..4472f8d204 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -128,17 +128,17 @@ int test_sum() { // Test::impl_test_sum(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_sum(0); - Test::impl_test_sum(13); - Test::impl_test_sum(1024); - // Test::impl_test_sum(132231); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_sum(0); + Test::impl_test_sum(13); + Test::impl_test_sum(1024); + // Test::impl_test_sum(132231); + #endif + */ return 1; } @@ -167,18 +167,18 @@ int test_sum_mv() { // Test::impl_test_sum_mv(132231,5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_sum_mv(0, 5); - Test::impl_test_sum_mv(13, 5); - Test::impl_test_sum_mv(1024, 5); - Test::impl_test_sum_mv(789, 1); - // Test::impl_test_sum_mv(132231,5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; Test::impl_test_sum_mv(0, 5); + Test::impl_test_sum_mv(13, 5); + Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); + // Test::impl_test_sum_mv(132231,5); 
+ #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index efb743bc0a..cf118e7ba2 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -102,8 +102,8 @@ void impl_test_team_update(int N) { ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarB(c * h_z(i) + a * h_x(i) + b * h_y(i)) * - ScalarB(c * h_z(i) + a * h_x(i) + b * h_y(i)); + expected_result += ScalarC(c * h_z(i) + a * h_x(i) + b * h_y(i)) * + ScalarC(c * h_z(i) + a * h_x(i) + b * h_y(i)); // KokkosBlas::update(a,x,b,y,c,z); Kokkos::parallel_for( diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index c2b5cef0bc..189dc2afb6 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -104,21 +104,27 @@ void impl_test_update(int N) { KokkosBlas::update(a, x, b, y, c, z); Kokkos::deep_copy(h_b_z, b_z); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) + b * h_y(i) + c * h_org_z(i), h_z(i), eps); + EXPECT_NEAR_KK( + static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), + eps); } Kokkos::deep_copy(b_z, b_org_z); KokkosBlas::update(a, c_x, b, y, c, z); Kokkos::deep_copy(h_b_z, b_z); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) + b * h_y(i) + c * h_org_z(i), h_z(i), eps); + EXPECT_NEAR_KK( + static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), + eps); } Kokkos::deep_copy(b_z, b_org_z); KokkosBlas::update(a, c_x, b, c_y, c, z); Kokkos::deep_copy(h_b_z, b_z); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(a * h_x(i) + b * h_y(i) + c * h_org_z(i), h_z(i), eps); + EXPECT_NEAR_KK( + static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), + eps); } } @@ -192,7 +198,8 @@ void impl_test_update_mv(int N, int K) { Kokkos::deep_copy(h_b_z, b_z); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + b * h_y(i, j) + c 
* h_b_org_z(i, j), + EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_y(i, j) + + c * h_b_org_z(i, j)), h_z(i, j), eps); } } @@ -202,7 +209,8 @@ void impl_test_update_mv(int N, int K) { Kokkos::deep_copy(h_b_z, b_z); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(a * h_x(i, j) + b * h_y(i, j) + c * h_b_org_z(i, j), + EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_y(i, j) + + c * h_b_org_z(i, j)), h_z(i, j), eps); } } @@ -243,31 +251,31 @@ int test_update() { // Device>(132231); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); - // Test::impl_test_update(132231); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update(1024); - Test::impl_test_update(1024); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); + // Test::impl_test_update(132231); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update(1024); + Test::impl_test_update(1024); + #endif + */ return 1; } @@ -306,31 +314,30 @@ int test_update_mv() { Device>(132231, 5); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View 
view_type_c_ls; - Test::impl_test_update_mv(0, 5); - Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(1024, 5); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; typedef Kokkos::View + view_type_b_ls; typedef Kokkos::View + view_type_c_ls; Test::impl_test_update_mv(0, 5); Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(1024, 5); + #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index 8d3ce380fb..1df115d2c3 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -203,39 +203,33 @@ int test_gemv(const char* mode) { // Device>(mode,132231,1024); #endif -/* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_gemv( - mode, 0, 1024); - Test::impl_test_gemv( - mode, 1024, 0); - Test::impl_test_gemv( - mode, 13, 13); - Test::impl_test_gemv( - mode, 13, 1024); - Test::impl_test_gemv( - mode, 50, 40); - Test::impl_test_gemv( - mode, 1024, 1024); - Test::impl_test_gemv( - mode, 2131, 2131); - // Test::impl_test_gemv(mode,132231,1024); -#endif - -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_gemv( - mode, 
1024, 1024); - Test::impl_test_gemv( - mode, 1024, 1024); -#endif -*/ + /* + #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View + view_type_a_ls; typedef Kokkos::View + view_type_b_ls; typedef Kokkos::View + view_type_c_ls; Test::impl_test_gemv( mode, 0, 1024); Test::impl_test_gemv( mode, 1024, 0); + Test::impl_test_gemv( mode, 13, 13); Test::impl_test_gemv( mode, 13, 1024); Test::impl_test_gemv( mode, 50, 40); + Test::impl_test_gemv( mode, 1024, 1024); Test::impl_test_gemv( mode, 2131, 2131); + // Test::impl_test_gemv(mode,132231,1024); + #endif + + #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_gemv( mode, 1024, 1024); Test::impl_test_gemv( mode, 1024, 1024); #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index d3d34d60e3..b72c6ef2fe 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -396,19 +396,22 @@ void test_gemm_enabled_layouts() { template void test_gemm_mixed_scalars() { - using CMatrix = Kokkos::View; - using BMatrix = Kokkos::View; - using AMatrix = Kokkos::View; + using Matrix1 = Kokkos::View; + using Matrix2 = Kokkos::View; - AMatrix A("A", 10, 10); - BMatrix B("B", 10, 10); - CMatrix C("C", 10, 10); + const int dim1 = 400, dim2 = 1000; + + Matrix1 A("A", dim1, dim1); + Matrix1 B("B", dim2, dim2); + Matrix1 C("C", dim2, dim1); + Matrix2 D("D", dim2, dim1); Kokkos::deep_copy(A, Kokkos::ArithTraits::one()); Kokkos::deep_copy(B, Kokkos::ArithTraits::one()); Kokkos::deep_copy(C, Kokkos::ArithTraits::one()); - KokkosBlas::gemm(TestExecSpace(), "N", "N", 1.0, C, A, 0.0, B); + KokkosBlas::gemm(TestExecSpace(), "N", "N", 1.0, D, A, 0.0, C); + KokkosBlas::gemm(TestExecSpace(), "N", "T", 1.0, C, D, 0.0, B); } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ @@ -451,23 +454,22 @@ TEST_F(TestCategory, 
gemm_complex_float) { } #endif -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) && \ + !defined(KOKKOSKERNELS_ETI_ONLY) TEST_F(TestCategory, gemm_mixed_scalars_complex_double_double) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_double_double"); + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::gemm_mixed_complex_double_double"); test_gemm_mixed_scalars, double>(); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) && \ + !defined(KOKKOSKERNELS_ETI_ONLY) TEST_F(TestCategory, gemm_mixed_scalar_complex_float_float) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemm_mixed_complex_float_float"); + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::gemm_mixed_complex_float_float"); test_gemm_mixed_scalars, float>(); Kokkos::Profiling::popRegion(); } #endif - diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 4ccdf7b07e..aa8af73d69 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -31,8 +31,6 @@ #include -//#define NUMERIC_OUTPUT_INFO - namespace KokkosSparse { namespace Impl { namespace Experimental { @@ -931,8 +929,10 @@ struct IlutWrap { const index_t l_nnz = L_new_values.extent(0); const index_t u_nnz = U_new_values.extent(0); - const auto l_filter_rank = std::max(0, l_nnz - l_nnz_limit - 1); - const auto u_filter_rank = std::max(0, u_nnz - u_nnz_limit - 1); + const auto l_filter_rank = + std::max(static_cast(0), l_nnz - l_nnz_limit - 1); + const auto u_filter_rank = + std::max(static_cast(0), u_nnz - u_nnz_limit - 1); const auto l_threshold = threshold_select(L_new_values, 
l_filter_rank, V_copy); diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 8c4dc6a3c5..d0a6d1464c 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -1000,7 +1000,7 @@ void test_github_issue_101() { constexpr double ONE_d = static_cast(1.0); constexpr double TWO_d = static_cast(2.0); - double_matrix_type A_d("A_d", G); + double_matrix_type A_d("A_d", G, numCols); auto A_d_val_h = Kokkos::create_mirror_view(A_d.values); A_d_val_h[0] = ONE_d; // This cast is deliberate; we want to use float eps here, but as @@ -1048,7 +1048,7 @@ void test_github_issue_101() { constexpr float TWO_f = static_cast(2.0); constexpr double ZERO_d = static_cast(0.0); - float_matrix_type A_f("A_f", G); + float_matrix_type A_f("A_f", G, numCols); auto A_f_val_h = Kokkos::create_mirror_view(A_f.values); A_f_val_h[0] = ONE_f; A_f_val_h[1] = EPS_f / TWO_f; diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index b9e5334abf..ccbcb21301 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -513,11 +513,10 @@ void testSpMVBsrMatrix() { Test_Bsr::check_bsrm_times_v( &mode, alpha_s, beta_s, bMax, num_errors); if (num_errors > 0) { - printf( - "KokkosSparse::Test::spmv_bsr: %i errors of %i with params: " - "%c %lf %lf\n", - num_errors, bMax, mode, Kokkos::ArithTraits::abs(alpha_s), - Kokkos::ArithTraits::abs(beta_s)); + std::cout << "KokkosSparse::Test::spmv_bsr: " << num_errors + << " errors of %i with params: " << bMax << " " << mode << " " + << Kokkos::ArithTraits::abs(alpha_s) << " " + << Kokkos::ArithTraits::abs(beta_s) << std::endl; } EXPECT_TRUE(num_errors == 0); } @@ -555,11 +554,10 @@ void testBsrMatrix_SpM_MV() { Test_Bsr::check_bsrm_times_mv( &mode, alpha_s, beta_s, bMax, num_errors); if (num_errors > 0) { - printf( - "KokkosSparse::Test::spm_mv_bsr: %i errors of %i with params: " - "%c %lf %lf\n", - 
num_errors, bMax, mode, Kokkos::ArithTraits::abs(alpha_s), - Kokkos::ArithTraits::abs(beta_s)); + std::cout << "KokkosSparse::Test::spm_mv_bsr: " << num_errors + << " errors of " << bMax << " with params: " << mode << " " + << Kokkos::ArithTraits::abs(alpha_s) << " " + << Kokkos::ArithTraits::abs(beta_s) << std::endl; } EXPECT_TRUE(num_errors == 0); } From e9f46343985600a74222ffeae68c0a8e4064a7d3 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 14 Feb 2023 14:31:37 -0700 Subject: [PATCH 076/442] Mix Scalars: fixing the tolerance in axpby Updating the formula for the tolerance picked in the axpby test based on error analysis. --- blas/unit_test/Test_Blas1_axpby.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index ac053366da..ce1aef817d 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -43,8 +43,9 @@ void impl_test_axpby(int N) { ScalarB b = 5; // eps should probably be based on ScalarB since that is the type // in which the result is computed. - const double eps = Kokkos::ArithTraits:: - epsilon(); // std::is_same::value ? 
2 * 1e-5 : 1e-7; + const double eps = Kokkos::ArithTraits::epsilon(); + const double max_val = 10; + const double max_error = (a + b) * max_val * eps; BaseTypeA b_x("X", N); BaseTypeB b_y("Y", N); @@ -68,12 +69,12 @@ void impl_test_axpby(int N) { { ScalarA randStart, randEnd; - Test::getRandomBounds(10.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; - Test::getRandomBounds(10.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); } @@ -87,7 +88,7 @@ void impl_test_axpby(int N) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), - h_y(i), eps); + h_y(i), 2 * max_error); } Kokkos::deep_copy(b_y, b_org_y); @@ -96,7 +97,7 @@ void impl_test_axpby(int N) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), - h_y(i), eps); + h_y(i), 2 * max_error); } } From 1507de8dcae771099e42e873e609ec96090c035b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 14 Feb 2023 14:59:30 -0700 Subject: [PATCH 077/442] Mixed Scalars: modifying according to PR comments. --- blas/unit_test/Test_Blas1_axpby.hpp | 16 +++++++++------- blas/unit_test/Test_Blas3_gemm.hpp | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index ce1aef817d..b895659850 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -1,4 +1,4 @@ -//@HEADERA +//@HEADER // ************************************************************************ // // Kokkos v. 
4.0 @@ -125,6 +125,12 @@ void impl_test_axpby_mv(int N, int K) { typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + const double eps = Kokkos::ArithTraits::epsilon(); + const double max_val = 10; + ScalarA a = 3; + ScalarB b = 5; + const double max_error = (a + b) * max_val * eps; + Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -147,12 +153,8 @@ void impl_test_axpby_mv(int N, int K) { Kokkos::deep_copy(h_b_x, b_x); Kokkos::deep_copy(h_b_y, b_y); - ScalarA a = 3; - ScalarB b = 5; typename ViewTypeA::const_type c_x = x; - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - Kokkos::View r("Dot::Result", K); KokkosBlas::axpby(a, x, b, y); @@ -161,7 +163,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), eps); + h_y(i, j), 2*max_error); } } @@ -172,7 +174,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), eps); + h_y(i, j), 2*max_error); } } } diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index b72c6ef2fe..a210806929 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -406,7 +406,7 @@ void test_gemm_mixed_scalars() { Matrix1 C("C", dim2, dim1); Matrix2 D("D", dim2, dim1); - Kokkos::deep_copy(A, Kokkos::ArithTraits::one()); + Kokkos::deep_copy(A, Kokkos::ArithTraits::one()); Kokkos::deep_copy(B, Kokkos::ArithTraits::one()); Kokkos::deep_copy(C, Kokkos::ArithTraits::one()); From 92b82ef884f443c6ceb77bfd8a5cb9a29dec37fd Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 14 Feb 2023 15:08:41 -0700 Subject: [PATCH 078/442] Mixed Scalars: modifying one more test according to review comment --- blas/unit_test/Test_Blas1_axpby.hpp | 12 ++++++------ 
blas/unit_test/Test_Blas1_axpy.hpp | 30 ++++++++++++++++------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index b895659850..eb6f02ad7e 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -125,10 +125,10 @@ void impl_test_axpby_mv(int N, int K) { typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - const double eps = Kokkos::ArithTraits::epsilon(); - const double max_val = 10; - ScalarA a = 3; - ScalarB b = 5; + const double eps = Kokkos::ArithTraits::epsilon(); + const double max_val = 10; + ScalarA a = 3; + ScalarB b = 5; const double max_error = (a + b) * max_val * eps; Kokkos::Random_XorShift64_Pool rand_pool( @@ -163,7 +163,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), 2*max_error); + h_y(i, j), 2 * max_error); } } @@ -174,7 +174,7 @@ void impl_test_axpby_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), 2*max_error); + h_y(i, j), 2 * max_error); } } } diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 24a2886ce5..f3d6c82dea 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -43,8 +43,10 @@ void impl_test_axpy(int N) { using MagnitudeA = typename Kokkos::ArithTraits::mag_type; - ScalarA a = 3; - double eps = std::is_same::value ? 
2e-5 : 1e-7; + ScalarA a = 3; + const double eps = Kokkos::ArithTraits::epsilon(); + const double max_val = 10; + const double max_error = (a * max_val + max_val) * eps; BaseTypeA b_x("X", N); BaseTypeB b_y("Y", N); @@ -66,12 +68,12 @@ void impl_test_axpy(int N) { { ScalarA randStart, randEnd; - Test::getRandomBounds(10.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(x, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; - Test::getRandomBounds(10.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(y, rand_pool, randStart, randEnd); } @@ -86,7 +88,7 @@ void impl_test_axpy(int N) { for (int i = 0; i < N; i++) { ScalarB expected = a * h_x(i) + h_b_org_y(i, 0); - EXPECT_NEAR_KK(expected, h_y(i), eps); + EXPECT_NEAR_KK(expected, h_y(i), 2 * max_error); } // reset y to orig, and run again with const-valued x @@ -95,7 +97,7 @@ void impl_test_axpy(int N) { Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { ScalarB expected = a * h_x(i) + h_b_org_y(i, 0); - EXPECT_NEAR_KK(expected, h_y(i), eps); + EXPECT_NEAR_KK(expected, h_y(i), 2 * max_error); } } @@ -123,17 +125,22 @@ void impl_test_axpy_mv(int N, int K) { typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + const double eps = Kokkos::ArithTraits::epsilon(); + const double max_val = 10; + ScalarA a = 3; + const double max_error = (3 * max_val + max_val) * eps; + Kokkos::Random_XorShift64_Pool rand_pool( 13718); { ScalarA randStart, randEnd; - Test::getRandomBounds(10.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; - Test::getRandomBounds(10.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); } @@ -145,17 +152,14 @@ void 
impl_test_axpy_mv(int N, int K) { Kokkos::deep_copy(h_b_x, b_x); Kokkos::deep_copy(h_b_y, b_y); - ScalarA a = 3; typename ViewTypeA::const_type c_x = x; - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - KokkosBlas::axpy(a, x, y); Kokkos::deep_copy(h_b_y, b_y); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + h_org_y(i, j)), - h_y(i, j), eps); + h_y(i, j), 2 * max_error); } } @@ -165,7 +169,7 @@ void impl_test_axpy_mv(int N, int K) { for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + h_org_y(i, j)), - h_y(i, j), eps); + h_y(i, j), 2 * max_error); } } } From 31a756661b4c2b897e32c7857a532a00b443e61e Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 14 Feb 2023 16:31:34 -0700 Subject: [PATCH 079/442] Mixed Scalars: fixing some type conversion in unit-tests --- blas/unit_test/Test_Blas1_axpby.hpp | 32 +++++++++++++-------- blas/unit_test/Test_Blas1_axpy.hpp | 44 ++++++++++++++++------------- 2 files changed, 44 insertions(+), 32 deletions(-) diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index eb6f02ad7e..79a244fc6e 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -23,8 +23,9 @@ namespace Test { template void impl_test_axpby(int N) { - using ScalarA = typename ViewTypeA::value_type; - using ScalarB = typename ViewTypeB::value_type; + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; using BaseTypeA = Kokkos::View< ScalarA * [2], @@ -43,9 +44,12 @@ void impl_test_axpby(int N) { ScalarB b = 5; // eps should probably be based on ScalarB since that is the type // in which the result is computed. 
- const double eps = Kokkos::ArithTraits::epsilon(); - const double max_val = 10; - const double max_error = (a + b) * max_val * eps; + const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); + const MagnitudeB max_val = 10; + const MagnitudeB max_error = + (static_cast(Kokkos::ArithTraits::abs(a)) + + Kokkos::ArithTraits::abs(b)) * + max_val * eps; BaseTypeA b_x("X", N); BaseTypeB b_y("Y", N); @@ -103,8 +107,9 @@ void impl_test_axpby(int N) { template void impl_test_axpby_mv(int N, int K) { - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; @@ -125,11 +130,14 @@ void impl_test_axpby_mv(int N, int K) { typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - const double eps = Kokkos::ArithTraits::epsilon(); - const double max_val = 10; - ScalarA a = 3; - ScalarB b = 5; - const double max_error = (a + b) * max_val * eps; + ScalarA a = 3; + ScalarB b = 5; + const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); + const MagnitudeB max_val = 10; + const MagnitudeB max_error = + (static_cast(Kokkos::ArithTraits::abs(a)) + + Kokkos::ArithTraits::abs(b)) * + max_val * eps; Kokkos::Random_XorShift64_Pool rand_pool( 13718); diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index f3d6c82dea..a292108201 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -23,30 +23,30 @@ namespace Test { template void impl_test_axpy(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using MagnitudeB 
= typename Kokkos::ArithTraits::mag_type; - typedef Kokkos::View< + using BaseTypeA = Kokkos::View< ScalarA * [2], typename std::conditional::value, Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< + Device>; + using BaseTypeB = Kokkos::View< ScalarB * [2], typename std::conditional::value, Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + Device>; - using MagnitudeA = typename Kokkos::ArithTraits::mag_type; - - ScalarA a = 3; - const double eps = Kokkos::ArithTraits::epsilon(); - const double max_val = 10; - const double max_error = (a * max_val + max_val) * eps; + ScalarA a = 3; + const MagnitudeB max_val = 10; + const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); + const MagnitudeB max_error = + (static_cast(Kokkos::ArithTrairs::abs(a)) * max_val + + max_val) * + eps; BaseTypeA b_x("X", N); BaseTypeB b_y("Y", N); @@ -103,8 +103,9 @@ void impl_test_axpy(int N) { template void impl_test_axpy_mv(int N, int K) { - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; @@ -125,10 +126,13 @@ void impl_test_axpy_mv(int N, int K) { typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - const double eps = Kokkos::ArithTraits::epsilon(); - const double max_val = 10; - ScalarA a = 3; - const double max_error = (3 * max_val + max_val) * eps; + ScalarA a = 3; + const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); + const MagnitudeB max_val = 10; + const MagnitudeB max_error = + (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + + max_val) * + eps; Kokkos::Random_XorShift64_Pool rand_pool( 13718); From 1fccf4a276b3959bca3313b323f78610d54ecc80 
Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 14 Feb 2023 16:43:48 -0700 Subject: [PATCH 080/442] Mixed Scalars: fixing typo --- blas/unit_test/Test_Blas1_axpy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index a292108201..890e116584 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -44,7 +44,7 @@ void impl_test_axpy(int N) { const MagnitudeB max_val = 10; const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeB max_error = - (static_cast(Kokkos::ArithTrairs::abs(a)) * max_val + + (static_cast(Kokkos::ArithTraits::abs(a)) * max_val + max_val) * eps; From 8206953f5cbd559a043d44b3a3aec8697c662770 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 15 Feb 2023 10:11:15 -0700 Subject: [PATCH 081/442] scripts: add --disable-test-eti-only --- cm_generate_makefile.bash | 8 ++++++-- scripts/cm_test_all_sandia | 10 ++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index b98abbbfc8..3eab04694a 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -358,6 +358,7 @@ display_help_text() { # echo "--with-hpx-options=[OPT]: Additional options to HPX:" # echo " enable_async_dispatch" echo "--no-default-eti: Do not include default ETI types for Kokkos Kernels" + echo "--disable-test-eti-only: Do not restrict testing to ETI types for Kokkos Kernels" echo "--gcc-toolchain=/Path/To/GccRoot: Set the gcc toolchain to use with clang (e.g. 
/usr)" echo "--kokkos-make-j=[NUM]: Set -j parallel level for kokkos install" echo " Default: j == 4" @@ -511,6 +512,9 @@ do --no-default-eti) KERNELS_DEFAULT_ETI_OPTION="-DKokkosKernels_ADD_DEFAULT_ETI=OFF" ;; + --disable-test-eti-only) + KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION="-DKokkosKernels_TEST_ETI_ONLY=OFF" + ;; --kokkos-release) KOKKOS_RELEASE=ON ;; @@ -812,6 +816,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" 
${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} 
${KOKKOSKERNELS_PATH} diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 117da595a7..fada06b816 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -77,6 +77,8 @@ print_help() { echo "" echo "--no-default-eti: Do not include default ETI types for Kokkos Kernels" echo "" + echo "--disable-test-eti-only: Do not restrict testing to ETI types for Kokkos Kernels" + echo "" echo "--with-spaces=SPACES: Set spaces to be instantiated." echo " Options: hostspace, cudaspace, cudauvmspace" echo "" @@ -224,6 +226,7 @@ SKIP_HWLOC=False SPOT_CHECK=False NO_DEFAULT_ETI=False ENABLE_PERFTESTS=True +ENABLE_TEST_ETI_ONLY=True PRINT_HELP=False OPT_FLAG="" @@ -388,6 +391,9 @@ do --no-default-eti*) NO_DEFAULT_ETI=True ;; + --disable-test-eti-only*) + ENABLE_TEST_ETI_ONLY=False + ;; --disable-perftests*) ENABLE_PERFTESTS=False ;; @@ -1173,6 +1179,10 @@ single_build_and_test() { local extra_args="$extra_args --no-default-eti" fi + if [ "${ENABLE_TEST_ETI_ONLY}" = "False" ]; then + local extra_args="$extra_args --disable-test-eti-only" + fi + if [ "${ENABLE_PERFTESTS}" = "False" ]; then local extra_args="$extra_args --disable-perftests" fi From 45ffc084975aa26fe9bc59e6b4ca732ca1b1379f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 16 Feb 2023 13:44:46 -0700 Subject: [PATCH 082/442] Versions: fixing the CMake logic to export Kokkos Kernels version The logic was hidden behind !KOKKOSKERNELS_HAS_TRILINOS which of course makes some tests fail when they build within Trilinos... 
--- CMakeLists.txt | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d75a45499c..72c00118da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,17 @@ GET_DIRECTORY_PROPERTY(KOKKOSKERNELS_HAS_PARENT PARENT_DIRECTORY) SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +SET(KokkosKernels_VERSION_MAJOR 4) +SET(KokkosKernels_VERSION_MINOR 0) +SET(KokkosKernels_VERSION_PATCH 99) +SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + +#Set variables for config file +MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") +MATH(EXPR KOKKOSKERNELS_VERSION_MAJOR "${KOKKOSKERNELS_VERSION} / 10000") +MATH(EXPR KOKKOSKERNELS_VERSION_MINOR "${KOKKOSKERNELS_VERSION} / 100 % 100") +MATH(EXPR KOKKOSKERNELS_VERSION_PATCH "${KOKKOSKERNELS_VERSION} % 100") + IF(NOT KOKKOSKERNELS_HAS_TRILINOS) cmake_minimum_required(VERSION 3.16 FATAL_ERROR) IF (Spack_WORKAROUND) @@ -23,16 +34,6 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) IF(NOT DEFINED ${PROJECT_NAME}) PROJECT(KokkosKernels CXX) ENDIF() - SET(KokkosKernels_VERSION_MAJOR 4) - SET(KokkosKernels_VERSION_MINOR 0) - SET(KokkosKernels_VERSION_PATCH 99) - SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") - - #Set variables for config file - MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") - MATH(EXPR KOKKOSKERNELS_VERSION_MAJOR "${KOKKOSKERNELS_VERSION} / 10000") - MATH(EXPR KOKKOSKERNELS_VERSION_MINOR "${KOKKOSKERNELS_VERSION} / 100 % 100") - MATH(EXPR KOKKOSKERNELS_VERSION_PATCH "${KOKKOSKERNELS_VERSION} % 100") ENDIF() INCLUDE(GNUInstallDirs) From 
557e62a67fac41f2eb1af3a805736dcb55b31c9c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 21 Feb 2023 10:44:34 -0700 Subject: [PATCH 083/442] Test mixed scalars: more fixes related to mixed scalar tests Mostly fixing issues that stemmed from not running these tests on a regular basis, for instance generating FPE because of changes in how random inputs are generated. Test mixed scalars: applying clang-format --- blas/unit_test/Test_Blas1_nrm2w.hpp | 42 +++++---- blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 39 ++++---- blas/unit_test/Test_Blas1_reciprocal.hpp | 98 ++++++++------------- blas/unit_test/Test_Blas1_team_mult.hpp | 67 +++++++++----- blas/unit_test/Test_Blas1_team_update.hpp | 2 +- blas/unit_test/Test_Blas2_gemv.hpp | 39 +++++--- 6 files changed, 158 insertions(+), 129 deletions(-) diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index b91c5fbf78..ca5714db1c 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -22,8 +22,9 @@ namespace Test { template void impl_test_nrm2w(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; ViewTypeA a("A", N); ViewTypeA w("W", N); @@ -31,19 +32,22 @@ void impl_test_nrm2w(int N) { typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + const MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = + max_val * std::sqrt(static_cast(N)) * eps; + Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(a, rand_pool, randStart, randEnd); - Kokkos::fill_random(w, rand_pool, randStart, randEnd); +
Kokkos::fill_random(w, rand_pool, AT::one(), randEnd); // Avoid divide by 0 Kokkos::deep_copy(h_a, a); Kokkos::deep_copy(h_w, w); - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); @@ -53,15 +57,16 @@ void impl_test_nrm2w(int N) { Kokkos::ArithTraits::sqrt(expected_result); typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a, w); - EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); + EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } template void impl_test_nrm2w_mv(int N, int K) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; - typedef multivector_layout_adapter vfA_type; + using vfA_type = multivector_layout_adapter; typename vfA_type::BaseType b_a("A", N, K); typename vfA_type::BaseType b_w("W", N, K); @@ -69,7 +74,7 @@ void impl_test_nrm2w_mv(int N, int K) { ViewTypeA a = vfA_type::view(b_a); ViewTypeA w = vfA_type::view(b_w); - typedef multivector_layout_adapter h_vfA_type; + using h_vfA_type = multivector_layout_adapter; typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); @@ -77,13 +82,19 @@ void impl_test_nrm2w_mv(int N, int K) { typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + const MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = + max_val * std::sqrt(static_cast(N)) * eps; + Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(b_a, rand_pool, randStart, 
randEnd); - Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, AT::one(), + randEnd); // Avoid dividing by 0 Kokkos::deep_copy(h_b_a, b_a); Kokkos::deep_copy(h_b_w, b_w); @@ -99,16 +110,13 @@ void impl_test_nrm2w_mv(int N, int K) { Kokkos::ArithTraits::sqrt(expected_result[j]); } - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - Kokkos::View r("Dot::Result", K); KokkosBlas::nrm2w(r, a, w); auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r_host(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], max_error); } delete[] expected_result; diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 59661cc7e5..9390666c6e 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -22,8 +22,9 @@ namespace Test { template void impl_test_nrm2w_squared(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; ViewTypeA a("A", N); ViewTypeA w("W", N); @@ -31,19 +32,21 @@ void impl_test_nrm2w_squared(int N) { typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + const MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = max_val * max_val * N * eps; + Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(a, rand_pool, randStart, randEnd); - Kokkos::fill_random(w, rand_pool, randStart, randEnd); + Kokkos::fill_random(w, 
rand_pool, AT::one(), randEnd); // Avoid divide by 0 Kokkos::deep_copy(h_a, a); Kokkos::deep_copy(h_w, w); - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); @@ -51,15 +54,16 @@ void impl_test_nrm2w_squared(int N) { } typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a, w); - EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); + EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } template void impl_test_nrm2w_squared_mv(int N, int K) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using MagnitudeA = typename AT::mag_type; - typedef multivector_layout_adapter vfA_type; + using vfA_type = multivector_layout_adapter; typename vfA_type::BaseType b_a("A", N, K); typename vfA_type::BaseType b_w("W", N, K); @@ -67,7 +71,7 @@ void impl_test_nrm2w_squared_mv(int N, int K) { ViewTypeA a = vfA_type::view(b_a); ViewTypeA w = vfA_type::view(b_w); - typedef multivector_layout_adapter h_vfA_type; + using h_vfA_type = multivector_layout_adapter; typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); @@ -75,13 +79,17 @@ void impl_test_nrm2w_squared_mv(int N, int K) { typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + const MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = max_val * max_val * N * eps; + Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - Kokkos::fill_random(b_w, 
rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, AT::one(), randEnd); Kokkos::deep_copy(h_b_a, b_a); Kokkos::deep_copy(h_b_w, b_w); @@ -95,16 +103,13 @@ void impl_test_nrm2w_squared_mv(int N, int K) { } } - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - Kokkos::View r("Dot::Result", K); KokkosBlas::nrm2w_squared(r, a, w); auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r_host(k); - EXPECT_NEAR_KK(nonconst_result, expected_result[k], - eps * expected_result[k]); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], max_error); } delete[] expected_result; diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index 257429ac0d..49dd1c6119 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -23,9 +23,11 @@ namespace Test { template void impl_test_reciprocal(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using AT = Kokkos::Details::ArithTraits; + using MagnitudeA = typename AT::mag_type; + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; typedef Kokkos::View< ScalarA * [2], @@ -42,9 +44,9 @@ void impl_test_reciprocal(int N) { Device> BaseTypeB; - typename AT::mag_type eps = AT::epsilon() * 2000; - typename AT::mag_type zero = AT::abs(AT::zero()); - typename AT::mag_type one = AT::abs(AT::one()); + const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); + const MagnitudeA one = AT::abs(AT::one()); + const MagnitudeA max_val = 10; BaseTypeA b_x("X", N); BaseTypeB b_y("Y", N); @@ -65,12 +67,12 @@ void impl_test_reciprocal(int N) { { ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, 
randStart, randEnd); + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(b_x, rand_pool, one, randEnd); } { ScalarB randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(10, randStart, randEnd); Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); } @@ -79,32 +81,24 @@ void impl_test_reciprocal(int N) { Kokkos::deep_copy(h_b_x, b_x); Kokkos::deep_copy(h_b_y, b_y); - ScalarA expected_result(0); - for (int i = 0; i < N; i++) { - expected_result += - AT::abs(AT::one() / h_x(i)) * AT::abs(AT::one() / h_x(i)); - } - KokkosBlas::reciprocal(y, x); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); - typename AT::mag_type divisor = - AT::abs(expected_result) == zero ? one : AT::abs(expected_result); - typename AT::mag_type diff = - AT::abs(nonconst_nonconst_result - expected_result) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); + Kokkos::deep_copy(h_b_y, b_y); + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK(h_b_y(i, 0), ScalarB(one / h_b_x(i, 0)), 2 * eps); + } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::reciprocal(y, c_x); - ScalarB const_nonconst_result = KokkosBlas::dot(y, y); - diff = AT::abs(const_nonconst_result - expected_result) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); + Kokkos::deep_copy(h_b_y, b_y); + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK(h_b_y(i, 0), ScalarB(one / h_b_x(i, 0)), 2 * eps); + } } template void impl_test_reciprocal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; @@ -130,12 +124,13 @@ void impl_test_reciprocal_mv(int N, int K) { { ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Test::getRandomBounds(10, randStart, randEnd); + Kokkos::fill_random(b_x, rand_pool, 
Kokkos::ArithTraits::one(), + randEnd); } { ScalarB randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(10, randStart, randEnd); Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); } @@ -146,45 +141,26 @@ void impl_test_reciprocal_mv(int N, int K) { typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for (int j = 0; j < K; j++) { - expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) { - expected_result[j] += - AT::abs(AT::one() / h_x(i, j)) * AT::abs(AT::one() / h_x(i, j)); - } - } - - typename AT::mag_type eps = AT::epsilon() * 2000; - typename AT::mag_type zero = AT::abs(AT::zero()); - typename AT::mag_type one = AT::abs(AT::one()); - - Kokkos::View r("Dot::Result", K); - KokkosBlas::reciprocal(y, x); - KokkosBlas::dot(r, y, y); - for (int k = 0; k < K; k++) { - ScalarA nonconst_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(nonconst_result - expected_result[k]) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); + Kokkos::deep_copy(h_b_y, b_y); + for (int j = 0; j < K; ++j) { + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK(h_b_y(i, j), + Kokkos::ArithTraits::one() / ScalarB(h_b_x(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); + } } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::reciprocal(y, c_x); - KokkosBlas::dot(r, y, y); - for (int k = 0; k < K; k++) { - ScalarA const_result = r(k); - typename AT::mag_type divisor = - AT::abs(expected_result[k]) == zero ? 
one : AT::abs(expected_result[k]); - typename AT::mag_type diff = - AT::abs(const_result - expected_result[k]) / divisor; - EXPECT_NEAR_KK(diff, zero, eps); + Kokkos::deep_copy(h_b_y, b_y); + for (int j = 0; j < K; j++) { + for (int i = 0; i < N; ++i) { + EXPECT_NEAR_KK(h_b_y(i, j), + Kokkos::ArithTraits::one() / ScalarB(h_b_x(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); + } } - - delete[] expected_result; } } // namespace Test diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index da8c836130..91706d3cc3 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -240,17 +240,26 @@ void impl_test_team_mult_mv(int N, int K) { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - ScalarC *expected_result = new ScalarC[K]; - for (int j = 0; j < K; j++) { - expected_result[j] = ScalarC(); - for (int i = 0; i < N; i++) - expected_result[j] += ScalarC(b * h_z(i, j) + a * h_x(i) * h_y(i, j)) * - ScalarC(b * h_z(i, j) + a * h_x(i) * h_y(i, j)); + std::cout << "Input values:" << std::endl; + std::cout << "\ta=" << a << ", b=" << b << std::endl; + std::cout << "\tx: { "; + for (int i = 0; i < static_cast(h_b_x.extent(0)); ++i) { + std::cout << h_b_x(i, 0) << " "; } + std::cout << "}" << std::endl; + std::cout << "\ty: { "; + for (int i = 0; i < static_cast(h_b_y.extent(0)); ++i) { + std::cout << h_b_y(i, 0) << " "; + } + std::cout << "}" << std::endl; + std::cout << "\tz: { "; + for (int i = 0; i < static_cast(h_b_z.extent(0)); ++i) { + std::cout << h_b_z(i, 0) << " "; + } + std::cout << "}" << std::endl; - double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - - Kokkos::View r("Dot::Result", K); + typename Kokkos::ArithTraits::mag_type const eps = + Kokkos::ArithTraits::epsilon(); // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( @@ -261,11 +270,25 @@ void impl_test_team_mult_mv(int N, int K) { teamMember, b, Kokkos::subview(z, Kokkos::ALL(), teamId), a, x, Kokkos::subview(y, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); - for (int k = 0; k < K; k++) { - ScalarA nonconst_nonconst_result = r(k); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], - eps * expected_result[k]); + + ScalarC temp; + typename h_vfC_type::BaseType h_b_z_res = Kokkos::create_mirror_view(b_z); + Kokkos::deep_copy(h_b_z_res, b_z); + typename h_vfC_type::BaseType h_b_org_z = Kokkos::create_mirror_view(b_org_z); + Kokkos::deep_copy(h_b_org_z, b_org_z); + + std::cout << "Output values:" << std::endl; + std::cout << "\tz: { "; + for (int i = 0; i < static_cast(h_b_z_res.extent(0)); ++i) { + std::cout << h_b_z_res(i, 0) << " "; + } + std::cout << "}" << std::endl; + + for (int j = 0; j < K; j++) { + for (int i = 0; i < N; i++) { + temp = ScalarC(b * h_b_org_z(i, j) + a * h_x(i) * h_y(i, j)); + EXPECT_NEAR_KK(temp, h_b_z_res(i, j), 10 * eps); + } } Kokkos::deep_copy(b_z, b_org_z); @@ -278,14 +301,14 @@ void impl_test_team_mult_mv(int N, int K) { teamMember, b, Kokkos::subview(z, Kokkos::ALL(), teamId), a, x, Kokkos::subview(c_y, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); + Kokkos::deep_copy(h_b_z_res, b_z); + for (int k = 0; k < K; k++) { - ScalarA const_non_const_result = r(k); - EXPECT_NEAR_KK(const_non_const_result, expected_result[k], - eps * expected_result[k]); + for (int i = 0; i < N; ++i) { + temp = ScalarC(b * h_b_org_z(i, k) + a * h_x(i) * h_y(i, k)); + EXPECT_NEAR_KK(temp, h_b_z_res(i, k), 10 * eps); + } } - - delete[] expected_result; } } // namespace Test @@ -368,6 +391,7 @@ int test_team_mult_mv() { // view_type_c_ll, Device>(132231,5); #endif + /* #if 
defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -383,7 +407,9 @@ int test_team_mult_mv() { // Test::impl_test_team_mult_mv(132231,5); #endif + */ + /* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -407,6 +433,7 @@ int test_team_mult_mv() { Test::impl_test_team_mult_mv(124, 5); #endif + */ return 1; } diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index cf118e7ba2..8a591b8c27 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -249,7 +249,7 @@ void impl_test_team_update_mv(int N, int K) { ScalarC(a * h_x(i, j) + b * h_y(i, j) + c * h_z(i, j)); } - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; Kokkos::View r("Dot::Result", K); diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index 1df115d2c3..4731891e15 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -30,10 +30,9 @@ void impl_test_gemv(const char* mode, int M, int N) { typedef multivector_layout_adapter vfA_type; - ScalarA alpha = 3; - ScalarY beta = 5; - double eps = - (std::is_same::value ? 
1e-2 : 5e-10); + ScalarA alpha = 3; + ScalarY beta = 5; + typename KAT_Y::mag_type const eps = KAT_Y::epsilon(); int ldx; int ldy; @@ -64,22 +63,30 @@ void impl_test_gemv(const char* mode, int M, int N) { Kokkos::Random_XorShift64_Pool rand_pool( 13718); + const double max_valX = 1; + const double max_valY = 1; + const double max_valA = 1; { ScalarX randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(max_valX, randStart, randEnd); Kokkos::fill_random(x, rand_pool, randStart, randEnd); } { ScalarY randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(max_valY, randStart, randEnd); Kokkos::fill_random(y, rand_pool, randStart, randEnd); } { ScalarA randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); + Test::getRandomBounds(max_valA, randStart, randEnd); Kokkos::fill_random(b_A, rand_pool, randStart, randEnd); } + const typename KAT_Y::mag_type max_error = + KAT_Y::abs(alpha * max_valA * max_valX * ldx + beta * max_valY); + const typename KAT_Y::mag_type tol = + max_error * eps * 2; // adding small fudge factor of 2 + Kokkos::deep_copy(org_y, y); auto h_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); @@ -96,8 +103,11 @@ void impl_test_gemv(const char* mode, int M, int N) { Kokkos::deep_copy(h_y, y); int numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) + if (KAT_Y::abs(expected(i) - h_y(i)) > tol) { numErrors++; + std::cout << "expected(i)=" << expected(i) << ", h_y(i)=" << h_y(i) + << std::endl; + } } EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta @@ -108,8 +118,7 @@ void impl_test_gemv(const char* mode, int M, int N) { Kokkos::deep_copy(h_y, y); numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; + if (KAT_Y::abs(expected(i) - 
h_y(i)) > tol) numErrors++; } EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta @@ -120,8 +129,7 @@ void impl_test_gemv(const char* mode, int M, int N) { Kokkos::deep_copy(h_y, y); numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) - numErrors++; + if (KAT_Y::abs(expected(i) - h_y(i)) > tol) numErrors++; } EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta @@ -137,8 +145,13 @@ void impl_test_gemv(const char* mode, int M, int N) { numErrors = 0; for (int i = 0; i < ldy; i++) { if (KAT_Y::isNan(h_y(i)) || - KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(eps * expected(i))) + KAT_Y::abs(expected(i) - h_y(i)) > + KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { numErrors++; + std::cout << "expected(" << i << ")=" << expected(i) << ", h_y(" << i + << ")=" << h_y(i) << ", eps=" << eps + << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() << std::endl; + } } EXPECT_EQ(numErrors, 0) << "beta = 0, input contains NaN, A is " << M << 'x' << N << ", mode " << mode << ": gemv incorrect"; From 602c526d75a7e1a56a33c520afbc722b06ecc1f5 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 21 Feb 2023 11:41:54 -0700 Subject: [PATCH 084/442] Tested mixed scalars: removing temporary output --- blas/unit_test/Test_Blas1_team_mult.hpp | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index 91706d3cc3..6f50cc31cc 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -240,24 +240,6 @@ void impl_test_team_mult_mv(int N, int K) { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - std::cout << "Input values:" << std::endl; - std::cout << "\ta=" << a << ", b=" << b << std::endl; - std::cout << 
"\tx: { "; - for (int i = 0; i < static_cast(h_b_x.extent(0)); ++i) { - std::cout << h_b_x(i, 0) << " "; - } - std::cout << "}" << std::endl; - std::cout << "\ty: { "; - for (int i = 0; i < static_cast(h_b_y.extent(0)); ++i) { - std::cout << h_b_y(i, 0) << " "; - } - std::cout << "}" << std::endl; - std::cout << "\tz: { "; - for (int i = 0; i < static_cast(h_b_z.extent(0)); ++i) { - std::cout << h_b_z(i, 0) << " "; - } - std::cout << "}" << std::endl; - typename Kokkos::ArithTraits::mag_type const eps = Kokkos::ArithTraits::epsilon(); @@ -277,13 +259,6 @@ void impl_test_team_mult_mv(int N, int K) { typename h_vfC_type::BaseType h_b_org_z = Kokkos::create_mirror_view(b_org_z); Kokkos::deep_copy(h_b_org_z, b_org_z); - std::cout << "Output values:" << std::endl; - std::cout << "\tz: { "; - for (int i = 0; i < static_cast(h_b_z_res.extent(0)); ++i) { - std::cout << h_b_z_res(i, 0) << " "; - } - std::cout << "}" << std::endl; - for (int j = 0; j < K; j++) { for (int i = 0; i < N; i++) { temp = ScalarC(b * h_b_org_z(i, j) + a * h_x(i) * h_y(i, j)); From 8aa7fa23e665000c3eefb83e6e2bfe852ca835b5 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 22 Feb 2023 14:01:58 -0700 Subject: [PATCH 085/442] cast Kokkos::Impl::integral_constant to int --- common/src/KokkosKernels_PrintUtils.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/src/KokkosKernels_PrintUtils.hpp b/common/src/KokkosKernels_PrintUtils.hpp index eff4f1f43f..74b32c793a 100644 --- a/common/src/KokkosKernels_PrintUtils.hpp +++ b/common/src/KokkosKernels_PrintUtils.hpp @@ -104,7 +104,8 @@ inline std::enable_if_t= 2> kk_print_1Dview( return; } os << "[" << view.extent(0); - for (int i = 1; i < idx_array_type::rank; ++i) { + // ::rank is a Kokkos::...::integral_constant, not appropriate for `i` + for (int i = 1; i < int(idx_array_type::rank); ++i) { os << "x" << view.extent(i); } os << " multi-vector]" << std::endl; From e9ec438001d0ee75f74a12ec625a8ce0102606a1 Mon Sep 17 00:00:00 
2001 From: Carl Pearson Date: Tue, 21 Feb 2023 08:29:23 -0700 Subject: [PATCH 086/442] Introduce KOKKOSKERNELS_ALL_COMPONENTS_ENABLED variable Previously, we could have the following issue: 1. User sets KokkosKernels_ENABLE_ALL_COMPONENTS to on 2. This enables each component 3. Invoke cmake again 4. Since components are enabled, KokkosKernels_ENABLE_ALL_COMPONENTS would be set to off 5. Things that rely on KokkosKernels_ENABLE_ALL_COMPONENTS are no longer built (e.g. tests) Now, KOKKOSKERNELS_ALL_COMPONENTS_ENABLED is used to track whether all components are enabled, regardless of how we got there. It's marked as advanced to hide it from GUIs, and leaves the user-facing KokkosKernels_ENABLE_ALL_COMPONENTS intact. --- CMakeLists.txt | 8 ++++---- cmake/kokkoskernels_components.cmake | 27 +++++++++++++++------------ 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 72c00118da..bea922075d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -390,7 +390,7 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() - IF (KokkosKernels_ENABLE_ALL_COMPONENTS) + IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) IF (KokkosKernels_ENABLE_PERFTESTS) MESSAGE(STATUS "Enabling perf tests.") KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) @@ -400,13 +400,13 @@ ELSE() KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) ENDIF () ELSE () - # ENABLE_ALL_COMPONENTS is OFF, so perftests and examples can't be enabled. + # all components were not enabled, so perftests and examples can't be enabled. # Warn if they were requested. 
IF (KokkosKernels_ENABLE_PERFTESTS) - MESSAGE(WARNING "Could not enable perf tests because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + MESSAGE(WARNING "Could not enable perf tests because not all components were enabled") ENDIF () IF (KokkosKernels_ENABLE_EXAMPLES) - MESSAGE(WARNING "Could not enable examples because KokkosKernels_ENABLE_ALL_COMPONENTS=OFF") + MESSAGE(WARNING "Could not enable examples because not all components were enabled") ENDIF () ENDIF () diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index f33a62b6ff..56ab1a7c31 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -45,14 +45,6 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to build the graph component. Default: OFF" ) -# The user requested individual components, -# the assumption is that a full build is not -# desired and ENABLE_ALL_COMPONENETS is turned -# off. -IF (KokkosKernels_ENABLE_COMPONENT_BATCHED OR KokkosKernels_ENABLE_COMPONENT_BLAS - OR KokkosKernels_ENABLE_COMPONENT_GRAPH OR KokkosKernels_ENABLE_COMPONENT_SPARSE) - SET(KokkosKernels_ENABLE_ALL_COMPONENTS OFF CACHE BOOL "" FORCE) -ENDIF() # Graph depends on everything else because it depends # on Sparse at the moment, breaking that dependency will @@ -72,13 +64,24 @@ IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() -# At this point, if ENABLE_ALL_COMPONENTS is -# still ON we need to enable all individual -# components as they are required for this -# build. 
+# If user requested to enable all components, enable all components IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BATCHED ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) ENDIF() + +# KOKKOSKERNELS_ALL_COMPONENTS_ENABLED says whether all components are on, +# regardless of how this came to be +# this is in the cache so we can use it as a global variable, +# but marking it as advanced should hide it from GUIs +IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED + AND KokkosKernels_ENABLE_COMPONENT_BLAS + AND KokkosKernels_ENABLE_COMPONENT_GRAPH + AND KokkosKernels_ENABLE_COMPONENT_SPARSE) + SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED ON CACHE BOOL "" FORCE) +ELSE() + SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) +ENDIF() +mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) \ No newline at end of file From feb9f9ae63df60372bc85cbc721433e61d3b02c6 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 24 Feb 2023 16:56:00 -0500 Subject: [PATCH 087/442] use rocsparse_spmv_ex for rocm >= 5.4.0 --- sparse/src/KokkosSparse_Utils_rocsparse.hpp | 4 ++++ .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index dd479610ca..b146aff782 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -18,6 +18,7 @@ #define _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +#include #include "rocsparse/rocsparse.h" namespace KokkosSparse { @@ -164,6 +165,9 @@ struct kokkos_to_rocsparse_type> { using type = rocsparse_double_complex; }; +#define KOKKOSSPARSE_IMPL_ROCM_VERSION \ + ROCM_VERSION_MAJOR * 10000 + 
ROCM_VERSION_MINOR * 100 + ROCM_VERSION_PATCH + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index f223ed0e5a..db719b43d8 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -343,6 +343,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, // rocSPARSE #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) #include +#include #include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { @@ -421,13 +422,24 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, else if (algName == "merge") alg = rocsparse_spmv_alg_csr_stream; } - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, &buffer_size, tmp_buffer)); + +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); +#else KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, compute_type, alg, &buffer_size, tmp_buffer)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, + compute_type, alg, &buffer_size, tmp_buffer)); +#endif KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(tmp_buffer)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecY)); From 016384fff0f5a66565ffd4b328978d2a5c82d4c8 Mon Sep 17 
00:00:00 2001 From: Carl Pearson Date: Fri, 24 Feb 2023 17:12:21 -0500 Subject: [PATCH 088/442] View::Rank -> View::rank --- perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp index 663ea400be..1eaacbde5e 100644 --- a/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp +++ b/perf_test/batched/sparse/KokkosBatched_Test_Sparse_Helper.hpp @@ -141,9 +141,9 @@ void readCRSFromMM(std::string name, const VType &V, const IntType &r, r_h(tmp_row) = i; current_row = read_row; - // if (VType::Rank == 1) + // if (VType::rank == 1) // input >> V_h(i); - if (VType::Rank == 2) + if (VType::rank == 2) for (size_t j = 0; j < V_h.extent(0); ++j) input >> V_h(j, i); } From e4b324c8c25bffc150df3434c830013f83eab78d Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 24 Feb 2023 11:54:02 -0700 Subject: [PATCH 089/442] test mixed scalars: incorporate Evan's comments Fixing some tolerance issues, making some variables constexpr and re-enabling some tests that should not be disabled. 
--- blas/unit_test/Test_Blas1_nrm2w.hpp | 8 ++++---- blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 12 ++++++------ blas/unit_test/Test_Blas1_team_mult.hpp | 15 ++++++++------- blas/unit_test/Test_Blas2_gemv.hpp | 8 ++++---- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index ca5714db1c..8a3675cc5e 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -32,8 +32,8 @@ void impl_test_nrm2w(int N) { typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); - const MagnitudeA max_val = 10; - const MagnitudeA eps = AT::epsilon(); + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); const MagnitudeA max_error = max_val * std::sqrt(static_cast(N)) * eps; @@ -82,8 +82,8 @@ void impl_test_nrm2w_mv(int N, int K) { typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); - const MagnitudeA max_val = 10; - const MagnitudeA eps = AT::epsilon(); + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); const MagnitudeA max_error = max_val * std::sqrt(static_cast(N)) * eps; diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 9390666c6e..7d6c84def6 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -32,9 +32,9 @@ void impl_test_nrm2w_squared(int N) { typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); - const MagnitudeA max_val = 10; - const MagnitudeA eps = AT::epsilon(); - const MagnitudeA max_error = max_val * max_val * N * eps; + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = max_val * max_val * N * eps; 
Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -79,9 +79,9 @@ void impl_test_nrm2w_squared_mv(int N, int K) { typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); - const MagnitudeA max_val = 10; - const MagnitudeA eps = AT::epsilon(); - const MagnitudeA max_error = max_val * max_val * N * eps; + constexpr MagnitudeA max_val = 10; + const MagnitudeA eps = AT::epsilon(); + const MagnitudeA max_error = max_val * max_val * N * eps; Kokkos::Random_XorShift64_Pool rand_pool( 13718); diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index 6f50cc31cc..5dddd125ec 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -225,9 +225,10 @@ void impl_test_team_mult_mv(int N, int K) { Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + typename Kokkos::ArithTraits::mag_type const max_val = 10; + Kokkos::fill_random(b_x, rand_pool, ScalarA(max_val)); + Kokkos::fill_random(b_y, rand_pool, ScalarB(max_val)); + Kokkos::fill_random(b_z, rand_pool, ScalarC(max_val)); Kokkos::deep_copy(b_org_z, b_z); @@ -242,6 +243,8 @@ void impl_test_team_mult_mv(int N, int K) { typename Kokkos::ArithTraits::mag_type const eps = Kokkos::ArithTraits::epsilon(); + typename Kokkos::ArithTraits::mag_type const max_error = + 3 * max_val * max_val * eps; // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( @@ -262,7 +265,7 @@ void impl_test_team_mult_mv(int N, int K) { for (int j = 0; j < K; j++) { for (int i = 0; i < N; i++) { temp = ScalarC(b * h_b_org_z(i, j) + a * h_x(i) * h_y(i, j)); - EXPECT_NEAR_KK(temp, h_b_z_res(i, j), 10 * eps); + EXPECT_NEAR_KK(temp, h_b_z_res(i, j), max_error); } } @@ -281,7 +284,7 @@ void impl_test_team_mult_mv(int N, int K) { for (int k = 0; k < K; k++) { 
for (int i = 0; i < N; ++i) { temp = ScalarC(b * h_b_org_z(i, k) + a * h_x(i) * h_y(i, k)); - EXPECT_NEAR_KK(temp, h_b_z_res(i, k), 10 * eps); + EXPECT_NEAR_KK(temp, h_b_z_res(i, k), max_error); } } } @@ -366,7 +369,6 @@ int test_team_mult_mv() { // view_type_c_ll, Device>(132231,5); #endif - /* #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -382,7 +384,6 @@ int test_team_mult_mv() { // Test::impl_test_team_mult_mv(132231,5); #endif - */ /* #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index 4731891e15..54081c4b68 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -30,7 +30,7 @@ void impl_test_gemv(const char* mode, int M, int N) { typedef multivector_layout_adapter vfA_type; - ScalarA alpha = 3; + const ScalarA alpha = 3; ScalarY beta = 5; typename KAT_Y::mag_type const eps = KAT_Y::epsilon(); @@ -63,9 +63,9 @@ void impl_test_gemv(const char* mode, int M, int N) { Kokkos::Random_XorShift64_Pool rand_pool( 13718); - const double max_valX = 1; - const double max_valY = 1; - const double max_valA = 1; + constexpr double max_valX = 1; + constexpr double max_valY = 1; + constexpr double max_valA = 1; { ScalarX randStart, randEnd; Test::getRandomBounds(max_valX, randStart, randEnd); From f46b24258b9466172c391a3c7043ab6582a13268 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 27 Feb 2023 08:13:20 -0700 Subject: [PATCH 090/442] blas/blas1: Fix a couple documentation typos. 
--- blas/src/KokkosBlas1_nrm2w.hpp | 2 +- blas/src/KokkosBlas1_reciprocal.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index aaca5ba644..bf952c77a5 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -66,7 +66,7 @@ nrm2w(const XVector& x, const XVector& w) { /// \brief R(i,j) = nrm2w(X(i,j)) /// -/// Replace each entry in R with the nrm2wolute value (magnitude) of the +/// Replace each entry in R with the nrm2w, absolute value (magnitude), of the /// corresponding entry in X. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index 7e171cb6df..19624d11c9 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -25,8 +25,8 @@ namespace KokkosBlas { /// \brief R(i,j) = reciprocal(X(i,j)) /// -/// Replace each entry in R with the reciprocalolute value (magnitude) of the -/// corresponding entry in X. +/// Replace each entry in R with the absolute value (magnitude), of the +/// reciprocal of the corresponding entry in X. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have From 31190a68ce4b68e54031d9c82f7bc0fee8e393ac Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 27 Feb 2023 09:44:57 -0700 Subject: [PATCH 091/442] blas/blas1: Add mult docs --- blas/src/KokkosBlas1_mult.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 39ccbbeebd..e08409e9aa 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -23,6 +23,20 @@ namespace KokkosBlas { +/// \brief Element wise multiplication of two vectors: +/// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] +/// +/// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. 
+/// \tparam AV Type of the second vector A; a 1-D Kokkos::View. +/// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. +/// +/// \param gamma [in] The scalar to apply to Y. +/// \param Y [in/out] The Y vector. +/// \param alpha [in] The scalar to apply to A. +/// \param A [in] The vector to apply to X. +/// \param X [in] The X vector. +/// +/// \return Y = gamma * Y + alpha * A * X. template void mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) { From 0b88c05edc71b19a8af9fbb0162f627d388b37cc Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Feb 2023 11:18:06 -0700 Subject: [PATCH 092/442] test mixed scalars: adding more comments and sending msg to cerr --- blas/unit_test/Test_Blas1_team_mult.hpp | 7 ++++++- blas/unit_test/Test_Blas2_gemv.hpp | 8 +++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index 5dddd125ec..f5382664e4 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -241,10 +241,15 @@ void impl_test_team_mult_mv(int N, int K) { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; + // In the operation z = (b*z) + (a*x*y) we estimate + // the largest rounding error to be dominated by max(b*z, a*x*y) + // Since b and a are known and the largest value in z, x and y + // is set by the variables max_val, the error upper bound will be + // max_error = a * max_val * max_val typename Kokkos::ArithTraits::mag_type const eps = Kokkos::ArithTraits::epsilon(); typename Kokkos::ArithTraits::mag_type const max_error = - 3 * max_val * max_val * eps; + a * max_val * max_val * eps; // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index 54081c4b68..dce07df9bc 100644 --- 
a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -105,7 +105,8 @@ void impl_test_gemv(const char* mode, int M, int N) { for (int i = 0; i < ldy; i++) { if (KAT_Y::abs(expected(i) - h_y(i)) > tol) { numErrors++; - std::cout << "expected(i)=" << expected(i) << ", h_y(i)=" << h_y(i) + std::cerr << __FILE__ << ":" << __LINE__ + << ": expected(i)=" << expected(i) << ", h_y(i)=" << h_y(i) << std::endl; } } @@ -148,8 +149,9 @@ void impl_test_gemv(const char* mode, int M, int N) { KAT_Y::abs(expected(i) - h_y(i)) > KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { numErrors++; - std::cout << "expected(" << i << ")=" << expected(i) << ", h_y(" << i - << ")=" << h_y(i) << ", eps=" << eps + std::cerr << __FILE__ << ":" << __LINE__ << ": expected(" << i + << ")=" << expected(i) << ", h_y(" << i << ")=" << h_y(i) + << ", eps=" << eps << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() << std::endl; } } From 562aaffd9643f96e3abd096a049a4d25606149b7 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Feb 2023 13:07:53 -0700 Subject: [PATCH 093/442] team mult: fix type issue in max_error calculation --- blas/unit_test/Test_Blas1_team_mult.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index f5382664e4..4b4a5c3543 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -249,7 +249,7 @@ void impl_test_team_mult_mv(int N, int K) { typename Kokkos::ArithTraits::mag_type const eps = Kokkos::ArithTraits::epsilon(); typename Kokkos::ArithTraits::mag_type const max_error = - a * max_val * max_val * eps; + Kokkos::ArithTraits::abs(a) * max_val * max_val * eps; // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( From db0071a4382a65f876b70d20b261efed2c43e90d Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Feb 2023 13:11:48 -0700 Subject: [PATCH 094/442] team mult: applying 
clang-format --- blas/unit_test/Test_Blas1_team_mult.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index 4b4a5c3543..4df5dd9cd4 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -249,7 +249,7 @@ void impl_test_team_mult_mv(int N, int K) { typename Kokkos::ArithTraits::mag_type const eps = Kokkos::ArithTraits::epsilon(); typename Kokkos::ArithTraits::mag_type const max_error = - Kokkos::ArithTraits::abs(a) * max_val * max_val * eps; + Kokkos::ArithTraits::abs(a) * max_val * max_val * eps; // KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( From 8469d478f70a899b805c90915c9d7acf5614d7b5 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Feb 2023 15:39:40 -0700 Subject: [PATCH 095/442] Kokkos Kernels version: need to use upper case variables It appears that we need to use the upper case spelling of the variables defining the current version of Kokkos Kernels. I'm not totally sure why but it could have something to do with the fact that we are re-using the same variable names twice... 
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 72c00118da..cfa8a0561e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -221,7 +221,7 @@ ELSE() # ================================================================== MESSAGE("") MESSAGE("================================") - MESSAGE("Kokkos Kernels version: ${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + MESSAGE("Kokkos Kernels version: ${KOKKOSKERNELS_VERSION_MAJOR}.${KOKKOSKERNELS_VERSION_MINOR}.${KOKKOSKERNELS_VERSION_PATCH}") MESSAGE("================================") MESSAGE("Kokkos Kernels ETI Types") MESSAGE(" Devices: ${DEVICE_LIST}") From 7f3acf1332e101c88ff11b2ba8b41ffc2dd464cc Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Feb 2023 17:54:21 -0700 Subject: [PATCH 096/442] Compatibility upgrade: adding compatibility branch in code with the ifdef introduced, the code will compile against 4.0.0 and develop versions of Kokkos Core. 
--- batched/KokkosBatched_Util.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index a171af8d03..acfd5cab68 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -625,6 +625,18 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, const Trans::NoTranspose) { return subview_wrapper(v, i1, i2, i3, layout_tag); } +#if KOKKOS_VERSION <= 40000 +template +KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, + Kokkos::Impl::ALL_t i2, + Kokkos::Impl::ALL_t i3, + const BatchLayout::Left &layout_tag, + const Trans::Transpose) { + auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); + + return transpose_2d_view(sv_nt, layout_tag); +} +#else template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, @@ -634,6 +646,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, return transpose_2d_view(sv_nt, layout_tag); } +#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, @@ -657,6 +670,16 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { return subview_wrapper(v, i1, i2, i3, layout_tag); } +#if KOKKOS_VERSION <= 40000 +template +KOKKOS_INLINE_FUNCTION auto subview_wrapper( + ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, + const BatchLayout::Right &layout_tag, const Trans::Transpose &) { + auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); + + return transpose_2d_view(sv_nt, layout_tag); +} +#else template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, @@ -665,6 +688,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( return transpose_2d_view(sv_nt, layout_tag); } +#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, 
IdxType1 i1, IdxType2 i2, IdxType3 i3, From a3ab61082da9dcb6f79400450e0f01dc31ddea61 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 22 Feb 2023 13:30:32 -0800 Subject: [PATCH 097/442] CUSPARSE_MM_ALG_DEFAULT deprecated by 11.1 --- sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 28a80ec266..717c62b985 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -151,7 +151,8 @@ void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, cusparseOperation_t opB = xIsLL ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; -#if CUDA_VERSION < 12000 +// CUSPARSE_MM_ALG_DEFAULT was deprecated as early as 11.1 (maybe earlier) +#if CUSPARSE_VERSION < 11010 const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; #else const cusparseSpMMAlg_t alg = CUSPARSE_SPMM_ALG_DEFAULT; From 6ead860027b59cafa956530951d03ab04309efdc Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 28 Feb 2023 15:32:18 -0700 Subject: [PATCH 098/442] add explicit tests of opt-in algorithms --- sparse/unit_test/Test_Sparse_spmv.hpp | 75 +++++++++++++++++++-------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index d0a6d1464c..9da0733581 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -159,8 +159,8 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, template void check_spmv( - crsMat_t input_mat, x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, + const Controls &controls, crsMat_t input_mat, x_vector_type x, + y_vector_type y, typename y_vector_type::non_const_value_type alpha, typename 
y_vector_type::non_const_value_type beta, char mode, typename Kokkos::ArithTraits::mag_type max_val) { @@ -183,7 +183,7 @@ void check_spmv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(&mode, alpha, input_mat, x, beta, y); + KokkosSparse::spmv(controls, &mode, alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -422,9 +422,10 @@ Kokkos::complex randomUpperBound>(int mag) { return Kokkos::complex(mag, mag); } -template -void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, bool heavy) { +template +void test_spmv(const Controls &controls, lno_t numRows, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, bool heavy) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; @@ -479,8 +480,8 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode, - max_error); + Test::check_spmv(controls, input_mat, input_x, output_y, alpha, beta, + mode, max_error); } } } @@ -490,13 +491,31 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, // hoping the transpose won't have a long column... 
mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode, - max_error); + Test::check_spmv(controls, input_mat, input_xt, output_yt, alpha, beta, + mode, max_error); } } } } +template +void test_spmv_algorithms(lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, bool heavy) { + { + Controls controls; + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } + + { + Controls controls; + controls.setParameter("algorithm", "native"); + test_spmv( + controls, numRows, nnz, bandwidth, row_size_variance, heavy); + } +} + template void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, @@ -899,13 +918,13 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { // check that the controls are flowing down correctly in the spmv kernel template void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { + lno_t row_size_variance, + const Controls &controls = Controls()) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; - using Controls = KokkosKernels::Experimental::Controls; using mag_t = typename Kokkos::ArithTraits::mag_type; constexpr mag_t max_x = static_cast(10); @@ -931,8 +950,6 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, const mag_t max_error = max_y + bandwidth * max_val * max_x; - Controls controls; - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0, max_error); Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0, @@ -941,6 +958,15 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, max_error); } // test_spmv_controls +// test the native algorithm +template +void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t 
row_size_variance) { + Controls controls; + controls.setParameter("algorithm", "native"); + test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls); +} // test_spmv_native + // call it if ordinal int and, scalar float and double are instantiated. template void test_github_issue_101() { @@ -1577,15 +1603,18 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, #define EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_spmv(1000, 1000 * 3, 200, 10, true); \ - test_spmv(1000, 1000 * 3, 100, 10, true); \ - test_spmv(1000, 1000 * 20, 100, 5, true); \ - test_spmv(50000, 50000 * 3, 20, 10, \ - false); \ - test_spmv(50000, 50000 * 3, 100, 10, \ - false); \ - test_spmv(10000, 10000 * 2, 100, 5, \ - false); \ + test_spmv_algorithms(1000, 1000 * 3, 200, \ + 10, true); \ + test_spmv_algorithms(1000, 1000 * 3, 100, \ + 10, true); \ + test_spmv_algorithms(1000, 1000 * 20, \ + 100, 5, true); \ + test_spmv_algorithms(50000, 50000 * 3, \ + 20, 10, false); \ + test_spmv_algorithms(50000, 50000 * 3, \ + 100, 10, false); \ + test_spmv_algorithms(10000, 10000 * 2, \ + 100, 5, false); \ test_spmv_controls(10000, 10000 * 20, \ 100, 5); \ } From fcf349d33dce8f023836d8ed3cc94105f32fc778 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 1 Mar 2023 10:09:45 -0700 Subject: [PATCH 099/442] print the patch that clang-format-8 wants to apply --- .github/workflows/format.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index df1df44ad7..de5d35e09f 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -31,9 +31,15 @@ jobs: fi done - # If any diffs exist, error out + # If any diffs exist, print the patch and error out if [[ ! -z $(git status -s -uno . -- ':!.github') ]]; then echo "The following files require formatting changes:" git status -s -uno . 
-- ':!.github' + + echo "==== Begin Format Patch ====" + # --cached means show staged changes (git add above) + git --no-pager diff --patch --cached + echo "==== End Format Patch ====" + exit 1 fi From 8ed861214604c467eb638f6ee4d3818db86bb07e Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 28 Feb 2023 14:36:01 -0700 Subject: [PATCH 100/442] Adds KokkosKernels::Impl::Iota, a view-like where iota(i) = i + offset --- common/src/KokkosKernels_Iota.hpp | 142 ++++++++++++++++++++++++++ common/unit_test/Test_Common.hpp | 1 + common/unit_test/Test_Common_Iota.hpp | 117 +++++++++++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 common/src/KokkosKernels_Iota.hpp create mode 100644 common/unit_test/Test_Common_Iota.hpp diff --git a/common/src/KokkosKernels_Iota.hpp b/common/src/KokkosKernels_Iota.hpp new file mode 100644 index 0000000000..c5d6a8dfac --- /dev/null +++ b/common/src/KokkosKernels_Iota.hpp @@ -0,0 +1,142 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_IOTA_HPP +#define _KOKKOSKERNELS_IOTA_HPP + +#include + +#include + +#include "KokkosKernels_Error.hpp" + +/*! \file KokkosKernels_Iota.hpp + * Define an Iota struct that implements a small subset of Kokkos::View and + * related utilities. + */ + +namespace KokkosKernels { +namespace Impl { + +/*! 
\class Iota + \brief A class that mimics a small subset of Kokkos::View + + \tparam T the type returned by operator() + \tparam SizeType a custom offset type + + \typedef size_type SizeType + \typedef value_type T + \typedef non_const_value_type non-const T + \typedef device_type void + \typedef data_type const value_type * + \enum rank always 1 + + Iota::operator() returns offset + i + Meant to be used in place of a Kokkos::View where entry i holds i + offset. + Unlike a Kokkos::View, Iota is not materialized in memory. + + Constructing with a size less than 0 yields a 0-size Iota +*/ +template +class Iota { + public: + using size_type = SizeType; + using value_type = T; + using non_const_value_type = std::remove_const; + using device_type = void; + using data_type = const value_type *; + + /*! \brief construct an Iota where iota(i) -> offset + i + + \param[in] size the number of entries + \param[in] offset the offset of the first entry + + Constructing with size < 0 yields a 0-size Iota + */ + KOKKOS_INLINE_FUNCTION + constexpr Iota(const size_type &size, const value_type offset) + : size_(size), offset_(offset) { + if constexpr (std::is_signed_v) { + if (size_ < size_type(0)) { + size_ = 0; + } + } + } + + /*! \brief construct an Iota where iota(i) -> i + + \param[in] size the number of entries + */ + KOKKOS_INLINE_FUNCTION + explicit constexpr Iota(const size_type &size) : Iota(size, 0) {} + + /*! \brief construct a zero-sized iota + */ + KOKKOS_INLINE_FUNCTION + constexpr Iota() : size_(0), offset_(0) {} + + /*! 
\brief Construct Iota subview + + Like the Kokkos::View 1D subview constructor: + \verbatim + Kokkos::View a(10); // size = 10 + Kokkos::View b(a, Kokkos::pair{3,7}); // entries 3,4,5,6 of a + + Iota a(10); + Iota b(a, Kokkos::pair{3,7}); // entries 3,4,5,6 of a + \endverbatim + + Creating a subview outside of the base Iota yields undefined behavior + */ + template + KOKKOS_INLINE_FUNCTION constexpr Iota(const Iota &base, + const Kokkos::pair &range) + : Iota(range.second - range.first, base.offset_ + range.first) {} + + /*! \brief Return the i-th entry, offset + i + + i >= size() or i < 0 yields undefined behavior. + */ + KOKKOS_INLINE_FUNCTION + constexpr T operator()(size_type i) const noexcept { + return value_type(i + offset_); + }; + + /// \brief return the size of the iota + KOKKOS_INLINE_FUNCTION + constexpr size_t size() const noexcept { return size_; } + + /// \brief Iotas are always like a rank-1 Kokkos::View + enum { rank = 1 }; + + private: + size_type size_; + value_type offset_; +}; + +/// \class is_iota +/// \brief is_iota::value is true if T is an Iota<...>, false otherwise +template +struct is_iota : public std::false_type {}; +template +struct is_iota> : public std::true_type {}; +template +struct is_iota> : public std::true_type {}; + +} // namespace Impl +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_IOTA_HPP diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index 36bc4bcf35..9b26f9bf9e 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -24,5 +24,6 @@ #include #include #include +#include #endif // TEST_COMMON_HPP diff --git a/common/unit_test/Test_Common_Iota.hpp b/common/unit_test/Test_Common_Iota.hpp new file mode 100644 index 0000000000..7207d6f4b1 --- /dev/null +++ b/common/unit_test/Test_Common_Iota.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef TEST_COMMON_IOTA_HPP +#define TEST_COMMON_IOTA_HPP + +#include + +#include "KokkosKernels_Iota.hpp" + +template +void test_iota_constructor() { + // empty iota + { + Iota i; + EXPECT_EQ(i.size(), 0); + } + + // basic iota + { + Iota ten(10); + EXPECT_EQ(ten.size(), 10); + for (size_t i = 0; i < ten.size(); ++i) { + EXPECT_EQ(ten(i), i); + } + } + + // iota with negative offset + if constexpr (std::is_signed_v) { + Iota three(3, -7); + EXPECT_EQ(three.size(), 3); + for (size_t i = 0; i < three.size(); ++i) { + EXPECT_EQ(three(i), T(i) - T(7)); + } + } + + // iota with positive offset + { + Iota three(3, 2); + EXPECT_EQ(three.size(), 3); + for (size_t i = 0; i < three.size(); ++i) { + EXPECT_EQ(three(i), i + 2); + } + } + + // negative sizes are capped at 0 + if constexpr (std::is_signed_v) { + { + Iota i(-7); + EXPECT_EQ(i.size(), 0); + } + { + Iota i(-1, 2); + EXPECT_EQ(i.size(), 0); + } + } +} + +template +void test_iota_rank() { + EXPECT_EQ((Iota::rank), 1); +} + +template +void test_iota_subview() { + // get the 7th and 8th elements of an Iota + Iota ten(10, 1); // 1..<11 + Iota sub(ten, Kokkos::pair{7, 9}); // 8, 9 + + EXPECT_EQ(sub.size(), 2); + EXPECT_EQ(sub(0), 8); + EXPECT_EQ(sub(1), 9); +} + +template +void test_iota() { + test_iota_constructor(); + test_iota_rank(); + test_iota_subview(); +} + +TEST_F(TestCategory, common_iota) { + test_iota(); + test_iota(); + test_iota(); + test_iota(); + + test_iota(); + test_iota(); + test_iota(); + test_iota(); + + test_iota(); + test_iota(); + test_iota(); + test_iota(); + + 
test_iota(); + test_iota(); + test_iota(); + test_iota(); +} + +#endif // TEST_COMMON_IOTA_HPP From 4abf2a3a8bc3410f79af02a7e51c3bc9b81d0b29 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 3 Mar 2023 12:51:51 -0700 Subject: [PATCH 101/442] rocsparse spmv tpl: Fix rocsparse_spmv call for rocm < 5.4.0 Addresses issue #1715 --- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index db719b43d8..7d14e304d7 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -436,9 +436,9 @@ void spmv_rocsparse(const KokkosKernels::Experimental::Controls& controls, rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, compute_type, alg, &buffer_size, tmp_buffer)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( - handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, - compute_type, alg, &buffer_size, tmp_buffer)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, + vecY, compute_type, alg, &buffer_size, tmp_buffer)); #endif KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(tmp_buffer)); From 6ce7ea4ecc993704de638e26bb05e9731fe42679 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 23 Feb 2023 20:43:27 -0700 Subject: [PATCH 102/442] Merge pull request #1695 from kokkos/update-changelog-to-4.0.0 Update changelog to 4.0.0 (cherry picked from commit 3bd6a3de36978f61aad80a25ef67b6d4a7544b91) --- CHANGELOG.md | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc04a9b6f0..fa19491753 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,109 @@ # Change Log +## 
[4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-02-21) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.0) + +### Features: +- Copyright update 4.0 [\#1657](https://github.com/kokkos/kokkos-kernels/pull/1657) +- Added google benchmark to kokkos kernel and to the CI [\#1626](https://github.com/kokkos/kokkos-kernels/pull/1626) + +#### Completing BLAS Level 1: +- ROTG: implementation of BLAS level1 rotg [\#1529](https://github.com/kokkos/kokkos-kernels/pull/1529) +- ROT: adding function to rotate two vector using Givens rotation coefficients [\#1581](https://github.com/kokkos/kokkos-kernels/pull/1581) +- ROTMG: adding rotmg implementation to KokkosBlas [\#1560](https://github.com/kokkos/kokkos-kernels/pull/1560) +- ROTM: adding blas 1 function for modified rotation [\#1583](https://github.com/kokkos/kokkos-kernels/pull/1583) +- SWAP: adding implementation of level 1 BLAS function [\#1612](https://github.com/kokkos/kokkos-kernels/pull/1612) + +#### New incomplete factorization algorithms: +- MDF implementation in parallel [\#1393](https://github.com/kokkos/kokkos-kernels/pull/1393) and [\#1624](https://github.com/kokkos/kokkos-kernels/pull/1624) +- Jgfouca/par ilut [\#1506](https://github.com/kokkos/kokkos-kernels/pull/1506) + +#### New additional features +- Add utility `KokkosSparse::removeCrsMatrixZeros` [\#1681](https://github.com/kokkos/kokkos-kernels/pull/1681) +- Add spgemm TPL support for cuSparse and rocSparse [\#1513](https://github.com/kokkos/kokkos-kernels/pull/1513) +- Add csr2csc [\#1446](https://github.com/kokkos/kokkos-kernels/pull/1446) +- Adding my weighted graph coarsening code into kokkos-kernels [\#1043](https://github.com/kokkos/kokkos-kernels/pull/1043) +- VBD/VBDBIT D1 coloring: support distributed graphs [\#1598](https://github.com/kokkos/kokkos-kernels/pull/1598) + +### Implemented enhancements: +- New tests for mixed-precision GEMM, some fixes for BLAS tests with non-ETI types 
[\#1615](https://github.com/kokkos/kokkos-kernels/pull/1615) +- Spgemm non-reuse: unification layer and TPLs [\#1678](https://github.com/kokkos/kokkos-kernels/pull/1678) +- Remove "slow mem space" device ETI [\#1619](https://github.com/kokkos/kokkos-kernels/pull/1619) +- First phase of SpGEMM TPL refactor [\#1582](https://github.com/kokkos/kokkos-kernels/pull/1582) +- Spgemm TPL refactor [\#1618](https://github.com/kokkos/kokkos-kernels/pull/1618) +- cleaned messages printed at configuration time [\#1616](https://github.com/kokkos/kokkos-kernels/pull/1616) +- Batched dense tests: splitting batched dense unit-tests [\#1608](https://github.com/kokkos/kokkos-kernels/pull/1608) +- sparse/unit_test: Use native spmv impl in bsr unit tests [\#1606](https://github.com/kokkos/kokkos-kernels/pull/1606) +- ROT* HIP: testing and improving rocBLAS support for ROT* kernels [\#1594](https://github.com/kokkos/kokkos-kernels/pull/1594) +- Add main functions for batched sparse solver performance tests [\#1554](https://github.com/kokkos/kokkos-kernels/pull/1554) +- Batched sparse kernels update [\#1546](https://github.com/kokkos/kokkos-kernels/pull/1546) +- supernodal SpTRSV : require invert-diag option to use SpMV [\#1518](https://github.com/kokkos/kokkos-kernels/pull/1518) +- Update --verbose option in D2 coloring perftest [\#1486](https://github.com/kokkos/kokkos-kernels/pull/1486) + +### Reorganization: +- Modular build: allowing to build components independently [\#1504](https://github.com/kokkos/kokkos-kernels/pull/1504) +- Move GMRES from example to sparse experimental [\#1620](https://github.com/kokkos/kokkos-kernels/pull/1620) +- Remove Experimental::BlockCrsMatrix (replaced with Experimental::BsrMatrix) [\#1458](https://github.com/kokkos/kokkos-kernels/pull/1458) +- Move {Team,TeamVector}Gemv to KokkosBlas [\#1435](https://github.com/kokkos/kokkos-kernels/pull/1435) +- Move SerialGEMV to KokkosBlas [\#1433](https://github.com/kokkos/kokkos-kernels/pull/1433) + +### Build 
System: +- CMake: export version and subversion to config file [\#1680](https://github.com/kokkos/kokkos-kernels/pull/1680) +- CMake: update package COMPATIBILITY mode in anticipation of release 4.0 [\#1645](https://github.com/kokkos/kokkos-kernels/pull/1645) +- FindTPLMKL.cmake: fix naming of mkl arg to FIND_PACKAGE_HANDLE_STANDARD_ARGS [\#1644](https://github.com/kokkos/kokkos-kernels/pull/1644) +- KokkosKernels: Use KOKKOSKERNELS_INCLUDE_DIRECTORIES() (TriBITSPub/TriBITS#429) [\#1635](https://github.com/kokkos/kokkos-kernels/pull/1635) +- Fix docs build [\#1569](https://github.com/kokkos/kokkos-kernels/pull/1569) +- KokkosKernels: Remove listing of undefined TPL deps (trilinos/Trilinos#11152) [\#1568](https://github.com/kokkos/kokkos-kernels/pull/1568) + +### Testing: +- Update nightly SYCL setup [\#1660](https://github.com/kokkos/kokkos-kernels/pull/1660) +- Add github DOCS ci check & disable Kokkos tests [\#1647](https://github.com/kokkos/kokkos-kernels/pull/1647) +- docs: Fix RTD build [\#1490](https://github.com/kokkos/kokkos-kernels/pull/1490) +- sparse/unit_test: Disable spmv_mv_heavy for all A64FX builds [\#1555](https://github.com/kokkos/kokkos-kernels/pull/1555) +- ROTMG: rocblas TPL turned off [\#1603](https://github.com/kokkos/kokkos-kernels/pull/1603) +- Fix HIP nightly build on ORNL Jenkins CI server [\#1544](https://github.com/kokkos/kokkos-kernels/pull/1544) +- Turn on cublas and cusparse in CLANG13CUDA10 CI check [\#1584](https://github.com/kokkos/kokkos-kernels/pull/1584) +- Add clang13+cuda10 PR build [\#1524](https://github.com/kokkos/kokkos-kernels/pull/1524) +- .githob/workflows: Fix redundant workflow triggers [\#1527](https://github.com/kokkos/kokkos-kernels/pull/1527) +- Add GCC test options for C++17 and disable perftests for INTEL19 [\#1511](https://github.com/kokkos/kokkos-kernels/pull/1511) +- Add INTEL19 and CUDA11 CI settings [\#1505](https://github.com/kokkos/kokkos-kernels/pull/1505) +- .github/workflows: use c++17 
[\#1484](https://github.com/kokkos/kokkos-kernels/pull/1484) + +### Bug Fixes: +- Workaround for array_sum_reduce if scalar is half_t and N is 3, 5 or 7 [\#1675](https://github.com/kokkos/kokkos-kernels/pull/1675) +- Fix the nondeterministic issue in SPILUK numeric [\#1683](https://github.com/kokkos/kokkos-kernels/pull/1683) +- Fix an error in Krylov Handle documentation [\#1659](https://github.com/kokkos/kokkos-kernels/pull/1659) +- ROTMG: loosen unit-test tolerance for Host TPLs [\#1638](https://github.com/kokkos/kokkos-kernels/pull/1638) +- SWAP: fixing obvious mistake in TPL layer : ( [\#1637](https://github.com/kokkos/kokkos-kernels/pull/1637) +- Fix 1631: Use Kokkos::LayoutRight with CrsMatrix values_type (Trilinos compatibility) [\#1633](https://github.com/kokkos/kokkos-kernels/pull/1633) +- Cuda/12 with CuSPARSE updates [\#1632](https://github.com/kokkos/kokkos-kernels/pull/1632) +- Fix 1627: cusparse 11.0-11.3 spgemm symbolic wrapper [\#1628](https://github.com/kokkos/kokkos-kernels/pull/1628) +- Make sure to call ExecutionSpace::concurrency() from an object [\#1614](https://github.com/kokkos/kokkos-kernels/pull/1614) +- SPGEMM: fixing the rocsparse interface [\#1607](https://github.com/kokkos/kokkos-kernels/pull/1607) +- Fix Trilinos issue 11033: remove compile time check to allow compilation with non-standard scalar types [\#1591](https://github.com/kokkos/kokkos-kernels/pull/1591) +- SPMM: fixing cuSPARSE issue with incompatible compute type and op [\#1587](https://github.com/kokkos/kokkos-kernels/pull/1587) +- ParILUT: convert two lambdas to functors [\#1580](https://github.com/kokkos/kokkos-kernels/pull/1580) +- Update kk_get_free_total_memory for SYCL [\#1579](https://github.com/kokkos/kokkos-kernels/pull/1579) +- SYCL: Use KOKKOS_IMPL_DO_NOT_USE_PRINTF instead of printf in kernels [\#1567](https://github.com/kokkos/kokkos-kernels/pull/1567) +- Rotg fixes for issue 1577 [\#1578](https://github.com/kokkos/kokkos-kernels/pull/1578) +- Rotg update: 
fixing the interface [\#1566](https://github.com/kokkos/kokkos-kernels/pull/1566) +- Fix rotg eti [\#1534](https://github.com/kokkos/kokkos-kernels/pull/1534) +- Fix to include KokkosBatched_Util.hpp [\#1565](https://github.com/kokkos/kokkos-kernels/pull/1565) +- TeamGemvInternal: reintroduce 12-arg invoke method [\#1561](https://github.com/kokkos/kokkos-kernels/pull/1561) +- Rename component options to avoid overloaded usage in Trilinos [\#1641](https://github.com/kokkos/kokkos-kernels/pull/1641) +- Avoid the SIMD code branch if the batched size is not a multiple of the vector length [\#1552](https://github.com/kokkos/kokkos-kernels/pull/1552) +- SYCL: Fix linking with ze_loader in Trilinos [\#1551](https://github.com/kokkos/kokkos-kernels/pull/1551) +- ARMPL Fixes and Workarounds [\#1543](https://github.com/kokkos/kokkos-kernels/pull/1543) +- Test_Graph_coarsen: replace HostMirror usage with auto [\#1538](https://github.com/kokkos/kokkos-kernels/pull/1538) +- Fix spgemm cusparse [\#1535](https://github.com/kokkos/kokkos-kernels/pull/1535) +- Warning fixes: Apple Clang complains about [-Werror,-Wunused-but-set-variable] [\#1532](https://github.com/kokkos/kokkos-kernels/pull/1532) +- In src/batched/dense: Barrier after broadcast [\#1520](https://github.com/kokkos/kokkos-kernels/pull/1520) +- Graph coarsen: fix test [\#1517](https://github.com/kokkos/kokkos-kernels/pull/1517) +- KokkosGraph_CoarsenHeuristics: remove volatile qualifier from join [\#1510](https://github.com/kokkos/kokkos-kernels/pull/1510) +- Replace capture [\#1502](https://github.com/kokkos/kokkos-kernels/pull/1502) +- utils: implicit copy-assign deprecated in array_sum_reduce [\#1494](https://github.com/kokkos/kokkos-kernels/pull/1494) + + ## [3.7.01](https://github.com/kokkos/kokkos-kernels/tree/3.7.01) (2022-12-01) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.00...3.7.01) From 4414f46c181119a0ba88d351cf42615a7379e357 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: 
Mon, 6 Mar 2023 10:36:04 -0700 Subject: [PATCH 103/442] GMRES: fixing some type issues related to memory space instantiation Basically one wants to be very careful about only instantiating View or other object with an execution space only as it might generate a memory type mismatch down the road --- example/gmres/test_prec.cpp | 6 ++++-- sparse/src/KokkosSparse_MatrixPrec.hpp | 11 ++++++----- sparse/src/KokkosSparse_Preconditioner.hpp | 11 ++++++----- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index f4aca0d6f6..8d1ff74b87 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -27,9 +27,11 @@ int main(int argc, char* argv[]) { using OT = int; using EXSP = Kokkos::DefaultExecutionSpace; using MESP = typename EXSP::memory_space; - using CRS = KokkosSparse::CrsMatrix; + using CRS = + KokkosSparse::CrsMatrix, void, OT>; - using ViewVectorType = Kokkos::View; + using ViewVectorType = + Kokkos::View>; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; diff --git a/sparse/src/KokkosSparse_MatrixPrec.hpp b/sparse/src/KokkosSparse_MatrixPrec.hpp index 56ec2cc8b7..1e2e408063 100644 --- a/sparse/src/KokkosSparse_MatrixPrec.hpp +++ b/sparse/src/KokkosSparse_MatrixPrec.hpp @@ -50,6 +50,7 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; //! Constructor: @@ -75,11 +76,11 @@ class MatrixPrec : public KokkosSparse::Experimental::Preconditioner { ///\cdot X\f$. ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. 
// - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { KokkosSparse::spmv(transM, alpha, _A, X, beta, Y); } //@} diff --git a/sparse/src/KokkosSparse_Preconditioner.hpp b/sparse/src/KokkosSparse_Preconditioner.hpp index 27bea71f33..99ce1a2f1a 100644 --- a/sparse/src/KokkosSparse_Preconditioner.hpp +++ b/sparse/src/KokkosSparse_Preconditioner.hpp @@ -53,6 +53,7 @@ class Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; //! Constructor: @@ -77,11 +78,11 @@ class Preconditioner { ///\cdot X\f$. ///// The typical case is \f$\beta = 0\f$ and \f$\alpha = 1\f$. // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const = 0; + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const = 0; //@} //! Set this preconditioner's parameters. 
From ba311291cbf14a79747eb44381e144cdb3c30eab Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 6 Mar 2023 16:45:11 -0700 Subject: [PATCH 104/442] Adding fix for LUPrec --- sparse/src/KokkosSparse_LUPrec.hpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sparse/src/KokkosSparse_LUPrec.hpp b/sparse/src/KokkosSparse_LUPrec.hpp index fddb1f0d68..a257b8f09c 100644 --- a/sparse/src/KokkosSparse_LUPrec.hpp +++ b/sparse/src/KokkosSparse_LUPrec.hpp @@ -44,6 +44,7 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { public: using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; + using MEMSP = typename CRS::memory_space; using karith = typename Kokkos::ArithTraits; using View1d = typename Kokkos::View; @@ -90,11 +91,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { ///// ///// It takes L and U and the stores U^inv L^inv X in Y // - virtual void apply(const Kokkos::View &X, - const Kokkos::View &Y, - const char transM[] = "N", - ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + virtual void apply( + const Kokkos::View> &X, + const Kokkos::View> &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp From d9df4fd6bbacb67e9b6d37f4be5298e0ba216c9e Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 7 Mar 2023 09:59:21 -0500 Subject: [PATCH 105/442] Drop obsolete workaround checking whether KOKKOS_IF_ON_{HOST,DEVICE} macros are defined This is always true from Kokkos release 3.6 and current develop does not even build any more against Kokkos 3.7 --- ...hed_Eigendecomposition_Serial_Internal.hpp | 32 ------------------- sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 22 ++----------- 2 files changed, 2 insertions(+), 52 deletions(-) diff --git 
a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index c1bc0439c5..f89b76e162 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -388,42 +388,10 @@ struct SerialEigendecompositionInternal { const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, const int wlen) { -#if defined(KOKKOS_IF_ON_HOST) KOKKOS_IF_ON_HOST((host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) KOKKOS_IF_ON_DEVICE((device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, urs1, w, wlen);)) -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) // FIXME remove when - // requiring minimum - // version of - // Kokkos 3.6 - // if (as0 == 1 || as1 == 1) { - /// column major or row major and it runs on host - /// potentially it can run tpls internally - // NOTE BMK: If LAPACK not enabled, this will static_assert. - // If neither stride is unit, will runtime assert. - // Otherwise will succeed using LAPACK. 
- host_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, - urs1, w, wlen); - /* - } else { - /// arbitrary strides should be handled by native implementation - device_invoke(m, - A, as0, as1, - er, ers, - ei, eis, - UL, uls0, uls1, - UR, urs0, urs1, - w, wlen); - throw std::runtime_error("Serial eigendecomposition without unit stride - implemented yet."); - } - */ -#else - /// device code runs - device_invoke(m, A, as0, as1, er, ers, ei, eis, UL, uls0, uls1, UR, urs0, - urs1, w, wlen); -#endif return 0; } }; diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 2831cb8861..ac3da5e45f 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -521,7 +521,7 @@ struct SPMV_Struct_Functor { const size_type rowOffset = m_A.graph.row_map(rowIdx); y_value_type sum(0.0); -#if defined(KOKKOS_IF_ON_HOST) + // clang-format off KOKKOS_IF_ON_HOST(( for (ordinal_type idx = 0; idx < 27; ++idx) { @@ -540,25 +540,7 @@ struct SPMV_Struct_Functor { }, sum); )) - // clang-format on -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) // FIXME remove when - // requiring minimum - // version of - // Kokkos 3.6 - for (ordinal_type idx = 0; idx < 27; ++idx) { - sum += - m_A.values(rowOffset + idx) * m_x(rowIdx + columnOffsets(idx)); - } -#else - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(dev, 27), - [&](const ordinal_type& idx, y_value_type& lclSum) { - lclSum += (conjugate ? 
ATV::conj(m_A.values(rowOffset + idx)) - : m_A.values(rowOffset + idx)) * - m_x(rowIdx + columnOffsets(idx)); - }, - sum); -#endif + // clang-format on Kokkos::single(Kokkos::PerThread(dev), [&]() { m_y(rowIdx) = beta * m_y(rowIdx) + alpha * sum; From f658cc4dd7b49b674053ae2e561ab1ba9fa57b20 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 8 Mar 2023 00:22:37 -0800 Subject: [PATCH 106/442] Add spiluk_numeric_streams interface --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 158 ++++++++ .../impl/KokkosSparse_spiluk_numeric_spec.hpp | 69 +++- sparse/src/KokkosSparse_spiluk.hpp | 372 +++++++++++++++++- ...osSparse_spiluk_numeric_tpl_spec_avail.hpp | 8 +- 4 files changed, 583 insertions(+), 24 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 9436b67029..ff60d21913 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -489,6 +489,164 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } // end iluk_numeric +template +void iluk_numeric_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v) { + using size_type = typename IlukHandle::size_type; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector lvl_ptr_h_v(nstreams); + std::vector lvl_idx_v(nstreams);//device views + 
std::vector lvl_start_v(nstreams); + std::vector lvl_end_v(nstreams); + std::vector iw_v(nstreams);//device views + std::vector stream_have_level_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; + for (int i = 0; i < nstreams; i++) { + nlevels_v[i] = thandle_v[i]->get_num_levels(); + lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); + lvl_idx_v[i] = thandle_v[i]->get_level_idx(); + iw_v[i] = thandle_v[i]->get_iw(); + stream_have_level_v[i] = true; + if (nlevels_max < nlevels_v[i]) + nlevels_max = nlevels_v[i]; + } + + std::cout << "iluk_numeric_streams--Max. number of levels among streams (nlevels_max): " + << nlevels_max << std::endl; + + // Assume all streams use the same algorithm + if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + for (int i = 0; i < nstreams; i++) { + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) + stream_have_level_v[i] = true; + else + stream_have_level_v[i] = false; + } + else + stream_have_level_v[i] = false; + } + + // Main work of the level across streams + // 1. Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Launch only if stream i-th has this level + if (stream_have_level_v[i]) { + ILUKLvlSchedRPNumericFunctor tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], iw_v[i], lvl_start_v[i]); + Kokkos::parallel_for("parfor_rp", Kokkos::RangePolicy(execspace_v[i], lvl_start_v[i], lvl_end_v[i]), tstf); + } // end if (stream_have_level_v[i]) + } // end for streams + + // 2. 
Wait for all streams finished + for (int i = 0; i < nstreams; i++) { + if (stream_have_level_v[i]) + execspace_v[i].fence(); + } // end for streams + } // end for lvl + } // end SEQLVLSCHD_RP + else if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + + std::vector lvl_nchunks_h_v(nstreams); + std::vector lvl_nrowsperchunk_h_v(nstreams); + std::vector lvl_rowid_start_v(nstreams); + std::vector team_size_v(nstreams); + + for (int i = 0; i < nstreams; i++) { + lvl_nchunks_h_v[i] = thandle_v[i]->get_level_nchunks(); + lvl_nrowsperchunk_h_v[i] = thandle_v[i]->get_level_nrowsperchunk(); + team_size_v[i] = thandle_v[i]->get_team_size(); + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // Initial work across streams at each level + nnz_lno_t lvl_nchunks_max = 0; + for (int i = 0; i < nstreams; i++) { + // Only do this if this stream has this level + if (lvl < nlevels_v[i]) { + lvl_start_v[i] = lvl_ptr_h_v[i](lvl); + lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); + if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { + stream_have_level_v[i] = true; + lvl_rowid_start_v[i] = 0; + if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) + lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); + } + else + stream_have_level_v[i] = false; + } + else + stream_have_level_v[i] = false; + } + + // Main work of the level across streams -- looping through chunnks + for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { + // 1. Launch work on all streams (for each chunk) + for (int i = 0; i < nstreams; i++) { + // Launch only if stream i-th has this level + if (stream_have_level_v[i]) { + // Launch only if stream i-th has this chunk + if (chunkid < lvl_nchunks_h_v[i](lvl)) { + // 1.a. Specify number of rows (i.e. 
number of teams) to launch + nnz_lno_t lvl_nrows_chunk = 0; + if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > (lvl_end_v[i] - lvl_start_v[i])) + lvl_nrows_chunk = (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; + else + lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); + + // 1.b. Create functor for stream i-th and launch + ILUKLvlSchedTP1NumericFunctor tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], iw_v[i], lvl_start_v[i] + lvl_rowid_start_v[i]); + if (team_size_v[i] == -1) + Kokkos::parallel_for("parfor_tp1", policy_type(execspace_v[i], lvl_nrows_chunk, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for("parfor_tp1", policy_type(execspace_v[i], lvl_nrows_chunk, team_size_v[i]), tstf); + + // 1.c. Ready to move to next chunk + lvl_rowid_start_v[i] += lvl_nrows_chunk; + } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) + } // end if (stream_have_level_v[i]) + } // end for streams + + // 2. 
Wait for all streams finishing + for (int i = 0; i < nstreams; i++) { + if (stream_have_level_v[i]) + if (chunkid < lvl_nchunks_h_v[i](lvl)) + execspace_v[i].fence(); + } // end for streams + } // end for chunkid + } // end for lvl + } // end SEQLVLSCHD_TP1 + +} // end iluk_numeric_streams + } // namespace Experimental } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index ec711a3b17..cee0846d66 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -31,10 +31,10 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spiluk_numeric_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct spiluk_numeric_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spiluk_numeric_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -99,15 +100,15 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosSparse::spiluk_numeric -template ::value, - bool eti_spec_avail = spiluk_numeric_eti_spec_avail< + bool eti_spec_avail = spiluk_numeric_eti_spec_avail::value> @@ -119,18 +120,30 @@ struct SPILUK_NUMERIC { const AValuesType &A_values, LRowMapType &L_row_map, LEntriesType &L_entries, LValuesType &L_values, URowMapType &U_row_map, UEntriesType &U_entries, UValuesType &U_values); + static void spiluk_numeric_streams( + const std::vector& execspace_v, + std::vector& handle_v, + const std::vector& A_row_map_v, + const std::vector& A_entries_v, + const std::vector& A_values_v, + const std::vector& L_row_map_v, + const std::vector& L_entries_v, + std::vector& L_values_v, + const std::vector& U_row_map_v, + const std::vector& 
U_entries_v, + std::vector& U_values_v); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spiluk_numeric // Unification layer -template -struct SPILUK_NUMERIC +struct SPILUK_NUMERIC { static void spiluk_numeric( KernelHandle *handle, @@ -146,6 +159,26 @@ struct SPILUK_NUMERIC& execspace_v, + std::vector& handle_v, + const std::vector& A_row_map_v, + const std::vector& A_entries_v, + const std::vector& A_values_v, + const std::vector& L_row_map_v, + const std::vector& L_entries_v, + std::vector& L_values_v, + const std::vector& U_row_map_v, + const std::vector& U_entries_v, + std::vector& U_values_v) { + std::vector spiluk_handle_v(execspace_v.size()); + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + spiluk_handle_v[i] = handle_v[i].get_spiluk_handle(); + } + + Experimental::iluk_numeric_streams(execspace_v, spiluk_handle_v, A_row_map_v, A_entries_v, A_values_v, L_row_map_v, L_entries_v, L_values_v, U_row_map_v, U_entries_v, U_values_v); + } }; #endif @@ -163,6 +196,7 @@ struct SPILUK_NUMERIC, \ @@ -208,6 +242,7 @@ struct SPILUK_NUMERIC, \ diff --git a/sparse/src/KokkosSparse_spiluk.hpp b/sparse/src/KokkosSparse_spiluk.hpp index ac2afc066f..428596bdc2 100644 --- a/sparse/src/KokkosSparse_spiluk.hpp +++ b/sparse/src/KokkosSparse_spiluk.hpp @@ -521,11 +521,12 @@ void spiluk_numeric(KernelHandle* handle, UEntries_Internal U_entries_i = U_entries; UValues_Internal U_values_i = U_values; - KokkosSparse::Impl::SPILUK_NUMERIC< + KokkosSparse::Impl::SPILUK_NUMERIC::spiluk_numeric(&tmp_handle, - fill_lev, A_rowmap_i, + fill_lev, + A_rowmap_i, A_entries_i, A_values_i, L_rowmap_i, @@ -535,7 +536,372 @@ void spiluk_numeric(KernelHandle* handle, U_entries_i, U_values_i); -} // spiluk_numeric +} // spiluk_numeric + +template +void spiluk_numeric_streams(const std::vector& execspace_v, + const std::vector& handle_v, + typename KernelHandle::const_nnz_lno_t fill_lev, + const std::vector& A_rowmap_v, + const 
std::vector& A_entries_v, + const std::vector& A_values_v, + const std::vector& L_rowmap_v, + const std::vector& L_entries_v, + std::vector& L_values_v, + const std::vector& U_rowmap_v, + const std::vector& U_entries_v, + std::vector& U_values_v) { + + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using scalar_type = typename KernelHandle::nnz_scalar_t; + + static_assert(Kokkos::is_execution_space::value, "ExecutionSpace is not valid"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in ARowMapType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in AEntriesType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in AValuesType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in LRowMapType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in LEntriesType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in LValuesType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in URowMapType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in UEntriesType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in UValuesType"); + + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename ARowMapType::non_const_value_type, size_type), + "spiluk_numeric_streams: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename 
AEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: A entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE(typename AValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: A scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename LRowMapType::non_const_value_type, size_type), + "spiluk_numeric_streams: L size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename LEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: L entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE(typename LValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: L scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename URowMapType::non_const_value_type, size_type), + "spiluk_numeric_streams: U size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename UEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: U entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE(typename UValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: U scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: A_rowmap is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: A_entries is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + 
"spiluk_numeric_streams: A_values is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: L_rowmap is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: L_entries is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: L_values is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: U_rowmap is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: U_entries is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "spiluk_numeric_streams: U_values is not a Kokkos::View."); + + static_assert( + (int)LRowMapType::rank == (int)ARowMapType::rank, + "spiluk_numeric_streams: The ranks of L_rowmap and A_rowmap do not match."); + static_assert( + (int)LEntriesType::rank == (int)AEntriesType::rank, + "spiluk_numeric_streams: The ranks of L_entries and A_entries do not match."); + static_assert( + (int)LValuesType::rank == (int)AValuesType::rank, + "spiluk_numeric_streams: The ranks of L_values and A_values do not match."); + + static_assert( + (int)LRowMapType::rank == (int)URowMapType::rank, + "spiluk_numeric_streams: The ranks of L_rowmap and U_rowmap do not match."); + static_assert( + (int)LEntriesType::rank == (int)UEntriesType::rank, + "spiluk_numeric_streams: The ranks of L_entries and U_entries do not match."); + static_assert( + (int)LValuesType::rank == (int)UValuesType::rank, + "spiluk_numeric_streams: The ranks of L_values and U_values do not match."); + + static_assert( + LRowMapType::rank == 1, + "spiluk_numeric_streams: A_rowmap, L_rowmap and U_rowmap must all have rank 1."); + static_assert(LEntriesType::rank == 1, + "spiluk_numeric_streams: A_entries, L_entries and U_entries must all " + "have rank 1."); + static_assert( + LValuesType::rank == 1, + "spiluk_numeric_streams: A_values, L_values and U_values must all have rank 1."); + + static_assert( + 
std::is_same::value, + "spiluk_numeric_streams: The output L_entries must be nonconst."); + static_assert(std::is_same::value, + "spiluk_numeric_streams: The output L_values must be nonconst."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: The output U_entries must be nonconst."); + static_assert(std::is_same::value, + "spiluk_numeric_streams: The output U_values must be nonconst."); + + static_assert(std::is_same::value, + "spiluk_numeric_streams: Views LRowMapType and ARowMapType have " + "different device_types."); + static_assert(std::is_same::value, + "spiluk_numeric_streams: Views LEntriesType and AEntriesType have " + "different device_types."); + static_assert(std::is_same::value, + "spiluk_numeric_streams: Views LValuesType and AValuesType have " + "different device_types."); + + static_assert(std::is_same::value, + "spiluk_numeric_streams: Views LRowMapType and URowMapType have " + "different device_types."); + static_assert(std::is_same::value, + "spiluk_numeric_streams: Views LEntriesType and UEntriesType have " + "different device_types."); + static_assert(std::is_same::value, + "spiluk_numeric_streams: Views LValuesType and UValuesType have " + "different device_types."); + + static_assert( + std::is_same::value, + "spiluk_numeric_streams: KernelHandle's execution space is different from " + "ExecutionSpace."); + + static_assert( + std::is_same< + typename LRowMapType::device_type::execution_space, + typename KernelHandle::SPILUKHandleType::execution_space>::value, + "spiluk_numeric_streams: KernelHandle and Views have different execution " + "spaces."); + static_assert( + std::is_same< + typename LEntriesType::device_type::execution_space, + typename KernelHandle::SPILUKHandleType::execution_space>::value, + "spiluk_numeric_streams: KernelHandle and Views have different execution " + "spaces."); + static_assert( + std::is_same< + typename LValuesType::device_type::execution_space, + typename 
KernelHandle::SPILUKHandleType::execution_space>::value, + "spiluk_numeric_streams: KernelHandle and Views have different execution " + "spaces."); + + static_assert( + std::is_same::value, + "spiluk_numeric_streams: rowmap and entries have different device types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: rowmap and values have different device types."); + + // Check validity of fill level + if (fill_lev < 0) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: fill_lev: " << fill_lev + << ". Valid value is >= 0."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Check sizes of vectors + if (execspace_v.size() != handle_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. handle_v.size() " << handle_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != A_rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. A_rowmap_v.size() " << A_rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != A_entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. A_entries_v.size() " << A_entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != A_values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. 
A_values_v.size() " << A_values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != L_rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. L_rowmap_v.size() " << L_rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != L_entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. L_entries_v.size() " << L_entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != L_values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. L_values_v.size() " << L_values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != U_rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. U_rowmap_v.size() " << U_rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != U_entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. U_entries_v.size() " << U_entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != U_values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. 
U_values_v.size() " << U_values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Check if symbolic has been called + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + if (handle_v[i]->get_spiluk_handle()->is_symbolic_complete() == false) { + std::ostringstream os; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: spiluk_symbolic must be " + "called before spiluk_numeric_streams -- stream " << i; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + } + + using c_size_t = typename KernelHandle::const_size_type; + using c_lno_t = typename KernelHandle::const_nnz_lno_t; + using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; + using c_exec_t = typename KernelHandle::HandleExecSpace; + using c_temp_t = typename KernelHandle::HandleTempMemorySpace; + using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; + + using const_handle_type = typename KokkosKernels::Experimental::KokkosKernelsHandle; + + using ARowMap_Internal = Kokkos::View< + typename ARowMapType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename ARowMapType::device_type, + Kokkos::MemoryTraits >; + + using AEntries_Internal = Kokkos::View< + typename AEntriesType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename AEntriesType::device_type, + Kokkos::MemoryTraits >; + + using AValues_Internal = Kokkos::View< + typename AValuesType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename AValuesType::device_type, + Kokkos::MemoryTraits >; + + using LRowMap_Internal = Kokkos::View< + typename LRowMapType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename LRowMapType::device_type, + Kokkos::MemoryTraits >; + + using LEntries_Internal = Kokkos::View< + typename LEntriesType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + 
typename LEntriesType::device_type, + Kokkos::MemoryTraits >; + + using LValues_Internal = Kokkos::View< + typename LValuesType::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename LValuesType::device_type, + Kokkos::MemoryTraits >; + + using URowMap_Internal = Kokkos::View< + typename URowMapType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename URowMapType::device_type, + Kokkos::MemoryTraits >; + + using UEntries_Internal = Kokkos::View< + typename UEntriesType::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename UEntriesType::device_type, + Kokkos::MemoryTraits >; + + using UValues_Internal = Kokkos::View< + typename UValuesType::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename UValuesType::device_type, + Kokkos::MemoryTraits >; + + std::vector handle_i_v(execspace_v.size()); + std::vector A_rowmap_i_v(execspace_v.size()); + std::vector A_entries_i_v(execspace_v.size()); + std::vector A_values_i_v(execspace_v.size()); + std::vector L_rowmap_i_v(execspace_v.size()); + std::vector L_entries_i_v(execspace_v.size()); + std::vector L_values_i_v(execspace_v.size()); + std::vector U_rowmap_i_v(execspace_v.size()); + std::vector U_entries_i_v(execspace_v.size()); + std::vector U_values_i_v(execspace_v.size()); + + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + handle_i_v[i] = const_handle_type(*(handle_v[i])); + A_rowmap_i_v[i] = A_rowmap_v[i]; + A_entries_i_v[i]= A_entries_v[i]; + A_values_i_v[i] = A_values_v[i]; + L_rowmap_i_v[i] = L_rowmap_v[i]; + L_entries_i_v[i]= L_entries_v[i]; + L_values_i_v[i] = L_values_v[i]; + U_rowmap_i_v[i] = U_rowmap_v[i]; + U_entries_i_v[i]= U_entries_v[i]; + U_values_i_v[i] = U_values_v[i]; + } + + KokkosSparse::Impl::SPILUK_NUMERIC::spiluk_numeric_streams(execspace_v, handle_i_v, A_rowmap_i_v, A_entries_i_v, A_values_i_v, L_rowmap_i_v, 
L_entries_i_v, L_values_i_v, U_rowmap_i_v, U_entries_i_v, U_values_i_v); + +} // spiluk_numeric_streams } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp index cf2a653e2a..87a4b9f001 100644 --- a/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spiluk_numeric_tpl_spec_avail.hpp @@ -20,10 +20,10 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spiluk_numeric_tpl_spec_avail { enum : bool { value = false }; }; From 1f74d4399fd937cffeb68f8d0dd0fe229f78eca2 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 8 Mar 2023 10:34:30 -0800 Subject: [PATCH 107/442] Add nstreams to avail_byte calculation --- sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 10 ++++++---- sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp | 7 +++---- sparse/src/KokkosSparse_spiluk.hpp | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 616e87f154..7c716d90d7 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -97,7 +97,7 @@ template (free_byte, total_byte); - avail_byte = static_cast(0.85 * free_byte); + avail_byte = static_cast(0.85 * static_cast(free_byte) / static_cast(nstreams)); } #endif @@ -225,7 +225,7 @@ void iluk_symbolic(IlukHandle& thandle, const ARowMapType& A_row_map_d, const AEntriesType& A_entries_d, LRowMapType& L_row_map_d, LEntriesType& L_entries_d, URowMapType& U_row_map_d, - UEntriesType& U_entries_d) { + UEntriesType& U_entries_d, int nstreams = 1) { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP || thandle.get_algorithm() == @@ -433,12 +433,14 @@ void 
iluk_symbolic(IlukHandle& thandle, if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, nlev); + level_idx, nlev, nstreams); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); + printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", thandle.get_level_maxrowsperchunk(), nrows, (size_t)(nrows)*(size_t)(thandle.get_level_maxrowsperchunk())*sizeof(nnz_lno_t)); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrows(), nrows); + printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", thandle.get_level_maxrows(), nrows, (size_t)(nrows)*(size_t)(thandle.get_level_maxrows())*sizeof(nnz_lno_t)); } Kokkos::deep_copy(dlevel_ptr, level_ptr); diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp index 86f018886c..cfd8524f24 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp @@ -100,7 +100,7 @@ struct SPILUK_SYMBOLIC { const typename KernelHandle::const_nnz_lno_t &fill_lev, const ARowMapType &A_row_map, const AEntriesType &A_entries, LRowMapType &L_row_map, LEntriesType &L_entries, URowMapType &U_row_map, - UEntriesType &U_entries); + UEntriesType &U_entries, int nstreams = 1); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -117,11 +117,10 @@ struct SPILUK_SYMBOLICget_spiluk_handle(); - Experimental::iluk_symbolic(*spiluk_handle, fill_lev, A_row_map, A_entries, - L_row_map, L_entries, U_row_map, U_entries); + Experimental::iluk_symbolic(*spiluk_handle, fill_lev, A_row_map, A_entries, L_row_map, L_entries, U_row_map, U_entries, nstreams); spiluk_handle->set_symbolic_complete(); } }; diff --git a/sparse/src/KokkosSparse_spiluk.hpp b/sparse/src/KokkosSparse_spiluk.hpp index 428596bdc2..4c88c1a205 100644 --- 
a/sparse/src/KokkosSparse_spiluk.hpp +++ b/sparse/src/KokkosSparse_spiluk.hpp @@ -46,7 +46,7 @@ void spiluk_symbolic(KernelHandle* handle, typename KernelHandle::const_nnz_lno_t fill_lev, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, LEntriesType& L_entries, - URowMapType& U_rowmap, UEntriesType& U_entries) { + URowMapType& U_rowmap, UEntriesType& U_entries, int nstreams = 1) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -240,7 +240,7 @@ void spiluk_symbolic(KernelHandle* handle, LEntries_Internal, URowMap_Internal, UEntries_Internal>::spiluk_symbolic(&tmp_handle, fill_lev, A_rowmap_i, A_entries_i, L_rowmap_i, L_entries_i, - U_rowmap_i, U_entries_i); + U_rowmap_i, U_entries_i, nstreams); } // spiluk_symbolic From d178771637fcd2e80ab782c8dd4fd638f80fa1b0 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Wed, 8 Mar 2023 11:47:24 -0700 Subject: [PATCH 108/442] Apply clang format --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 147 +++--- .../impl/KokkosSparse_spiluk_numeric_spec.hpp | 70 +-- .../KokkosSparse_spiluk_symbolic_impl.hpp | 13 +- .../KokkosSparse_spiluk_symbolic_spec.hpp | 4 +- sparse/src/KokkosSparse_spiluk.hpp | 446 +++++++++++------- 5 files changed, 404 insertions(+), 276 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index ff60d21913..831f3796c2 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -489,54 +489,55 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } // end iluk_numeric -template +template void iluk_numeric_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &A_row_map_v, - const std::vector &A_entries_v, - const std::vector &A_values_v, - const std::vector &L_row_map_v, - const std::vector &L_entries_v, - std::vector &L_values_v, 
- const std::vector &U_row_map_v, - const std::vector &U_entries_v, - std::vector &U_values_v) { - using size_type = typename IlukHandle::size_type; - using nnz_lno_t = typename IlukHandle::nnz_lno_t; + const std::vector &thandle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v) { + using size_type = typename IlukHandle::size_type; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; // Create vectors for handles' data in streams int nstreams = execspace_v.size(); std::vector nlevels_v(nstreams); std::vector lvl_ptr_h_v(nstreams); - std::vector lvl_idx_v(nstreams);//device views + std::vector lvl_idx_v(nstreams); // device views std::vector lvl_start_v(nstreams); std::vector lvl_end_v(nstreams); - std::vector iw_v(nstreams);//device views + std::vector iw_v(nstreams); // device views std::vector stream_have_level_v(nstreams); // Retrieve data from handles and find max. 
number of levels among streams size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); - lvl_idx_v[i] = thandle_v[i]->get_level_idx(); - iw_v[i] = thandle_v[i]->get_iw(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); + lvl_idx_v[i] = thandle_v[i]->get_level_idx(); + iw_v[i] = thandle_v[i]->get_iw(); stream_have_level_v[i] = true; - if (nlevels_max < nlevels_v[i]) - nlevels_max = nlevels_v[i]; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; } - std::cout << "iluk_numeric_streams--Max. number of levels among streams (nlevels_max): " - << nlevels_max << std::endl; + std::cout << "iluk_numeric_streams--Max. number of levels among streams " + "(nlevels_max): " + << nlevels_max << std::endl; // Assume all streams use the same algorithm - if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { + if (thandle_v[0]->get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { // Main loop must be performed sequential for (size_type lvl = 0; lvl < nlevels_max; lvl++) { // Initial work across streams at each level @@ -549,29 +550,39 @@ void iluk_numeric_streams(const std::vector &execspace_v, stream_have_level_v[i] = true; else stream_have_level_v[i] = false; - } - else + } else stream_have_level_v[i] = false; } - + // Main work of the level across streams // 1. 
Launch work on all streams for (int i = 0; i < nstreams; i++) { // Launch only if stream i-th has this level if (stream_have_level_v[i]) { - ILUKLvlSchedRPNumericFunctor tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], iw_v[i], lvl_start_v[i]); - Kokkos::parallel_for("parfor_rp", Kokkos::RangePolicy(execspace_v[i], lvl_start_v[i], lvl_end_v[i]), tstf); + ILUKLvlSchedRPNumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + LValuesType, URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], + iw_v[i], lvl_start_v[i]); + Kokkos::parallel_for( + "parfor_rp", + Kokkos::RangePolicy(execspace_v[i], + lvl_start_v[i], lvl_end_v[i]), + tstf); } // end if (stream_have_level_v[i]) - } // end for streams - + } // end for streams + // 2. 
Wait for all streams finished for (int i = 0; i < nstreams; i++) { - if (stream_have_level_v[i]) - execspace_v[i].fence(); + if (stream_have_level_v[i]) execspace_v[i].fence(); } // end for streams - } // end for lvl - } // end SEQLVLSCHD_RP - else if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + } // end for lvl + } // end SEQLVLSCHD_RP + else if (thandle_v[0]->get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; std::vector lvl_nchunks_h_v(nstreams); @@ -596,17 +607,15 @@ void iluk_numeric_streams(const std::vector &execspace_v, lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); if ((lvl_end_v[i] - lvl_start_v[i]) != 0) { stream_have_level_v[i] = true; - lvl_rowid_start_v[i] = 0; + lvl_rowid_start_v[i] = 0; if (lvl_nchunks_max < lvl_nchunks_h_v[i](lvl)) lvl_nchunks_max = lvl_nchunks_h_v[i](lvl); - } - else + } else stream_have_level_v[i] = false; - } - else + } else stream_have_level_v[i] = false; } - + // Main work of the level across streams -- looping through chunnks for (int chunkid = 0; chunkid < lvl_nchunks_max; chunkid++) { // 1. Launch work on all streams (for each chunk) @@ -617,33 +626,49 @@ void iluk_numeric_streams(const std::vector &execspace_v, if (chunkid < lvl_nchunks_h_v[i](lvl)) { // 1.a. Specify number of rows (i.e. number of teams) to launch nnz_lno_t lvl_nrows_chunk = 0; - if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > (lvl_end_v[i] - lvl_start_v[i])) - lvl_nrows_chunk = (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; + if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > + (lvl_end_v[i] - lvl_start_v[i])) + lvl_nrows_chunk = + (lvl_end_v[i] - lvl_start_v[i]) - lvl_rowid_start_v[i]; else lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); // 1.b. 
Create functor for stream i-th and launch - ILUKLvlSchedTP1NumericFunctor tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], L_row_map_v[i], L_entries_v[i], L_values_v[i], U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], iw_v[i], lvl_start_v[i] + lvl_rowid_start_v[i]); + ILUKLvlSchedTP1NumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, + LEntriesType, LValuesType, URowMapType, UEntriesType, + UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + lvl_idx_v[i], iw_v[i], + lvl_start_v[i] + lvl_rowid_start_v[i]); if (team_size_v[i] == -1) - Kokkos::parallel_for("parfor_tp1", policy_type(execspace_v[i], lvl_nrows_chunk, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_tp1", + policy_type(execspace_v[i], lvl_nrows_chunk, Kokkos::AUTO), + tstf); else - Kokkos::parallel_for("parfor_tp1", policy_type(execspace_v[i], lvl_nrows_chunk, team_size_v[i]), tstf); - + Kokkos::parallel_for( + "parfor_tp1", + policy_type(execspace_v[i], lvl_nrows_chunk, + team_size_v[i]), + tstf); + // 1.c. Ready to move to next chunk lvl_rowid_start_v[i] += lvl_nrows_chunk; } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) - } // end if (stream_have_level_v[i]) - } // end for streams - + } // end if (stream_have_level_v[i]) + } // end for streams + // 2. 
Wait for all streams finishing for (int i = 0; i < nstreams; i++) { if (stream_have_level_v[i]) - if (chunkid < lvl_nchunks_h_v[i](lvl)) - execspace_v[i].fence(); + if (chunkid < lvl_nchunks_h_v[i](lvl)) execspace_v[i].fence(); } // end for streams - } // end for chunkid - } // end for lvl - } // end SEQLVLSCHD_TP1 + } // end for chunkid + } // end for lvl + } // end SEQLVLSCHD_TP1 } // end iluk_numeric_streams diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index cee0846d66..6081878e9c 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -103,15 +103,15 @@ namespace Impl { template ::value, - bool eti_spec_avail = spiluk_numeric_eti_spec_avail::value> + class UEntriesType, class UValuesType, + bool tpl_spec_avail = spiluk_numeric_tpl_spec_avail< + ExecutionSpace, KernelHandle, ARowMapType, AEntriesType, + AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, + UEntriesType, UValuesType>::value, + bool eti_spec_avail = spiluk_numeric_eti_spec_avail< + ExecutionSpace, KernelHandle, ARowMapType, AEntriesType, + AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, + UEntriesType, UValuesType>::value> struct SPILUK_NUMERIC { static void spiluk_numeric( KernelHandle *handle, @@ -121,17 +121,17 @@ struct SPILUK_NUMERIC { LEntriesType &L_entries, LValuesType &L_values, URowMapType &U_row_map, UEntriesType &U_entries, UValuesType &U_values); static void spiluk_numeric_streams( - const std::vector& execspace_v, - std::vector& handle_v, - const std::vector& A_row_map_v, - const std::vector& A_entries_v, - const std::vector& A_values_v, - const std::vector& L_row_map_v, - const std::vector& L_entries_v, - std::vector& L_values_v, - const std::vector& U_row_map_v, - const std::vector& U_entries_v, - std::vector& U_values_v); + const std::vector &execspace_v, + std::vector &handle_v, + const std::vector &A_row_map_v, + 
const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -161,23 +161,27 @@ struct SPILUK_NUMERIC& execspace_v, - std::vector& handle_v, - const std::vector& A_row_map_v, - const std::vector& A_entries_v, - const std::vector& A_values_v, - const std::vector& L_row_map_v, - const std::vector& L_entries_v, - std::vector& L_values_v, - const std::vector& U_row_map_v, - const std::vector& U_entries_v, - std::vector& U_values_v) { - std::vector spiluk_handle_v(execspace_v.size()); + const std::vector &execspace_v, + std::vector &handle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v) { + std::vector spiluk_handle_v( + execspace_v.size()); for (int i = 0; i < static_cast(execspace_v.size()); i++) { spiluk_handle_v[i] = handle_v[i].get_spiluk_handle(); } - Experimental::iluk_numeric_streams(execspace_v, spiluk_handle_v, A_row_map_v, A_entries_v, A_values_v, L_row_map_v, L_entries_v, L_values_v, U_row_map_v, U_entries_v, U_values_v); + Experimental::iluk_numeric_streams(execspace_v, spiluk_handle_v, + A_row_map_v, A_entries_v, A_values_v, + L_row_map_v, L_entries_v, L_values_v, + U_row_map_v, U_entries_v, U_values_v); } }; diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 7c716d90d7..9edc6559b6 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -152,7 +152,8 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType 
row_map, size_t free_byte, total_byte; KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - avail_byte = static_cast(0.85 * static_cast(free_byte) / static_cast(nstreams)); + avail_byte = static_cast(0.85 * static_cast(free_byte) / + static_cast(nstreams)); } #endif @@ -435,12 +436,18 @@ void iluk_symbolic(IlukHandle& thandle, level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev, nstreams); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); - printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", thandle.get_level_maxrowsperchunk(), nrows, (size_t)(nrows)*(size_t)(thandle.get_level_maxrowsperchunk())*sizeof(nnz_lno_t)); + printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", + thandle.get_level_maxrowsperchunk(), nrows, + (size_t)(nrows) * (size_t)(thandle.get_level_maxrowsperchunk()) * + sizeof(nnz_lno_t)); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrows(), nrows); - printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", thandle.get_level_maxrows(), nrows, (size_t)(nrows)*(size_t)(thandle.get_level_maxrows())*sizeof(nnz_lno_t)); + printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", + thandle.get_level_maxrows(), nrows, + (size_t)(nrows) * (size_t)(thandle.get_level_maxrows()) * + sizeof(nnz_lno_t)); } Kokkos::deep_copy(dlevel_ptr, level_ptr); diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp index cfd8524f24..32f306904b 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp @@ -120,7 +120,9 @@ struct SPILUK_SYMBOLICget_spiluk_handle(); - Experimental::iluk_symbolic(*spiluk_handle, fill_lev, A_row_map, A_entries, L_row_map, L_entries, U_row_map, U_entries, nstreams); + Experimental::iluk_symbolic(*spiluk_handle, fill_lev, A_row_map, A_entries, + L_row_map, L_entries, U_row_map, 
U_entries, + nstreams); spiluk_handle->set_symbolic_complete(); } }; diff --git a/sparse/src/KokkosSparse_spiluk.hpp b/sparse/src/KokkosSparse_spiluk.hpp index 4c88c1a205..885f742f89 100644 --- a/sparse/src/KokkosSparse_spiluk.hpp +++ b/sparse/src/KokkosSparse_spiluk.hpp @@ -46,7 +46,8 @@ void spiluk_symbolic(KernelHandle* handle, typename KernelHandle::const_nnz_lno_t fill_lev, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, LEntriesType& L_entries, - URowMapType& U_rowmap, UEntriesType& U_entries, int nstreams = 1) { + URowMapType& U_rowmap, UEntriesType& U_entries, + int nstreams = 1) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -521,22 +522,16 @@ void spiluk_numeric(KernelHandle* handle, UEntries_Internal U_entries_i = U_entries; UValues_Internal U_values_i = U_values; - KokkosSparse::Impl::SPILUK_NUMERIC::spiluk_numeric(&tmp_handle, - fill_lev, - A_rowmap_i, - A_entries_i, - A_values_i, - L_rowmap_i, - L_entries_i, - L_values_i, - U_rowmap_i, - U_entries_i, - U_values_i); - -} // spiluk_numeric + KokkosSparse::Impl::SPILUK_NUMERIC< + typename AValuesType::execution_space, const_handle_type, + ARowMap_Internal, AEntries_Internal, AValues_Internal, LRowMap_Internal, + LEntries_Internal, LValues_Internal, URowMap_Internal, UEntries_Internal, + UValues_Internal>::spiluk_numeric(&tmp_handle, fill_lev, A_rowmap_i, + A_entries_i, A_values_i, L_rowmap_i, + L_entries_i, L_values_i, U_rowmap_i, + U_entries_i, U_values_i); + +} // spiluk_numeric template & execspace_v, const std::vector& A_values_v, const std::vector& L_rowmap_v, const std::vector& L_entries_v, - std::vector& L_values_v, + std::vector& L_values_v, const std::vector& U_rowmap_v, const std::vector& U_entries_v, - std::vector& U_values_v) { - - using size_type = typename KernelHandle::size_type; + std::vector& U_values_v) { + using size_type = typename KernelHandle::size_type; using ordinal_type = typename 
KernelHandle::nnz_lno_t; - using scalar_type = typename KernelHandle::nnz_scalar_t; - - static_assert(Kokkos::is_execution_space::value, "ExecutionSpace is not valid"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in ARowMapType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in AEntriesType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in AValuesType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in LRowMapType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in LEntriesType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in LValuesType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in URowMapType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in UEntriesType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "spiluk_numeric_streams: ExecutionSpace cannot access data in UValuesType"); - - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( - typename ARowMapType::non_const_value_type, size_type), - "spiluk_numeric_streams: A size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( - typename AEntriesType::non_const_value_type, ordinal_type), - "spiluk_numeric_streams: A entry type must match KernelHandle entry " - "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE(typename AValuesType::value_type, - scalar_type), - "spiluk_numeric_streams: A scalar type must match KernelHandle entry " - "type (aka 
nnz_lno_t, and const doesn't matter)"); - - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( - typename LRowMapType::non_const_value_type, size_type), - "spiluk_numeric_streams: L size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( - typename LEntriesType::non_const_value_type, ordinal_type), - "spiluk_numeric_streams: L entry type must match KernelHandle entry " - "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE(typename LValuesType::value_type, - scalar_type), - "spiluk_numeric_streams: L scalar type must match KernelHandle entry " - "type (aka nnz_lno_t, and const doesn't matter)"); - - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( - typename URowMapType::non_const_value_type, size_type), - "spiluk_numeric_streams: U size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE( - typename UEntriesType::non_const_value_type, ordinal_type), - "spiluk_numeric_streams: U entry type must match KernelHandle entry " - "type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPILUK_SAME_TYPE(typename UValuesType::value_type, - scalar_type), - "spiluk_numeric_streams: U scalar type must match KernelHandle entry " - "type (aka nnz_lno_t, and const doesn't matter)"); + using scalar_type = typename KernelHandle::nnz_scalar_t; + + static_assert(Kokkos::is_execution_space::value, + "ExecutionSpace is not valid"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename ARowMapType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "ARowMapType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename AEntriesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "AEntriesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename 
AValuesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "AValuesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename LRowMapType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "LRowMapType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename LEntriesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "LEntriesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename LValuesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "LValuesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename URowMapType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "URowMapType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename UEntriesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "UEntriesType"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename UValuesType::memory_space>::accessible, + "spiluk_numeric_streams: ExecutionSpace cannot access data in " + "UValuesType"); + + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename ARowMapType::non_const_value_type, + size_type), + "spiluk_numeric_streams: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename AEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: A entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename AValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: A scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + 
static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename LRowMapType::non_const_value_type, + size_type), + "spiluk_numeric_streams: L size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename LEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: L entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename LValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: L scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename URowMapType::non_const_value_type, + size_type), + "spiluk_numeric_streams: U size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE( + typename UEntriesType::non_const_value_type, ordinal_type), + "spiluk_numeric_streams: U entry type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPILUK_SAME_TYPE(typename UValuesType::value_type, + scalar_type), + "spiluk_numeric_streams: U scalar type must match KernelHandle entry " + "type (aka nnz_lno_t, and const doesn't matter)"); static_assert(Kokkos::is_view::value, "spiluk_numeric_streams: A_rowmap is not a Kokkos::View."); @@ -628,81 +668,92 @@ void spiluk_numeric_streams(const std::vector& execspace_v, static_assert(Kokkos::is_view::value, "spiluk_numeric_streams: U_values is not a Kokkos::View."); - static_assert( - (int)LRowMapType::rank == (int)ARowMapType::rank, - "spiluk_numeric_streams: The ranks of L_rowmap and A_rowmap do not match."); - static_assert( - (int)LEntriesType::rank == (int)AEntriesType::rank, - "spiluk_numeric_streams: The ranks of L_entries and A_entries do not match."); - static_assert( - (int)LValuesType::rank == (int)AValuesType::rank, 
- "spiluk_numeric_streams: The ranks of L_values and A_values do not match."); - - static_assert( - (int)LRowMapType::rank == (int)URowMapType::rank, - "spiluk_numeric_streams: The ranks of L_rowmap and U_rowmap do not match."); - static_assert( - (int)LEntriesType::rank == (int)UEntriesType::rank, - "spiluk_numeric_streams: The ranks of L_entries and U_entries do not match."); - static_assert( - (int)LValuesType::rank == (int)UValuesType::rank, - "spiluk_numeric_streams: The ranks of L_values and U_values do not match."); - - static_assert( - LRowMapType::rank == 1, - "spiluk_numeric_streams: A_rowmap, L_rowmap and U_rowmap must all have rank 1."); - static_assert(LEntriesType::rank == 1, - "spiluk_numeric_streams: A_entries, L_entries and U_entries must all " - "have rank 1."); - static_assert( - LValuesType::rank == 1, - "spiluk_numeric_streams: A_values, L_values and U_values must all have rank 1."); + static_assert((int)LRowMapType::rank == (int)ARowMapType::rank, + "spiluk_numeric_streams: The ranks of L_rowmap and A_rowmap do " + "not match."); + static_assert((int)LEntriesType::rank == (int)AEntriesType::rank, + "spiluk_numeric_streams: The ranks of L_entries and A_entries " + "do not match."); + static_assert((int)LValuesType::rank == (int)AValuesType::rank, + "spiluk_numeric_streams: The ranks of L_values and A_values do " + "not match."); + + static_assert((int)LRowMapType::rank == (int)URowMapType::rank, + "spiluk_numeric_streams: The ranks of L_rowmap and U_rowmap do " + "not match."); + static_assert((int)LEntriesType::rank == (int)UEntriesType::rank, + "spiluk_numeric_streams: The ranks of L_entries and U_entries " + "do not match."); + static_assert((int)LValuesType::rank == (int)UValuesType::rank, + "spiluk_numeric_streams: The ranks of L_values and U_values do " + "not match."); + + static_assert(LRowMapType::rank == 1, + "spiluk_numeric_streams: A_rowmap, L_rowmap and U_rowmap must " + "all have rank 1."); + static_assert( + LEntriesType::rank == 
1, + "spiluk_numeric_streams: A_entries, L_entries and U_entries must all " + "have rank 1."); + static_assert(LValuesType::rank == 1, + "spiluk_numeric_streams: A_values, L_values and U_values must " + "all have rank 1."); static_assert( std::is_same::value, "spiluk_numeric_streams: The output L_entries must be nonconst."); - static_assert(std::is_same::value, - "spiluk_numeric_streams: The output L_values must be nonconst."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: The output L_values must be nonconst."); static_assert( std::is_same::value, "spiluk_numeric_streams: The output U_entries must be nonconst."); - static_assert(std::is_same::value, - "spiluk_numeric_streams: The output U_values must be nonconst."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: The output U_values must be nonconst."); - static_assert(std::is_same::value, - "spiluk_numeric_streams: Views LRowMapType and ARowMapType have " - "different device_types."); - static_assert(std::is_same::value, - "spiluk_numeric_streams: Views LEntriesType and AEntriesType have " - "different device_types."); - static_assert(std::is_same::value, - "spiluk_numeric_streams: Views LValuesType and AValuesType have " - "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LRowMapType and ARowMapType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LEntriesType and AEntriesType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LValuesType and AValuesType have " + "different device_types."); - static_assert(std::is_same::value, - "spiluk_numeric_streams: Views LRowMapType and URowMapType have " - "different device_types."); - static_assert(std::is_same::value, - "spiluk_numeric_streams: Views LEntriesType and UEntriesType have " - "different device_types."); - static_assert(std::is_same::value, - 
"spiluk_numeric_streams: Views LValuesType and UValuesType have " - "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LRowMapType and URowMapType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LEntriesType and UEntriesType have " + "different device_types."); + static_assert( + std::is_same::value, + "spiluk_numeric_streams: Views LValuesType and UValuesType have " + "different device_types."); static_assert( - std::is_same::value, - "spiluk_numeric_streams: KernelHandle's execution space is different from " + "spiluk_numeric_streams: KernelHandle's execution space is different " + "from " "ExecutionSpace."); static_assert( @@ -724,10 +775,10 @@ void spiluk_numeric_streams(const std::vector& execspace_v, "spiluk_numeric_streams: KernelHandle and Views have different execution " "spaces."); - static_assert( - std::is_same::value, - "spiluk_numeric_streams: rowmap and entries have different device types."); + static_assert(std::is_same::value, + "spiluk_numeric_streams: rowmap and entries have different " + "device types."); static_assert( std::is_same::value, @@ -736,78 +787,103 @@ void spiluk_numeric_streams(const std::vector& execspace_v, // Check validity of fill level if (fill_lev < 0) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: fill_lev: " << fill_lev - << ". Valid value is >= 0."; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: fill_lev: " + << fill_lev << ". Valid value is >= 0."; KokkosKernels::Impl::throw_runtime_exception(os.str()); } // Check sizes of vectors if (execspace_v.size() != handle_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. 
handle_v.size() " << handle_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. handle_v.size() " << handle_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != A_rowmap_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. A_rowmap_v.size() " << A_rowmap_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. A_rowmap_v.size() " << A_rowmap_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != A_entries_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. A_entries_v.size() " << A_entries_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. A_entries_v.size() " + << A_entries_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != A_values_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. A_values_v.size() " << A_values_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. A_values_v.size() " << A_values_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != L_rowmap_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. 
L_rowmap_v.size() " << L_rowmap_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. L_rowmap_v.size() " << L_rowmap_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != L_entries_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. L_entries_v.size() " << L_entries_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. L_entries_v.size() " + << L_entries_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != L_values_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. L_values_v.size() " << L_values_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. L_values_v.size() " << L_values_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != U_rowmap_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. U_rowmap_v.size() " << U_rowmap_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
U_rowmap_v.size() " << U_rowmap_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != U_entries_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. U_entries_v.size() " << U_entries_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. U_entries_v.size() " + << U_entries_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (execspace_v.size() != U_values_v.size()) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes must match -- execspace_v.size() " << execspace_v.size() << " vs. U_values_v.size() " << U_values_v.size(); + os << "KokkosSparse::Experimental::spiluk_numeric_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
U_values_v.size() " << U_values_v.size(); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - + // Check if symbolic has been called for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (handle_v[i]->get_spiluk_handle()->is_symbolic_complete() == false) { std::ostringstream os; - os << "KokkosSparse::Experimental::spiluk_numeric_streams: spiluk_symbolic must be " - "called before spiluk_numeric_streams -- stream " << i; + os << "KokkosSparse::Experimental::spiluk_numeric_streams: " + "spiluk_symbolic must be " + "called before spiluk_numeric_streams -- stream " + << i; KokkosKernels::Impl::throw_runtime_exception(os.str()); } } @@ -819,9 +895,11 @@ void spiluk_numeric_streams(const std::vector& execspace_v, using c_temp_t = typename KernelHandle::HandleTempMemorySpace; using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; - using const_handle_type = typename KokkosKernels::Experimental::KokkosKernelsHandle; + using const_handle_type = + typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, c_exec_t, c_temp_t, c_persist_t>; - using ARowMap_Internal = Kokkos::View< + using ARowMap_Internal = Kokkos::View< typename ARowMapType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename ARowMapType::device_type, @@ -829,7 +907,8 @@ void spiluk_numeric_streams(const std::vector& execspace_v, using AEntries_Internal = Kokkos::View< typename AEntriesType::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename KokkosKernels::Impl::GetUnifiedLayout< + AEntriesType>::array_layout, typename AEntriesType::device_type, Kokkos::MemoryTraits >; @@ -847,7 +926,8 @@ void spiluk_numeric_streams(const std::vector& execspace_v, using LEntries_Internal = Kokkos::View< typename LEntriesType::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename KokkosKernels::Impl::GetUnifiedLayout< + 
LEntriesType>::array_layout, typename LEntriesType::device_type, Kokkos::MemoryTraits >; @@ -865,7 +945,8 @@ void spiluk_numeric_streams(const std::vector& execspace_v, using UEntries_Internal = Kokkos::View< typename UEntriesType::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename KokkosKernels::Impl::GetUnifiedLayout< + UEntriesType>::array_layout, typename UEntriesType::device_type, Kokkos::MemoryTraits >; @@ -875,31 +956,40 @@ void spiluk_numeric_streams(const std::vector& execspace_v, typename UValuesType::device_type, Kokkos::MemoryTraits >; - std::vector handle_i_v(execspace_v.size()); - std::vector A_rowmap_i_v(execspace_v.size()); - std::vector A_entries_i_v(execspace_v.size()); - std::vector A_values_i_v(execspace_v.size()); - std::vector L_rowmap_i_v(execspace_v.size()); - std::vector L_entries_i_v(execspace_v.size()); - std::vector L_values_i_v(execspace_v.size()); - std::vector U_rowmap_i_v(execspace_v.size()); - std::vector U_entries_i_v(execspace_v.size()); - std::vector U_values_i_v(execspace_v.size()); + std::vector handle_i_v(execspace_v.size()); + std::vector A_rowmap_i_v(execspace_v.size()); + std::vector A_entries_i_v(execspace_v.size()); + std::vector A_values_i_v(execspace_v.size()); + std::vector L_rowmap_i_v(execspace_v.size()); + std::vector L_entries_i_v(execspace_v.size()); + std::vector L_values_i_v(execspace_v.size()); + std::vector U_rowmap_i_v(execspace_v.size()); + std::vector U_entries_i_v(execspace_v.size()); + std::vector U_values_i_v(execspace_v.size()); for (int i = 0; i < static_cast(execspace_v.size()); i++) { - handle_i_v[i] = const_handle_type(*(handle_v[i])); - A_rowmap_i_v[i] = A_rowmap_v[i]; - A_entries_i_v[i]= A_entries_v[i]; - A_values_i_v[i] = A_values_v[i]; - L_rowmap_i_v[i] = L_rowmap_v[i]; - L_entries_i_v[i]= L_entries_v[i]; - L_values_i_v[i] = L_values_v[i]; - U_rowmap_i_v[i] = U_rowmap_v[i]; - U_entries_i_v[i]= U_entries_v[i]; - U_values_i_v[i] = U_values_v[i]; + 
handle_i_v[i] = const_handle_type(*(handle_v[i])); + A_rowmap_i_v[i] = A_rowmap_v[i]; + A_entries_i_v[i] = A_entries_v[i]; + A_values_i_v[i] = A_values_v[i]; + L_rowmap_i_v[i] = L_rowmap_v[i]; + L_entries_i_v[i] = L_entries_v[i]; + L_values_i_v[i] = L_values_v[i]; + U_rowmap_i_v[i] = U_rowmap_v[i]; + U_entries_i_v[i] = U_entries_v[i]; + U_values_i_v[i] = U_values_v[i]; } - KokkosSparse::Impl::SPILUK_NUMERIC::spiluk_numeric_streams(execspace_v, handle_i_v, A_rowmap_i_v, A_entries_i_v, A_values_i_v, L_rowmap_i_v, L_entries_i_v, L_values_i_v, U_rowmap_i_v, U_entries_i_v, U_values_i_v); + KokkosSparse::Impl::SPILUK_NUMERIC< + ExecutionSpace, const_handle_type, ARowMap_Internal, AEntries_Internal, + AValues_Internal, LRowMap_Internal, LEntries_Internal, LValues_Internal, + URowMap_Internal, UEntries_Internal, + UValues_Internal>::spiluk_numeric_streams(execspace_v, handle_i_v, + A_rowmap_i_v, A_entries_i_v, + A_values_i_v, L_rowmap_i_v, + L_entries_i_v, L_values_i_v, + U_rowmap_i_v, U_entries_i_v, + U_values_i_v); } // spiluk_numeric_streams From 004c1c041444bf1117a0ba5fe0a415537a722bc3 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 8 Mar 2023 23:57:27 -0800 Subject: [PATCH 109/442] Fix undefined reference errors and clean up printf statements --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 4 - .../KokkosSparse_spiluk_symbolic_impl.hpp | 8 - sparse/src/KokkosSparse_spiluk.hpp | 4 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 205 ++++++++++++++++++ 4 files changed, 207 insertions(+), 14 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 831f3796c2..511fb35709 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -531,10 +531,6 @@ void iluk_numeric_streams(const std::vector &execspace_v, if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; } - std::cout << "iluk_numeric_streams--Max. 
number of levels among streams " - "(nlevels_max): " - << nlevels_max << std::endl; - // Assume all streams use the same algorithm if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 9edc6559b6..c8e9cd4ed7 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -436,18 +436,10 @@ void iluk_symbolic(IlukHandle& thandle, level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev, nstreams); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); - printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", - thandle.get_level_maxrowsperchunk(), nrows, - (size_t)(nrows) * (size_t)(thandle.get_level_maxrowsperchunk()) * - sizeof(nnz_lno_t)); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrows(), nrows); - printf("spiluk_symbolic: iw (%d x %d) size %lu bytes\n", - thandle.get_level_maxrows(), nrows, - (size_t)(nrows) * (size_t)(thandle.get_level_maxrows()) * - sizeof(nnz_lno_t)); } Kokkos::deep_copy(dlevel_ptr, level_ptr); diff --git a/sparse/src/KokkosSparse_spiluk.hpp b/sparse/src/KokkosSparse_spiluk.hpp index 885f742f89..1bf78abe5e 100644 --- a/sparse/src/KokkosSparse_spiluk.hpp +++ b/sparse/src/KokkosSparse_spiluk.hpp @@ -925,7 +925,7 @@ void spiluk_numeric_streams(const std::vector& execspace_v, Kokkos::MemoryTraits >; using LEntries_Internal = Kokkos::View< - typename LEntriesType::const_value_type*, + typename LEntriesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< LEntriesType>::array_layout, typename LEntriesType::device_type, @@ -944,7 +944,7 @@ void spiluk_numeric_streams(const std::vector& execspace_v, Kokkos::MemoryTraits >; using UEntries_Internal = Kokkos::View< - typename 
UEntriesType::const_value_type*, + typename UEntriesType::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout< UEntriesType>::array_layout, typename UEntriesType::device_type, diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 12065781f1..d1172aca9e 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -262,6 +262,188 @@ void run_test_spiluk() { } } +template +void run_test_spiluk_streams(int test_algo, int nstreams) { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; + using crsMat_t = CrsMatrix; + using AT = Kokkos::Details::ArithTraits; + + const size_type nrows = 9; + const size_type nnz = 21; + + std::vector instances; + if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector A_row_map_v(nstreams); + std::vector A_entries_v(nstreams); + std::vector A_values_v(nstreams); + std::vector L_row_map_v(nstreams); + std::vector L_entries_v(nstreams); + std::vector L_values_v(nstreams); + std::vector U_row_map_v(nstreams); + std::vector U_entries_v(nstreams); + std::vector U_values_v(nstreams); + + RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + 
ValuesType_hostmirror hvalues("hvalues", nnz); + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + scalar_t MONE = scalar_t(-1); + + hrow_map(0) = 0; + hrow_map(1) = 3; + hrow_map(2) = 5; + hrow_map(3) = 6; + hrow_map(4) = 9; + hrow_map(5) = 11; + hrow_map(6) = 13; + hrow_map(7) = 15; + hrow_map(8) = 18; + hrow_map(9) = nnz; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 5; + hentries(3) = 1; + hentries(4) = 6; + hentries(5) = 2; + hentries(6) = 0; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 0; + hentries(10) = 4; + hentries(11) = 1; + hentries(12) = 5; + hentries(13) = 2; + hentries(14) = 6; + hentries(15) = 3; + hentries(16) = 4; + hentries(17) = 7; + hentries(18) = 3; + hentries(19) = 4; + hentries(20) = 8; + + hvalues(0) = 10; + hvalues(1) = 0.3; + hvalues(2) = 0.6; + hvalues(3) = 11; + hvalues(4) = 0.7; + hvalues(5) = 12; + hvalues(6) = 5; + hvalues(7) = 13; + hvalues(8) = 1; + hvalues(9) = 4; + hvalues(10) = 14; + hvalues(11) = 3; + hvalues(12) = 15; + hvalues(13) = 7; + hvalues(14) = 16; + hvalues(15) = 6; + hvalues(16) = 5; + hvalues(17) = 17; + hvalues(18) = 2; + hvalues(19) = 2.5; + hvalues(20) = 18; + + typename KernelHandle::const_nnz_lno_t fill_lev = 2; + + for (int i = 0; i < nstreams; i++) { + // Allocate A as input + A_row_map_v[i] = RowMapType("A_row_map", nrows + 1); + A_entries_v[i] = EntriesType("A_entries", nnz); + A_values_v[i] = ValuesType("A_values", nnz); + + // Copy from host to device + Kokkos::deep_copy(A_row_map_v[i], hrow_map); + Kokkos::deep_copy(A_entries_v[i], hentries); + Kokkos::deep_copy(A_values_v[i], hvalues); + + // Create handle + kh_v[i] = KernelHandle(); + if (test_algo == 0) + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, 4 * nrows, 4 * nrows); + else if (test_algo == 1) + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, 4 * nrows, 4 * nrows); + kh_ptr_v[i] = &kh_v[i]; + + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + std::cout << " Stream " << 
i << ": "; + spiluk_handle->print_algorithm(); + + // Allocate L and U as outputs + L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); + L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); + L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); + U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); + U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); + U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); + + // Symbolic phase + spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], L_row_map_v[i], L_entries_v[i], U_row_map_v[i], U_entries_v[i]); + + Kokkos::fence(); + + Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(L_values_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); + Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); + } + + // Numeric phase + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, A_entries_v, A_values_v, L_row_map_v, L_entries_v, L_values_v, U_row_map_v, U_entries_v, U_values_v); + + for (int i = 0; i < nstreams; i++) + instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], A_entries_v[i]); + crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], L_row_map_v[i], L_entries_v[i]); + crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values_v[i], U_row_map_v[i], U_entries_v[i]); + + // Create a reference view e set to all 1's + ValuesType e_one("e_one", nrows); + Kokkos::deep_copy(e_one, 1.0); + + // Create two views for spmv results + ValuesType bb("bb", nrows); + ValuesType bb_tmp("bb_tmp", nrows); + + // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) + KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + + typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + + KokkosSparse::spmv("N", 
ONE, U, e_one, ZERO, bb_tmp); + KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + + typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + + EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + + kh_v[i].destroy_spiluk_handle(); + } +} + } // namespace Test template (); } +template +void test_spiluk_streams() { + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; + Test::run_test_spiluk_streams(0, 2); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; + Test::run_test_spiluk_streams(0, 3); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; + Test::run_test_spiluk_streams(0, 4); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; + Test::run_test_spiluk_streams(1, 2); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; + Test::run_test_spiluk_streams(1, 3); + + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; + Test::run_test_spiluk_streams(1, 4); +} + #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spiluk##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spiluk(); \ + test_spiluk_streams(); \ } #define NO_TEST_COMPLEX From 08e3824f3efd39eafd8faa0278e10080ef6d1923 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Thu, 9 Mar 2023 01:04:17 -0700 Subject: [PATCH 110/442] Apply clang format to Test_Sparse_spiluk.hpp --- sparse/unit_test/Test_Sparse_spiluk.hpp | 51 +++++++++++++++---------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index d1172aca9e..bb0c9a52e7 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -265,17 +265,18 @@ void run_test_spiluk() { template void run_test_spiluk_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = 
Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using crsMat_t = CrsMatrix; - using AT = Kokkos::Details::ArithTraits; + using AT = Kokkos::Details::ArithTraits; const size_type nrows = 9; const size_type nnz = 21; @@ -284,9 +285,11 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { if (nstreams == 2) instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); else if (nstreams == 3) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); else - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); std::vector kh_v(nstreams); std::vector kh_ptr_v(nstreams); @@ -379,9 +382,11 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { // Create handle kh_v[i] = KernelHandle(); if (test_algo == 0) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, 4 * nrows, 4 * nrows); + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, + 4 * nrows, 4 * nrows); else 
if (test_algo == 1) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, 4 * nrows, 4 * nrows); + kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, + 4 * nrows, 4 * nrows); kh_ptr_v[i] = &kh_v[i]; auto spiluk_handle = kh_v[i].get_spiluk_handle(); @@ -397,7 +402,9 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); // Symbolic phase - spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], L_row_map_v[i], L_entries_v[i], U_row_map_v[i], U_entries_v[i]); + spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], + L_row_map_v[i], L_entries_v[i], U_row_map_v[i], + U_entries_v[i]); Kokkos::fence(); @@ -408,17 +415,21 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { } // Numeric phase - spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, A_entries_v, A_values_v, L_row_map_v, L_entries_v, L_values_v, U_row_map_v, U_entries_v, U_values_v); + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, + A_entries_v, A_values_v, L_row_map_v, L_entries_v, + L_values_v, U_row_map_v, U_entries_v, U_values_v); - for (int i = 0; i < nstreams; i++) - instances[i].fence(); + for (int i = 0; i < nstreams; i++) instances[i].fence(); // Checking for (int i = 0; i < nstreams; i++) { auto spiluk_handle = kh_v[i].get_spiluk_handle(); - crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], A_entries_v[i]); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], L_row_map_v[i], L_entries_v[i]); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values_v[i], U_row_map_v[i], U_entries_v[i]); + crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], + A_entries_v[i]); + crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], + L_row_map_v[i], L_entries_v[i]); + crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), 
U_values_v[i], + U_row_map_v[i], U_entries_v[i]); // Create a reference view e set to all 1's ValuesType e_one("e_one", nrows); From d83c123ea894d67228253e7a890b6e76d3c53392 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 9 Mar 2023 08:47:49 -0800 Subject: [PATCH 111/442] Add nstreams to symbolic call --- sparse/unit_test/Test_Sparse_spiluk.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index bb0c9a52e7..a7424285f4 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -404,7 +404,7 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { // Symbolic phase spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], L_row_map_v[i], L_entries_v[i], U_row_map_v[i], - U_entries_v[i]); + U_entries_v[i], nstreams); Kokkos::fence(); @@ -412,7 +412,7 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { Kokkos::resize(L_values_v[i], spiluk_handle->get_nnzL()); Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); - } + } // Done handle creation and spiluk_symbolic on all streams // Numeric phase spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, From 482cc00f6c4277869bbc06bd13f5726ac6b82d10 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Thu, 9 Mar 2023 09:54:09 -0700 Subject: [PATCH 112/442] clang format --- sparse/unit_test/Test_Sparse_spiluk.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index a7424285f4..f3b7559ea3 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -412,7 +412,7 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { Kokkos::resize(L_values_v[i], spiluk_handle->get_nnzL()); Kokkos::resize(U_entries_v[i], 
spiluk_handle->get_nnzU()); Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); - } // Done handle creation and spiluk_symbolic on all streams + } // Done handle creation and spiluk_symbolic on all streams // Numeric phase spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, From 0361d1d321e65302fba6825ad503599cbd7fed48 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 20 Dec 2022 20:13:35 +0100 Subject: [PATCH 113/442] #5: Added benchmark dot perf test --- perf_test/CMakeLists.txt | 1 + .../KokkosBlas_dot_perf_test_benchmark.cpp | 143 ++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 28752e9c6c..788951968d 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -141,6 +141,7 @@ IF(KokkosKernels_ENABLE_BENCHMARK) SET( BENCHMARK_SOURCES BenchmarkMain.cpp + blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp ) KOKKOSKERNELS_ADD_BENCHMARK( diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp new file mode 100644 index 0000000000..98dd9fc149 --- /dev/null +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -0,0 +1,143 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +// For RPS implementation +#include "KokkosBlas_dot_perf_test.hpp" +#include "KokkosKernels_TestUtils.hpp" +#include + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// The Level 1 BLAS perform scalar, vector and vector-vector operations; +// +// https://github.com/kokkos/kokkos-kernels/wiki/BLAS-1%3A%3Adot +// +// Usage: result = KokkosBlas::dot(x,y); KokkosBlas::dot(r,x,y); +// Multiplies each value of x(i) [x(i,j)] with y(i) or [y(i,j)] and computes the +// sum. (If x and y have scalar type Kokkos::complex, the complex conjugate of +// x(i) or x(i,j) will be used.) 
VectorX: A rank-1 Kokkos::View VectorY: A +// rank-1 Kokkos::View ReturnVector: A rank-0 or rank-1 Kokkos::View +// +// REQUIREMENTS: +// Y.rank == 1 or X.rank == 1 +// Y.extent(0) == X.extent(0) + +// Dot Test design: +// 1) create 1D View containing 1D matrix, aka a vector; this will be your X +// input matrix; 2) create 1D View containing 1D matrix, aka a vector; this will +// be your Y input matrix; 3) perform the dot operation on the two inputs, and +// capture result in "result" + +// Here, m represents the desired length for each 1D matrix; +// "m" is used here, because code from another test was adapted for this test. +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +static void run(benchmark::State& state) { + + const auto m = state.range(0); + const auto repeat = state.range(1); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + // Create 1D view w/ Device as the ExecSpace; this is an input vector + // A(view_alloc(WithoutInitializing, "label"), m, n); + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), m); + + // Create 1D view w/ Device as the ExecSpace; this is the output vector + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), m); + + // Declaring variable pool w/ a seeded random number; + // a parallel random number generator, so you + // won't get the same number with a given seed each time + Kokkos::Random_XorShift64_Pool pool(123); + + Kokkos::fill_random(x, pool, 10.0); + Kokkos::fill_random(y, pool, 10.0); + + for (auto _ : state) { + // do a warm up run of dot: + KokkosBlas::dot(x, y); + + // The live test of dot: + Kokkos::fence(); + Kokkos::Timer timer; + + for (int i = 
0; i < repeat; i++) { + KokkosBlas::dot(x, y); + ExecSpace().fence(); + } + + // Kokkos Timer set up + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation for a 1D matrix dot product per test run; + size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg DOT time:"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg DOT FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + +BENCHMARK(run) + ->ArgNames({"m","repeat"}) + ->Args({100000,1}) + ->UseManualTime(); From 7dfe9efde20272d3c23c83e60dd0143932762c1d Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Wed, 21 Dec 2022 14:38:33 +0100 Subject: [PATCH 114/442] #5: generalized execution space and removed unused include --- .../blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index 98dd9fc149..f08867de97 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -45,9 +45,7 @@ #include #include -// For RPS implementation #include "KokkosBlas_dot_perf_test.hpp" -#include "KokkosKernels_TestUtils.hpp" #include @@ -130,14 +128,15 @@ static void run(benchmark::State& state) { printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); - state.counters["Avg DOT time:"] = + state.counters["Avg DOT time (s):"] = benchmark::Counter(avg, benchmark::Counter::kDefaults); state.counters["Avg DOT FLOP/s:"] = benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); } } -BENCHMARK(run) +BENCHMARK(run) ->ArgNames({"m","repeat"}) ->Args({100000,1}) ->UseManualTime(); + 
From 7be07e5a483576877e6d829e46e5027ff168e200 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Wed, 21 Dec 2022 15:51:17 +0100 Subject: [PATCH 115/442] #5: Fixed clang-format errors --- .../blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index f08867de97..332ca2f2ca 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -48,7 +48,6 @@ #include "KokkosBlas_dot_perf_test.hpp" #include - /////////////////////////////////////////////////////////////////////////////////////////////////// // The Level 1 BLAS perform scalar, vector and vector-vector operations; // @@ -76,8 +75,7 @@ template static void run(benchmark::State& state) { - - const auto m = state.range(0); + const auto m = state.range(0); const auto repeat = state.range(1); // Declare type aliases using Scalar = double; @@ -132,11 +130,10 @@ static void run(benchmark::State& state) { benchmark::Counter(avg, benchmark::Counter::kDefaults); state.counters["Avg DOT FLOP/s:"] = benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); - } + } } BENCHMARK(run) - ->ArgNames({"m","repeat"}) - ->Args({100000,1}) + ->ArgNames({"m", "repeat"}) + ->Args({100000, 1}) ->UseManualTime(); - From e9c968cdd8d188763c88f00bc7b52617250ae20f Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Fri, 30 Dec 2022 14:45:37 +0100 Subject: [PATCH 116/442] #5: Added dot_mv benchmark test --- perf_test/CMakeLists.txt | 1 + .../KokkosBlas_dot_mv_perf_test_benchmark.cpp | 141 ++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 788951968d..0136be1f20 100644 --- a/perf_test/CMakeLists.txt 
+++ b/perf_test/CMakeLists.txt @@ -142,6 +142,7 @@ IF(KokkosKernels_ENABLE_BENCHMARK) BENCHMARK_SOURCES BenchmarkMain.cpp blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp + blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp ) KOKKOSKERNELS_ADD_BENCHMARK( diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp new file mode 100644 index 0000000000..7174c41c7b --- /dev/null +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -0,0 +1,141 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosBlas_dot_perf_test.hpp" +#include + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// The Level 1 BLAS perform scalar, vector and vector-vector operations; +// +// https://github.com/kokkos/kokkos-kernels/wiki/BLAS-1%3A%3Adot +// +// Usage: result = KokkosBlas::dot(x,y); KokkosBlas::dot(r,x,y); +// Multiplies each value of x(i) [x(i,j)] with y(i) or [y(i,j)] and computes the +// sum. (If x and y have scalar type Kokkos::complex, the complex conjugate of +// x(i) or x(i,j) will be used.) VectorX: A rank-1 Kokkos::View VectorY: A +// rank-1 Kokkos::View ReturnVector: A rank-0 or rank-1 Kokkos::View +// +// REQUIREMENTS: +// Y.rank == 1 or X.rank == 1 +// Y.extent(0) == X.extent(0) + +// Dot Test design: +// 1) create 1D View containing 1D matrix, aka a vector; this will be your X +// input matrix; 2) create 1D View containing 1D matrix, aka a vector; this will +// be your Y input matrix; 3) perform the dot operation on the two inputs, and +// capture result in "result" + +// Here, m represents the desired length for each 1D matrix; +// "m" is used here, because code from another test was adapted for this test. 
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +static void run(benchmark::State& state) { + const auto m = state.range(0); + const auto n = state.range(1); + const auto repeat = state.range(2); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), m, n); + + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), m, n); + + Kokkos::View result( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x dot y"), n); + + // Declaring variable pool w/ a seeded random number; + // a parallel random number generator, so you + // won't get the same number with a given seed each time + Kokkos::Random_XorShift64_Pool pool(123); + + Kokkos::fill_random(x, pool, 10.0); + Kokkos::fill_random(y, pool, 10.0); + + for (auto _ : state) { + // do a warm up run of dot: + KokkosBlas::dot(result, x, y); + + // The live test of dot: + + Kokkos::fence(); + Kokkos::Timer timer; + + for (int i = 0; i < repeat; i++) { + KokkosBlas::dot(result, x, y); + ExecSpace().fence(); + } + + // Kokkos Timer set up + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation for a 1D matrix dot product per test run; + size_t flopsPerRun = (size_t)2 * m * n; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg DOT time (s):"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg DOT FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + +BENCHMARK(run) + ->ArgNames({"m", "n", "repeat"}) + 
->Args({100000, 5, 20}) + ->UseManualTime(); From 4fc79084822f1671873f63e1e0b98e48d42b79f0 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Fri, 30 Dec 2022 14:48:45 +0100 Subject: [PATCH 117/442] #5: Fixed clang-format --- .../blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index 7174c41c7b..5edea8fac6 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -107,7 +107,7 @@ static void run(benchmark::State& state) { for (auto _ : state) { // do a warm up run of dot: - KokkosBlas::dot(result, x, y); + KokkosBlas::dot(result, x, y); // The live test of dot: @@ -126,7 +126,7 @@ static void run(benchmark::State& state) { size_t flopsPerRun = (size_t)2 * m * n; printf("Avg DOT time: %f s.\n", avg); printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); - state.SetIterationTime(timer.seconds()); + state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = benchmark::Counter(avg, benchmark::Counter::kDefaults); From 56ef2095f8409c37c9782c1b493584fc5169d100 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Fri, 30 Dec 2022 17:56:34 +0100 Subject: [PATCH 118/442] #5: Added team dot benchmark test --- perf_test/CMakeLists.txt | 1 + ...okkosBlas_team_dot_perf_test_benchmark.cpp | 145 ++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 0136be1f20..4a4cb928bf 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -143,6 +143,7 @@ IF(KokkosKernels_ENABLE_BENCHMARK) BenchmarkMain.cpp blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp + 
blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp ) KOKKOSKERNELS_ADD_BENCHMARK( diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp new file mode 100644 index 0000000000..872e246acd --- /dev/null +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -0,0 +1,145 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include "KokkosKernels_TestUtils.hpp" + +#include + +// Functor to handle the case of a "without Cuda" build +template +struct teamDotFunctor { + // Compile - time check to see if your data type is a Kokkos::View: + static_assert(Kokkos::is_view::value, + "Vector is not a Kokkos::View."); + + using Scalar = typename Vector::non_const_value_type; + // Vector is templated on memory space + using execution_space = ExecSpace; // Kokkos Execution Space + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + + // Declare Kokkos::View Vectors, x and y + Vector x; + Vector y; + + // Functor instead of KOKKOS_LAMBDA expression + + KOKKOS_INLINE_FUNCTION void operator()(const team_member& team) const { + KokkosBlas::Experimental::dot(team, x, y); + } + // Constructor + teamDotFunctor(Vector X_, Vector Y_) { + x = X_; + y = Y_; + } +}; + +template +static void run(benchmark::State& state) { + const auto m = state.range(0); + const auto repeat = state.range(1); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + + // For the Team implementation of dot; ExecSpace is implicit; + using policy = Kokkos::TeamPolicy; + + // Create 1D view w/ Device as the 
ExecSpace; this is an input vector + Kokkos::View x("X", m); + // Create 1D view w/ Device as the ExecSpace; this is the output vector + Kokkos::View y("Y", m); + + // Here, deep_copy is filling / copying values into Host memory from Views X + // and Y + Kokkos::deep_copy(x, 3.0); + Kokkos::deep_copy(y, 2.0); + + std::cout << "Running BLAS Level 1 Kokkos Teams-based implementation DOT " + "performance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + for (auto _ : state) { + // Warm up run of dot: + teamDotFunctor, ExecSpace> + teamDotFunctorWarmUpInstance(x, y); + + Kokkos::parallel_for("TeamDotUsage -- Warm Up Run", policy(1, Kokkos::AUTO), + teamDotFunctorWarmUpInstance); + + // The live test of dot: + + Kokkos::fence(); + Kokkos::Timer timer; + + teamDotFunctor, ExecSpace> + teamDotFunctorLiveTestInstance(x, y); + Kokkos::parallel_for("TeamDotUsage -- Live Test", policy(1, Kokkos::AUTO), + teamDotFunctorLiveTestInstance); + + // Kokkos Timer set up and data capture + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation for a 1D matrix dot product per test run; + size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg DOT time (s):"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg DOT FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + +BENCHMARK(run) + ->ArgNames({"m", "repeat"}) + ->Args({100000, 1}) + ->UseManualTime(); From 8be3032612dac044b4fa797aa91d4e13a64a5f0e Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 5 Jan 2023 15:12:50 +0100 Subject: [PATCH 119/442] #5: Added better name for benchmark tests --- perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp | 1 + 
perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp | 1 + perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index 5edea8fac6..1e537ceadc 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -136,6 +136,7 @@ static void run(benchmark::State& state) { } BENCHMARK(run) + ->Name("KokkosBlas_dot_mv") ->ArgNames({"m", "n", "repeat"}) ->Args({100000, 5, 20}) ->UseManualTime(); diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index 332ca2f2ca..14957994d1 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -134,6 +134,7 @@ static void run(benchmark::State& state) { } BENCHMARK(run) + ->Name("KokkosBlas_dot") ->ArgNames({"m", "repeat"}) ->Args({100000, 1}) ->UseManualTime(); diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp index 872e246acd..165f7fe6db 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -140,6 +140,7 @@ static void run(benchmark::State& state) { } BENCHMARK(run) + ->Name("KokkosBlas_team_dot/run") ->ArgNames({"m", "repeat"}) ->Args({100000, 1}) ->UseManualTime(); From 3ec0cb7fc2be79e47ee075351799b75fc006776f Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 2 Mar 2023 17:26:40 +0100 Subject: [PATCH 120/442] #5: Rebased on develop and added kernels print_configuration call --- perf_test/Benchmark_Context.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/perf_test/Benchmark_Context.hpp 
b/perf_test/Benchmark_Context.hpp index 0ef4910cc5..74f081b7d9 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -19,6 +19,8 @@ #ifndef KOKKOSKERNELS_PERFTEST_BENCHMARK_CONTEXT_HPP #define KOKKOSKERNELS_PERFTEST_BENCHMARK_CONTEXT_HPP +#include "KokkosKernels_PrintConfiguration.hpp" + #include #include @@ -46,6 +48,7 @@ std::string remove_unwanted_characters(std::string str) { void add_kokkos_configuration(bool verbose) { std::ostringstream msg; Kokkos::print_configuration(msg, verbose); + KokkosKernels::print_configuration(msg); // Iterate over lines returned from kokkos and extract key:value pairs std::stringstream ss{msg.str()}; From a9877dc6fd052f705b6160eb38280098ec9dd9e7 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 9 Mar 2023 10:12:33 -0700 Subject: [PATCH 121/442] Install doxygen-latex for HTML docs --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 8a0feef0a2..5152d9f83a 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -15,7 +15,7 @@ jobs: steps: - name: Install Dependencies run: | - sudo apt install doxygen + sudo apt install --no-install-recommends doxygen-latex pip install sphinx pip install breathe pip install sphinx-rtd-theme From 5c270228376be1cec698e1135b7af78b24f48091 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 9 Mar 2023 10:15:19 -0700 Subject: [PATCH 122/442] Make Sphinix optional --- docs/CMakeLists.txt | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 40680b0705..34f54edbbb 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -1,10 +1,7 @@ # Source: https://devblogs.microsoft.com/cppblog/clear-functional-c-documentation-with-sphinx-breathe-doxygen-cmake/ # Author: Evan Harvey find_package(Doxygen REQUIRED) -find_package(Sphinx REQUIRED) 
-set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}) -set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/docs/sphinx) set(KOKKOS_INCLUDE_DIR ${Kokkos_DIR}/../../../include) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/conf.py DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) @@ -36,12 +33,21 @@ add_custom_command(OUTPUT ${DOXYGEN_INDEX_FILE} add_custom_target(Doxygen ALL DEPENDS ${DOXYGEN_INDEX_FILE}) - -add_custom_target(Sphinx ALL - COMMAND ${SPHINX_EXECUTABLE} -b html - # Tell Breathe where to find the Doxygen output - -Dbreathe_projects.${PROJECT_NAME}=${DOXYGEN_OUTPUT_DIR}/xml - ${SPHINX_SOURCE} ${SPHINX_BUILD} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS Doxygen - COMMENT "Generating documentation with Sphinx") +## If we can find sphinx, add that target too +find_package(Sphinx) + +if (Sphinx_FOUND) + set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}) + set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/docs/sphinx) + + add_custom_target(Sphinx ALL + COMMAND ${SPHINX_EXECUTABLE} -b html + # Tell Breathe where to find the Doxygen output + -Dbreathe_projects.${PROJECT_NAME}=${DOXYGEN_OUTPUT_DIR}/xml + ${SPHINX_SOURCE} ${SPHINX_BUILD} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS Doxygen + COMMENT "Generating documentation with Sphinx") +else() # Sphinx_FOUND + message(STATUS "Sphinx not found. 
Only Doxygen docs can be built") +endif() # Sphinx_FOUND From ff31df01ec77f1bc18c7bf384d7e1642861de48f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 13 Mar 2023 09:47:56 -0600 Subject: [PATCH 123/442] .github: Automation reminder --- .github/workflows/format.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index de5d35e09f..a7b90c54b4 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -41,5 +41,8 @@ jobs: git --no-pager diff --patch --cached echo "==== End Format Patch ====" + echo "To automate formatting, see:" + echo " https://kokkos-kernels.readthedocs.io/en/latest/developer/style.html#id1" + exit 1 fi From 9b0dfbd0ff1b887dfdae2a252241f6adf74e6ed9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 13 Mar 2023 11:34:12 -0600 Subject: [PATCH 124/442] CUDA 11.4: fixing some failing build while trying to reproduce issue #1725 This basically removes a few function parameters that are unused in the cuda branch for version higher than 11.4. 
--- sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 4 ++++ sparse/src/KokkosSparse_sptrsv_handle.hpp | 2 +- sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7090c4f948..17611c3f2c 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -299,6 +299,10 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, typedef typename KernelHandle::scalar_t scalar_type; typedef typename KernelHandle::memory_space memory_space; + (void)row_map; + (void)entries; + (void)values; + const bool is_cuda_space = std::is_same::value || std::is_same::value || diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index 049ab14b1e..7c9027d24a 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -138,7 +138,7 @@ class SPTRSVHandle { cusparseSpSVDescr_t spsvDescr; void *pBuffer{nullptr}; - cuSparseHandleType(bool transpose_, bool is_lower) { + cuSparseHandleType(bool transpose_, bool /*is_lower*/) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&handle)); KOKKOS_CUSPARSE_SAFE_CALL( diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 50555d9815..3096278c21 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -45,7 +45,7 @@ template void spgemm_numeric_cusparse( - KernelHandle *handle, lno_t m, lno_t n, lno_t k, + KernelHandle *handle, lno_t /*m*/, lno_t /*n*/, lno_t /*k*/, const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA, const ConstValuesType &valuesA, const ConstRowMapType &row_mapB, const ConstEntriesType &entriesB, const ConstValuesType &valuesB, From 
b8ebb9564adbf9c339d3338150a16a72a7ee2d18 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 23 Jan 2023 11:40:43 -0700 Subject: [PATCH 125/442] scripts/cm_test_all_sandia: - Add boiler plate for gnu/10.2.1 and intel/19.0.5.281. --- scripts/cm_test_all_sandia | 61 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index fada06b816..33bf7312d0 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -155,6 +155,10 @@ if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name module load git fi +if [[ "$HOSTNAME" == *solo* ]]; then # Warning: very generic name + MACHINE=solo +fi + if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then MACHINE=kokkos-dev-2 fi @@ -758,6 +762,63 @@ elif [ "$MACHINE" = "blake" ]; then ARCH_FLAG="--arch=SKX" fi SPACK_HOST_ARCH="+skx" +elif [ "$MACHINE" = "solo" ]; then + ###MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" + ###eval "$MODULE_ENVIRONMENT" + SKIP_HWLOC=True + export SLURM_TASKS_PER_NODE=32 + + module load cmake/3.22.3 + + BASE_MODULE_LIST="cmake/3.22.3,/" + BASE_MODULE_LIST_INTEL="cmake/3.22.3,/" + BASE_MODULE_LIST_ONEAPI="cmake/3.22.3,/oneAPI/base-toolkit/" + ONEAPI_WARNING_FLAGS="" + + ###GNU72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gnu/7.2.0" + GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/???/gnu/10.2.1" + + if [ "$SPOT_CHECK" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + # TODO: Failing toolchains: + #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + #"pgi/18.7.0 $BASE_MODULE_LIST $GNU_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" + ###"intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" + ###"gnu/7.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + ###"clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" + ###"intel/19.5.281 $BASE_MODULE_LIST_INTEL 
"OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" + ###"gnu/11.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + COMPILERS=( + "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + ) + elif [ "$SPOT_CHECK_TPLS" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + # TODO: Failing toolchains: + #"pgi/18.7.0 $BASE_MODULE_LIST $GNU_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" + ###"gnu/7.2.0 $GNU72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" + COMPILERS=("intel/19.0.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" + "gnu/10.2.1 $GNU102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" + ) + else + ###"intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ###"intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ###"intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + ###"intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" + ###"gnu/7.2.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + ###"gnu/8.1.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + ###"gnu/8.2.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + ###"gnu/9.2.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + ###"clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + COMPILERS=( + "gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + ) + + fi + + if [ -z "$ARCH_FLAG" ]; then + ARCH_FLAG="--arch=BDW" + fi + SPACK_HOST_ARCH="+bdw" elif [ "$MACHINE" = "kokkos-dev-2" ]; then MODULE_ENVIRONMENT="source /projects/sems/modulefiles/utils/sems-archive-modules-init.sh ; module use /home/projects/x86-64/modulefiles/local" eval "$MODULE_ENVIRONMENT" From 678783275ee03046346ec27ceeb0dc5d728a1362 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 Jan 2023 10:17:22 -0700 Subject: [PATCH 
126/442] Get a C++17 stdlibc++ in the path --- scripts/cm_test_all_sandia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 33bf7312d0..74fe1640d3 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -771,7 +771,7 @@ elif [ "$MACHINE" = "solo" ]; then module load cmake/3.22.3 BASE_MODULE_LIST="cmake/3.22.3,/" - BASE_MODULE_LIST_INTEL="cmake/3.22.3,/" + BASE_MODULE_LIST_INTEL="cmake/3.22.3,/,gnu/10.2.1" BASE_MODULE_LIST_ONEAPI="cmake/3.22.3,/oneAPI/base-toolkit/" ONEAPI_WARNING_FLAGS="" From 3ac5a6fe1d4617d88c31bd4886f2f2f7ca91e990 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 Jan 2023 10:27:26 -0700 Subject: [PATCH 127/442] Use stdlibc++ from gnu 8.2.1 --- scripts/cm_test_all_sandia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 74fe1640d3..0f72e13986 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -771,7 +771,7 @@ elif [ "$MACHINE" = "solo" ]; then module load cmake/3.22.3 BASE_MODULE_LIST="cmake/3.22.3,/" - BASE_MODULE_LIST_INTEL="cmake/3.22.3,/,gnu/10.2.1" + BASE_MODULE_LIST_INTEL="cmake/3.22.3,gnu/8.2.1,/" BASE_MODULE_LIST_ONEAPI="cmake/3.22.3,/oneAPI/base-toolkit/" ONEAPI_WARNING_FLAGS="" From f2184cf60a7621e84c8d89d9b1127a6b92779286 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Jan 2023 14:41:05 -0700 Subject: [PATCH 128/442] Use openblas tpl --- scripts/cm_test_all_sandia | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 0f72e13986..90230e06ee 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -157,6 +157,7 @@ fi if [[ "$HOSTNAME" == *solo* ]]; then # Warning: very generic name MACHINE=solo + module use /projects/netpub/openblas/modulefiles fi if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then @@ -776,7 +777,7 @@ elif [ "$MACHINE" = 
"solo" ]; then ONEAPI_WARNING_FLAGS="" ###GNU72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gnu/7.2.0" - GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/???/gnu/10.2.1" + GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) From 4b4e7b82f898b878b5159ec7c2f72167c3b4a7ed Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Jan 2023 14:41:56 -0700 Subject: [PATCH 129/442] Cleanup. Need clang toolchain --- scripts/cm_test_all_sandia | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 90230e06ee..57b0b4f78f 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -780,35 +780,14 @@ elif [ "$MACHINE" = "solo" ]; then GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - # TODO: Failing toolchains: - #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - #"pgi/18.7.0 $BASE_MODULE_LIST $GNU_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ###"intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - ###"gnu/7.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" - ###"clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" - ###"intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" - ###"gnu/11.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" COMPILERS=( "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - # TODO: Failing toolchains: - #"pgi/18.7.0 $BASE_MODULE_LIST $GNU_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ###"gnu/7.2.0 $GNU72_MODULE_TPL_LIST "OpenMP_Serial" g++ 
$GNU_WARNING_FLAGS" COMPILERS=("intel/19.0.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gnu/10.2.1 $GNU102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" ) else - ###"intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ###"intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ###"intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ###"intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - ###"gnu/7.2.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" - ###"gnu/8.1.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" - ###"gnu/8.2.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" - ###"gnu/9.2.0 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" ###"clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" COMPILERS=( "gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" From fa5bdf509294cc4d863fa3236b1606d20fbfa989 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Jan 2023 14:48:22 -0700 Subject: [PATCH 130/442] More cleanup --- scripts/cm_test_all_sandia | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 57b0b4f78f..8282aac66e 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -764,8 +764,6 @@ elif [ "$MACHINE" = "blake" ]; then fi SPACK_HOST_ARCH="+skx" elif [ "$MACHINE" = "solo" ]; then - ###MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" - ###eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 From 5c8067c93262e0e982e060aa699ea8de87e833a6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 Jan 2023 14:52:41 -0700 Subject: [PATCH 131/442] More cleanup. 
--- scripts/cm_test_all_sandia | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 8282aac66e..e8c72c8696 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -771,10 +771,8 @@ elif [ "$MACHINE" = "solo" ]; then BASE_MODULE_LIST="cmake/3.22.3,/" BASE_MODULE_LIST_INTEL="cmake/3.22.3,gnu/8.2.1,/" - BASE_MODULE_LIST_ONEAPI="cmake/3.22.3,/oneAPI/base-toolkit/" ONEAPI_WARNING_FLAGS="" - ###GNU72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gnu/7.2.0" GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" if [ "$SPOT_CHECK" = "True" ]; then From 950f633b7e79547f3cecdf5850a3816cafdf6b57 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 13 Feb 2023 13:28:12 -0700 Subject: [PATCH 132/442] pull in mkl --- scripts/cm_test_all_sandia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index e8c72c8696..74d4498967 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -780,7 +780,7 @@ elif [ "$MACHINE" = "solo" ]; then "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then - COMPILERS=("intel/19.0.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/19.0.5.281 $BASE_MODULE_LIST_INTEL,mkl/19.0.5.281 "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gnu/10.2.1 $GNU102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" ) else From cb11f0cfff326cc4e79fb35274768dea927c90f0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 13 Feb 2023 13:36:38 -0700 Subject: [PATCH 133/442] Use clang modules --- scripts/cm_test_all_sandia | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 74d4498967..866fc45a83 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -158,6 +158,7 @@ fi if [[ "$HOSTNAME" == *solo* ]]; then # 
Warning: very generic name MACHINE=solo module use /projects/netpub/openblas/modulefiles + module use /projects/netpub/clang/modulefiles fi if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then From 560f37286a699ba09adbab0ede48110ee64219ac Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 15 Mar 2023 08:27:18 -0700 Subject: [PATCH 134/442] Fix unused-parameter nstreams error --- sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index c8e9cd4ed7..9521420bfb 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -175,7 +175,8 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, } else #endif { - lnchunks(i) = 1; + // Workaround to fix unused-parameter nstreams error + lnchunks(i) = static_cast(nstreams / nstreams); lnrowsperchunk(i) = lnrows; } if (maxrowsperchunk < static_cast(lnrowsperchunk(i))) { From 221f7abc004caef8c0478f33bba00e08845e6aa4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 15 Mar 2023 15:58:25 -0600 Subject: [PATCH 135/442] Work around instance resource limits --- blas/unit_test/Test_Blas3_gemm.hpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index a210806929..1e434bc4a7 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -258,8 +258,8 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, } template -void impl_test_stream_gemm(const int M, const int N, const int K, - const Scalar alpha, const Scalar beta) { +void impl_test_stream_gemm_psge2(const int M, const int N, const int K, + const Scalar alpha, const Scalar beta) { using execution_space = TestExecSpace; using ViewTypeA = Kokkos::View; using ViewTypeB = Kokkos::View; @@ 
-372,12 +372,17 @@ void test_gemm() { } } } - Test::impl_test_stream_gemm(53, 42, 17, 4.5, - 3.0); // General code path - Test::impl_test_stream_gemm( - 13, 1, 17, 4.5, 3.0); // gemv based gemm code path - Test::impl_test_stream_gemm(7, 13, 17, 4.5, - 3.0); // dot based gemm code path + auto pool_size = execution_space().impl_thread_pool_size(); + if (pool_size >= 2) { + Test::impl_test_stream_gemm_psge2( + 53, 42, 17, 4.5, + 3.0); // General code path + Test::impl_test_stream_gemm_psge2( + 13, 1, 17, 4.5, 3.0); // gemv based gemm code path + Test::impl_test_stream_gemm_psge2( + 7, 13, 17, 4.5, + 3.0); // dot based gemm code path + } } template From be9310d975035b7d9af75446f83285d33e96428e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 15 Mar 2023 17:07:15 -0600 Subject: [PATCH 136/442] Reduce BatchedGemm test coverage --- .../unit_test/Test_Batched_BatchedGemm.hpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 746be07eb7..c4e09d6e68 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -257,11 +257,15 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ASSERT_EQ(batchedGemmHandle.get_kernel_algo_type(), algo_type); - if (algo_type == BaseHeuristicAlgos::SQUARE || - algo_type == BaseTplAlgos::ARMPL || + if (algo_type == BaseTplAlgos::ARMPL || algo_type == BaseKokkosBatchedAlgos::KK_SERIAL || algo_type == GemmKokkosBatchedAlgos::KK_SERIAL_RANK0 || algo_type == GemmKokkosBatchedAlgos::KK_DBLBUF) { + impl_test_batched_gemm_with_handle( + &batchedGemmHandle, N, matAdim1, matAdim2, matBdim1, matBdim2, + matCdim1, matCdim2, 1.5, 3.0); + } else if (algo_type == BaseHeuristicAlgos::SQUARE) { // Invoke 4 times to ensure we cover all paths for alpha and beta impl_test_batched_gemm_with_handle( @@ -316,13 +320,12 @@ 
template void test_batched_gemm_with_layout(int N) { // Square cases - for (int i = 0; i < 5; ++i) { + { + int i = 0; Test::impl_test_batched_gemm(N, i, i, i, i, i, i); - } - { - int i = 10; + i = 10; Test::impl_test_batched_gemm(N, i, i, i, i, i, i); @@ -336,7 +339,7 @@ void test_batched_gemm_with_layout(int N) { } // Non-square cases - for (int i = 0; i < 5; ++i) { + for (int i = 1; i < 5; ++i) { int dimM = 1 * i; int dimN = 2 * i; int dimK = 3 * i; From bc9265b0d3127832d06b43cbe19b490dc72c413c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 16 Mar 2023 08:46:43 -0600 Subject: [PATCH 137/442] Fix typo --- blas/unit_test/Test_Blas3_gemm.hpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index 1e434bc4a7..9db7f987b9 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -257,16 +257,15 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, } } -template +template void impl_test_stream_gemm_psge2(const int M, const int N, const int K, const Scalar alpha, const Scalar beta) { - using execution_space = TestExecSpace; - using ViewTypeA = Kokkos::View; - using ViewTypeB = Kokkos::View; - using ViewTypeC = Kokkos::View; - using ScalarC = typename ViewTypeC::value_type; - using APT = Kokkos::Details::ArithTraits; - using mag_type = typename APT::mag_type; + using ViewTypeA = Kokkos::View; + using ViewTypeB = Kokkos::View; + using ViewTypeC = Kokkos::View; + using ScalarC = typename ViewTypeC::value_type; + using APT = Kokkos::Details::ArithTraits; + using mag_type = typename APT::mag_type; const char tA[] = {"N"}; const char tB[] = {"N"}; @@ -372,14 +371,14 @@ void test_gemm() { } } } - auto pool_size = execution_space().impl_thread_pool_size(); + auto pool_size = TestExecSpace().impl_thread_pool_size(); if (pool_size >= 2) { - Test::impl_test_stream_gemm_psge2( + 
Test::impl_test_stream_gemm_psge2( 53, 42, 17, 4.5, 3.0); // General code path - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 13, 1, 17, 4.5, 3.0); // gemv based gemm code path - Test::impl_test_stream_gemm_psge2( + Test::impl_test_stream_gemm_psge2( 7, 13, 17, 4.5, 3.0); // dot based gemm code path } From 3b466361cc368b95eb1e1c0ec1fc9c93a47146b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 9 Mar 2023 16:01:52 +0100 Subject: [PATCH 138/442] Generate git information during build --- CMakeLists.txt | 4 + cmake/KokkosKernels_Version_Info.cpp.in | 31 ++++++ cmake/KokkosKernels_Version_Info.hpp | 34 +++++++ cmake/kokkoskernels_git_info.cmake | 122 ++++++++++++++++++++++++ perf_test/CMakeLists.txt | 4 +- 5 files changed, 193 insertions(+), 2 deletions(-) create mode 100644 cmake/KokkosKernels_Version_Info.cpp.in create mode 100644 cmake/KokkosKernels_Version_Info.hpp create mode 100644 cmake/kokkoskernels_git_info.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4bf2adec47..7404ecd4f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,10 @@ ELSE() ENDIF() INCLUDE(cmake/fake_tribits.cmake) INCLUDE(cmake/kokkoskernels_tribits.cmake) +IF(NOT KOKKOSKERNELS_HAS_TRILINOS) + INCLUDE(cmake/kokkoskernels_git_info.cmake) + check_git_setup() +ENDIF() OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) diff --git a/cmake/KokkosKernels_Version_Info.cpp.in b/cmake/KokkosKernels_Version_Info.cpp.in new file mode 100644 index 0000000000..a8aa6f19ae --- /dev/null +++ b/cmake/KokkosKernels_Version_Info.cpp.in @@ -0,0 +1,31 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosKernels_Version_Info.hpp" + +namespace KokkosKernels { +namespace Impl { + +std::string GIT_BRANCH = R"branch(@GIT_BRANCH@)branch"; +std::string GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; +std::string GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; +std::string GIT_COMMIT_DESCRIPTION = + R"message(@GIT_COMMIT_DESCRIPTION@)message"; +std::string GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; + +} // namespace Impl + +} // namespace KokkosKernels diff --git a/cmake/KokkosKernels_Version_Info.hpp b/cmake/KokkosKernels_Version_Info.hpp new file mode 100644 index 0000000000..122c54b85d --- /dev/null +++ b/cmake/KokkosKernels_Version_Info.hpp @@ -0,0 +1,34 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_GIT_VERSION_INFO_H +#define KOKKOSKERNELS_GIT_VERSION_INFO_H + +#include + +namespace KokkosKernels { +namespace Impl { + +extern std::string GIT_BRANCH; +extern std::string GIT_COMMIT_HASH; +extern std::string GIT_CLEAN_STATUS; +extern std::string GIT_COMMIT_DESCRIPTION; +extern std::string GIT_COMMIT_DATE; + +} // namespace Impl +} // namespace KokkosKernels + +#endif diff --git a/cmake/kokkoskernels_git_info.cmake b/cmake/kokkoskernels_git_info.cmake new file mode 100644 index 0000000000..120ffe2c91 --- /dev/null +++ b/cmake/kokkoskernels_git_info.cmake @@ -0,0 +1,122 @@ +# https://jonathanhamberg.com/post/cmake-embedding-git-hash/ + +find_package(Git QUIET) + +SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) +SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) + +SET(pre_configure_file ${pre_configure_dir}/KokkosKernels_Version_Info.cpp.in) +SET(post_configure_file ${post_configure_dir}/KokkosKernels_Version_Info.cpp) + +FUNCTION(check_git_write git_hash git_clean_status) + FILE( + WRITE + ${CMAKE_BINARY_DIR}/git-state.txt + "${git_hash}-${git_clean_status}") +ENDFUNCTION() + +FUNCTION(check_git_read git_hash) + IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + LIST(GET CONTENT 0 var) + + message(DEBUG "Cached Git hash: ${var}") + SET(${git_hash} ${var} PARENT_SCOPE) + else() + SET(${git_hash} "INVALID" PARENT_SCOPE) + ENDIF() +ENDFUNCTION() + +FUNCTION(check_git_version) + IF(NOT EXISTS ${post_configure_dir}/KokkosKernels_Version_Info.hpp) + FILE( + COPY ${pre_configure_dir}/KokkosKernels_Version_Info.hpp + DESTINATION ${post_configure_dir}) + ENDIF() + + IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOSKERNELS_TOP_SOURCE_DIR}/.git) + configure_file(${pre_configure_file} ${post_configure_file} @ONLY) + return() + ENDIF() + + # Get the current working branch + 
execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Get the latest commit description + execute_process( + COMMAND ${GIT_EXECUTABLE} show -s --format=%s + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Get the latest commit date + execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_COMMIT_DATE + OUTPUT_STRIP_TRAILING_WHITESPACE) + + # Check if repo is dirty / clean + execute_process( + COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + RESULT_VARIABLE IS_DIRTY + OUTPUT_STRIP_TRAILING_WHITESPACE) + + IF(IS_DIRTY EQUAL 0) + SET(GIT_CLEAN_STATUS "CLEAN") + else() + SET(GIT_CLEAN_STATUS "DIRTY") + ENDIF() + + # Get the latest abbreviated commit hash of the working branch + execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --format=%h + WORKING_DIRECTORY ${KOKKOSKERNELS_TOP_SOURCE_DIR} + OUTPUT_VARIABLE GIT_COMMIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE) + + check_git_read(GIT_HASH_CACHE) + + IF(NOT EXISTS ${post_configure_dir}) + file(MAKE_DIRECTORY ${post_configure_dir}) + ENDIF() + + # Only update the git_version.cpp if the hash has changed. This will + # prevent us from rebuilding the project more than we need to. + IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} + OR NOT EXISTS ${post_configure_file}) + # Set the GIT_HASH_CACHE variable so the next build won't have + # to regenerate the source file. 
+ check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) + + configure_file(${pre_configure_file} ${post_configure_file} @ONLY) + message(STATUS "Configured git information in ${post_configure_file}") + ENDIF() +ENDFUNCTION() + +FUNCTION(check_git_setup) + add_custom_target( + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} + -DRUN_CHECK_GIT_VERSION=1 + -DKOKKOSKERNELS_TOP_SOURCE_DIR=${KOKKOSKERNELS_TOP_SOURCE_DIR} + -P ${CURRENT_LIST_DIR}/kokkoskernels_git_info.cmake + BYPRODUCTS ${post_configure_file}) + + add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/KokkosKernels_Version_Info.cpp) + target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) + target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals) + add_dependencies(impl_git_version AlwaysCheckGit) + + check_git_version() +ENDFUNCTION() + +# This is used to run this function from an external cmake process. +IF(RUN_CHECK_GIT_VERSION) + check_git_version() +ENDIF() diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 4a4cb928bf..4520ac5946 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -27,7 +27,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) TARGET_COMPILE_FEATURES(kokkoskernelsperf_gtest PUBLIC cxx_std_11) KOKKOSKERNELS_INCLUDE_DIRECTORIES(sparse) - + if(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) #Add RPS implementations of KK perf tests here KOKKOSKERNELS_ADD_EXECUTABLE( @@ -116,7 +116,7 @@ IF(KokkosKernels_ENABLE_BENCHMARK) ) TARGET_LINK_LIBRARIES( ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkoskernels + PRIVATE benchmark::benchmark Kokkos::kokkoskernels impl_git_version ) FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) SET_SOURCE_FILES_PROPERTIES( From c32f3ad06e17343f31fc530f909b9035291d8f6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 9 Mar 2023 16:09:28 +0100 Subject: [PATCH 139/442] Include git information in benchmark context --- perf_test/BenchmarkMain.cpp | 8 ++++++++ 
perf_test/Benchmark_Context.hpp | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/perf_test/BenchmarkMain.cpp b/perf_test/BenchmarkMain.cpp index ce74d8dba5..2128c38f14 100644 --- a/perf_test/BenchmarkMain.cpp +++ b/perf_test/BenchmarkMain.cpp @@ -21,6 +21,14 @@ #include #include +#include + +static void BM_StringCreation(benchmark::State& state) { + for (auto _ : state) std::string empty_string; +} + +BENCHMARK(BM_StringCreation)->ArgNames({"N", "R"})->Args({100'000, 1'000}); + int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 74f081b7d9..f661c85427 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -26,6 +26,8 @@ #include #include +#include +#include namespace KokkosKernelsBenchmark { @@ -65,10 +67,26 @@ void add_kokkos_configuration(bool verbose) { } } +inline void add_git_info() { + if (!KokkosKernels::Impl::GIT_BRANCH.empty()) { + benchmark::AddCustomContext("GIT_BRANCH", KokkosKernels::Impl::GIT_BRANCH); + benchmark::AddCustomContext("GIT_COMMIT_HASH", + KokkosKernels::Impl::GIT_COMMIT_HASH); + benchmark::AddCustomContext("GIT_CLEAN_STATUS", + KokkosKernels::Impl::GIT_CLEAN_STATUS); + benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", + KokkosKernels::Impl::GIT_COMMIT_DESCRIPTION); + benchmark::AddCustomContext("GIT_COMMIT_DATE", + KokkosKernels::Impl::GIT_COMMIT_DATE); + } +} + /// \brief Gather all context information and add it to benchmark context data void add_benchmark_context(bool verbose = false) { // Add Kokkos configuration to benchmark context data add_kokkos_configuration(verbose); + + add_git_info(); } } // namespace KokkosKernelsBenchmark From 2f9352acc701b14d5a0358221e6c8c79b9b49e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 14 Mar 2023 15:59:24 +0100 Subject: [PATCH 140/442] Switch to header-only implementation --- 
CMakeLists.txt | 9 +++--- cmake/KokkosKernels_Version_Info.cpp.in | 31 ------------------- ....hpp => KokkosKernels_Version_Info.hpp.in} | 11 ++++--- cmake/kokkoskernels_git_info.cmake | 25 +++------------ perf_test/CMakeLists.txt | 2 +- 5 files changed, 16 insertions(+), 62 deletions(-) delete mode 100644 cmake/KokkosKernels_Version_Info.cpp.in rename cmake/{KokkosKernels_Version_Info.hpp => KokkosKernels_Version_Info.hpp.in} (69%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7404ecd4f7..a89354a765 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,10 +49,6 @@ ELSE() ENDIF() INCLUDE(cmake/fake_tribits.cmake) INCLUDE(cmake/kokkoskernels_tribits.cmake) -IF(NOT KOKKOSKERNELS_HAS_TRILINOS) - INCLUDE(cmake/kokkoskernels_git_info.cmake) - check_git_setup() -ENDIF() OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) @@ -327,6 +323,11 @@ ELSE() $) ENDIF() + IF(NOT KOKKOSKERNELS_HAS_TRILINOS) + INCLUDE(cmake/kokkoskernels_git_info.cmake) + check_git_setup() + ENDIF() + # FIXME_SYCL waiting for compiler support IF (KOKKOS_ENABLE_SYCL) SET(KOKKOSKERNELS_INTEL_ARCHS ${Kokkos_ARCH}) diff --git a/cmake/KokkosKernels_Version_Info.cpp.in b/cmake/KokkosKernels_Version_Info.cpp.in deleted file mode 100644 index a8aa6f19ae..0000000000 --- a/cmake/KokkosKernels_Version_Info.cpp.in +++ /dev/null @@ -1,31 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosKernels_Version_Info.hpp" - -namespace KokkosKernels { -namespace Impl { - -std::string GIT_BRANCH = R"branch(@GIT_BRANCH@)branch"; -std::string GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; -std::string GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; -std::string GIT_COMMIT_DESCRIPTION = - R"message(@GIT_COMMIT_DESCRIPTION@)message"; -std::string GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; - -} // namespace Impl - -} // namespace KokkosKernels diff --git a/cmake/KokkosKernels_Version_Info.hpp b/cmake/KokkosKernels_Version_Info.hpp.in similarity index 69% rename from cmake/KokkosKernels_Version_Info.hpp rename to cmake/KokkosKernels_Version_Info.hpp.in index 122c54b85d..52b64bd2e5 100644 --- a/cmake/KokkosKernels_Version_Info.hpp +++ b/cmake/KokkosKernels_Version_Info.hpp.in @@ -22,11 +22,12 @@ namespace KokkosKernels { namespace Impl { -extern std::string GIT_BRANCH; -extern std::string GIT_COMMIT_HASH; -extern std::string GIT_CLEAN_STATUS; -extern std::string GIT_COMMIT_DESCRIPTION; -extern std::string GIT_COMMIT_DATE; +const std::string GIT_BRANCH = R"branch(@GIT_BRANCH@)branch"; +const std::string GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; +const std::string GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; +const std::string GIT_COMMIT_DESCRIPTION = + R"message(@GIT_COMMIT_DESCRIPTION@)message"; +const std::string GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; } // namespace Impl } // namespace KokkosKernels diff --git a/cmake/kokkoskernels_git_info.cmake b/cmake/kokkoskernels_git_info.cmake index 120ffe2c91..86d126591a 100644 --- a/cmake/kokkoskernels_git_info.cmake +++ b/cmake/kokkoskernels_git_info.cmake @@ -3,11 +3,8 @@ find_package(Git QUIET) SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) -SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) - -SET(pre_configure_file ${pre_configure_dir}/KokkosKernels_Version_Info.cpp.in) -SET(post_configure_file 
${post_configure_dir}/KokkosKernels_Version_Info.cpp) +SET(pre_configure_file ${CURRENT_LIST_DIR}/KokkosKernels_Version_Info.hpp.in) +SET(post_configure_file ${CMAKE_BINARY_DIR}/KokkosKernels_Version_Info.hpp) FUNCTION(check_git_write git_hash git_clean_status) FILE( @@ -29,12 +26,6 @@ FUNCTION(check_git_read git_hash) ENDFUNCTION() FUNCTION(check_git_version) - IF(NOT EXISTS ${post_configure_dir}/KokkosKernels_Version_Info.hpp) - FILE( - COPY ${pre_configure_dir}/KokkosKernels_Version_Info.hpp - DESTINATION ${post_configure_dir}) - ENDIF() - IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOSKERNELS_TOP_SOURCE_DIR}/.git) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) return() @@ -83,11 +74,7 @@ FUNCTION(check_git_version) check_git_read(GIT_HASH_CACHE) - IF(NOT EXISTS ${post_configure_dir}) - file(MAKE_DIRECTORY ${post_configure_dir}) - ENDIF() - - # Only update the git_version.cpp if the hash has changed. This will + # Only update the version header if the hash has changed. This will # prevent us from rebuilding the project more than we need to. 
IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${post_configure_file}) @@ -108,11 +95,7 @@ FUNCTION(check_git_setup) -P ${CURRENT_LIST_DIR}/kokkoskernels_git_info.cmake BYPRODUCTS ${post_configure_file}) - add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/KokkosKernels_Version_Info.cpp) - target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) - target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals) - add_dependencies(impl_git_version AlwaysCheckGit) - + add_dependencies(kokkoskernels AlwaysCheckGit) check_git_version() ENDFUNCTION() diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 4520ac5946..64f4579679 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -116,7 +116,7 @@ IF(KokkosKernels_ENABLE_BENCHMARK) ) TARGET_LINK_LIBRARIES( ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkoskernels impl_git_version + PRIVATE benchmark::benchmark Kokkos::kokkoskernels ) FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) SET_SOURCE_FILES_PROPERTIES( From e8b2d6cd0abe4f833219368cbbfe7c6cd5e8637e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 14 Mar 2023 16:32:27 +0100 Subject: [PATCH 141/442] Use constexpr variables for git info --- cmake/KokkosKernels_Version_Info.hpp.in | 12 ++++++------ perf_test/BenchmarkMain.cpp | 2 -- perf_test/Benchmark_Context.hpp | 20 +++++++++++--------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cmake/KokkosKernels_Version_Info.hpp.in b/cmake/KokkosKernels_Version_Info.hpp.in index 52b64bd2e5..bf55183f01 100644 --- a/cmake/KokkosKernels_Version_Info.hpp.in +++ b/cmake/KokkosKernels_Version_Info.hpp.in @@ -17,17 +17,17 @@ #ifndef KOKKOSKERNELS_GIT_VERSION_INFO_H #define KOKKOSKERNELS_GIT_VERSION_INFO_H -#include +#include namespace KokkosKernels { namespace Impl { -const std::string GIT_BRANCH = R"branch(@GIT_BRANCH@)branch"; -const std::string GIT_COMMIT_HASH = 
"@GIT_COMMIT_HASH@"; -const std::string GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; -const std::string GIT_COMMIT_DESCRIPTION = +constexpr std::string_view GIT_BRANCH = R"branch(@GIT_BRANCH@)branch"; +constexpr std::string_view GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; +constexpr std::string_view GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; +constexpr std::string_view GIT_COMMIT_DESCRIPTION = R"message(@GIT_COMMIT_DESCRIPTION@)message"; -const std::string GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; +constexpr std::string_view GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; } // namespace Impl } // namespace KokkosKernels diff --git a/perf_test/BenchmarkMain.cpp b/perf_test/BenchmarkMain.cpp index 2128c38f14..bc0ff87759 100644 --- a/perf_test/BenchmarkMain.cpp +++ b/perf_test/BenchmarkMain.cpp @@ -21,8 +21,6 @@ #include #include -#include - static void BM_StringCreation(benchmark::State& state) { for (auto _ : state) std::string empty_string; } diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index f661c85427..cf987c222c 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -69,15 +69,17 @@ void add_kokkos_configuration(bool verbose) { inline void add_git_info() { if (!KokkosKernels::Impl::GIT_BRANCH.empty()) { - benchmark::AddCustomContext("GIT_BRANCH", KokkosKernels::Impl::GIT_BRANCH); - benchmark::AddCustomContext("GIT_COMMIT_HASH", - KokkosKernels::Impl::GIT_COMMIT_HASH); - benchmark::AddCustomContext("GIT_CLEAN_STATUS", - KokkosKernels::Impl::GIT_CLEAN_STATUS); - benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", - KokkosKernels::Impl::GIT_COMMIT_DESCRIPTION); - benchmark::AddCustomContext("GIT_COMMIT_DATE", - KokkosKernels::Impl::GIT_COMMIT_DATE); + benchmark::AddCustomContext("GIT_BRANCH", + std::string(KokkosKernels::Impl::GIT_BRANCH)); + benchmark::AddCustomContext( + "GIT_COMMIT_HASH", std::string(KokkosKernels::Impl::GIT_COMMIT_HASH)); + benchmark::AddCustomContext( + "GIT_CLEAN_STATUS", 
std::string(KokkosKernels::Impl::GIT_CLEAN_STATUS)); + benchmark::AddCustomContext( + "GIT_COMMIT_DESCRIPTION", + std::string(KokkosKernels::Impl::GIT_COMMIT_DESCRIPTION)); + benchmark::AddCustomContext( + "GIT_COMMIT_DATE", std::string(KokkosKernels::Impl::GIT_COMMIT_DATE)); } } From a21ce0982679ea84fbfa67f3f41315a854552be4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 14 Mar 2023 21:18:48 +0100 Subject: [PATCH 142/442] Enable benchmarks in CI --- .github/workflows/osx.yml | 1 + CMakeLists.txt | 5 +---- perf_test/CMakeLists.txt | 2 ++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 6f4e362d89..172506222d 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -91,6 +91,7 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized" \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkosKernels_ENABLE_BENCHMARK=ON \ -DKokkosKernels_ENABLE_TESTS=ON \ -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ diff --git a/CMakeLists.txt b/CMakeLists.txt index a89354a765..f89fe95b66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -396,10 +396,7 @@ ELSE() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) - IF (KokkosKernels_ENABLE_PERFTESTS) - MESSAGE(STATUS "Enabling perf tests.") - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) - ENDIF () + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) IF (KokkosKernels_ENABLE_EXAMPLES) MESSAGE(STATUS "Enabling examples.") KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 64f4579679..d134ecee01 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -1,4 +1,6 @@ if (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(STATUS "Enabling 
perf tests.") + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) From 72de68a8d8be2a0cc08cdc32b2839f7df5618c83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 15 Mar 2023 12:41:28 +0100 Subject: [PATCH 143/442] Revert "Enable benchmarks in CI" This reverts commit 723bd51a2b83b2a59cbf5287cc32e2a0a0fa30b6. --- .github/workflows/osx.yml | 1 - CMakeLists.txt | 5 ++++- perf_test/CMakeLists.txt | 2 -- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 172506222d..6f4e362d89 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -91,7 +91,6 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized" \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkosKernels_ENABLE_BENCHMARK=ON \ -DKokkosKernels_ENABLE_TESTS=ON \ -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ diff --git a/CMakeLists.txt b/CMakeLists.txt index f89fe95b66..a89354a765 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -396,7 +396,10 @@ ELSE() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) + IF (KokkosKernels_ENABLE_PERFTESTS) + MESSAGE(STATUS "Enabling perf tests.") + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) + ENDIF () IF (KokkosKernels_ENABLE_EXAMPLES) MESSAGE(STATUS "Enabling examples.") KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index d134ecee01..64f4579679 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -1,6 +1,4 @@ if (KokkosKernels_ENABLE_PERFTESTS) - MESSAGE(STATUS "Enabling perf tests.") - 
KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) From 54d70dc83c66cb7809a24d7f946c00c05f1244bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 16 Mar 2023 16:11:52 +0100 Subject: [PATCH 144/442] Remove sample benchmark --- perf_test/BenchmarkMain.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/perf_test/BenchmarkMain.cpp b/perf_test/BenchmarkMain.cpp index bc0ff87759..ce74d8dba5 100644 --- a/perf_test/BenchmarkMain.cpp +++ b/perf_test/BenchmarkMain.cpp @@ -21,12 +21,6 @@ #include #include -static void BM_StringCreation(benchmark::State& state) { - for (auto _ : state) std::string empty_string; -} - -BENCHMARK(BM_StringCreation)->ArgNames({"N", "R"})->Args({100'000, 1'000}); - int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); From 9cc9328c7aaa561bf690a883622fa4e4f8e21faa Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 21 Feb 2023 18:47:54 +0100 Subject: [PATCH 145/442] #5: added TplsVersion file and print methods --- .../src/KokkosKernels_PrintConfiguration.hpp | 38 +++++++----- common/src/KokkosKernels_TplsVersion.hpp | 58 +++++++++++++++++++ 2 files changed, 81 insertions(+), 15 deletions(-) create mode 100644 common/src/KokkosKernels_TplsVersion.hpp diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index 99775f388c..eeba1b5e20 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -18,11 +18,31 @@ #define _KOKKOSKERNELS_PRINT_CONFIGURATION_HPP #include "KokkosKernels_config.h" - +#include "KokkosKernels_TplsVersion.hpp" #include namespace KokkosKernels { namespace Impl { + +inline void print_cublas_version(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: " << get_cublas_version() << "\n"; +#else + os 
<< " " + << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: no\n"; +#endif +} + +inline void print_cusparse_version(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << get_cusparse_version() << "\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; +#endif +} inline void print_enabled_tpls(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK os << " " @@ -73,20 +93,8 @@ inline void print_enabled_tpls(std::ostream& os) { os << " " << "KOKKOSKERNELS_ENABLE_TPL_MKL: no\n"; #endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS - os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: yes\n"; -#else - os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: no\n"; -#endif -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: yes\n"; -#else - os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; -#endif + print_cublas_version(os); + print_cusparse_version(os); #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; diff --git a/common/src/KokkosKernels_TplsVersion.hpp b/common/src/KokkosKernels_TplsVersion.hpp new file mode 100644 index 0000000000..881c78e271 --- /dev/null +++ b/common/src/KokkosKernels_TplsVersion.hpp @@ -0,0 +1,58 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_TPLS_VERSIONS_HPP +#define _KOKKOSKERNELS_TPLS_VERSIONS_HPP + +#include "KokkosKernels_config.h" +#include + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +#include "cublas_v2.h" +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#include "cusparse.h" +#endif + +namespace KokkosKernels { + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +inline std::string get_cublas_version() { + // Print version + std::stringstream ss; + + ss << CUBLAS_VER_MAJOR << "." << CUBLAS_VER_MINOR << "." << CUBLAS_VER_PATCH + << CUBLAS_VER_BUILD; + + return ss.str(); +} +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +inline std::string get_cusparse_version() { + // Print version + std::stringstream ss; + + ss << CUSPARSE_VER_MAJOR << "." << CUSPARSE_VER_MINOR << "." + << CUSPARSE_VER_PATCH << CUSPARSE_VER_BUILD; + + return ss.str(); +} +#endif + +} // namespace KokkosKernels +#endif // _KOKKOSKERNELS_TPLS_VERSIONS_HPP From b6f4c80e9b6d1e43999305c0a0453774ae335a1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 14 Mar 2023 17:52:27 +0100 Subject: [PATCH 146/442] Rename functions --- common/src/KokkosKernels_PrintConfiguration.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index eeba1b5e20..de60e38713 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -24,7 +24,7 @@ namespace KokkosKernels { namespace Impl { -inline void print_cublas_version(std::ostream& os) { +inline void print_cublas_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: " << get_cublas_version() << "\n"; @@ -34,7 +34,7 @@ inline void print_cublas_version(std::ostream& os) { #endif } -inline void 
print_cusparse_version(std::ostream& os) { +inline void print_cusparse_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << get_cusparse_version() << "\n"; @@ -93,8 +93,8 @@ inline void print_enabled_tpls(std::ostream& os) { os << " " << "KOKKOSKERNELS_ENABLE_TPL_MKL: no\n"; #endif - print_cublas_version(os); - print_cusparse_version(os); + print_cublas_version_if_enabled(os); + print_cusparse_version_if_enabled(os); #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; From 323cefa5d224af40742370d79b339a34ca29f60a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 15 Mar 2023 20:46:00 +0100 Subject: [PATCH 147/442] Do not print CUBLAS_VER_BUILD CUBLAS_VER_BUILD is not defined in CUDA-10.1.243. --- common/src/KokkosKernels_TplsVersion.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/common/src/KokkosKernels_TplsVersion.hpp b/common/src/KokkosKernels_TplsVersion.hpp index 881c78e271..028cf0f8e4 100644 --- a/common/src/KokkosKernels_TplsVersion.hpp +++ b/common/src/KokkosKernels_TplsVersion.hpp @@ -35,8 +35,7 @@ inline std::string get_cublas_version() { // Print version std::stringstream ss; - ss << CUBLAS_VER_MAJOR << "." << CUBLAS_VER_MINOR << "." << CUBLAS_VER_PATCH - << CUBLAS_VER_BUILD; + ss << CUBLAS_VER_MAJOR << "." << CUBLAS_VER_MINOR << "." 
<< CUBLAS_VER_PATCH; return ss.str(); } From ee059d078446f6290d3cc00a17a6aff7afbda702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 16 Mar 2023 17:01:43 +0100 Subject: [PATCH 148/442] Improve readability --- common/src/KokkosKernels_PrintConfiguration.hpp | 5 +++-- common/src/KokkosKernels_TplsVersion.hpp | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/common/src/KokkosKernels_PrintConfiguration.hpp b/common/src/KokkosKernels_PrintConfiguration.hpp index de60e38713..cd2333b3ec 100644 --- a/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/common/src/KokkosKernels_PrintConfiguration.hpp @@ -27,7 +27,7 @@ namespace Impl { inline void print_cublas_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: " << get_cublas_version() << "\n"; + << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: " << cublas_version_string() << "\n"; #else os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUBLAS: no\n"; @@ -37,7 +37,8 @@ inline void print_cublas_version_if_enabled(std::ostream& os) { inline void print_cusparse_version_if_enabled(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE os << " " - << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << get_cusparse_version() << "\n"; + << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: " << cusparse_version_string() + << "\n"; #else os << " " << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; diff --git a/common/src/KokkosKernels_TplsVersion.hpp b/common/src/KokkosKernels_TplsVersion.hpp index 028cf0f8e4..38de7c1399 100644 --- a/common/src/KokkosKernels_TplsVersion.hpp +++ b/common/src/KokkosKernels_TplsVersion.hpp @@ -18,7 +18,7 @@ #define _KOKKOSKERNELS_TPLS_VERSIONS_HPP #include "KokkosKernels_config.h" -#include +#include #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) #include "cublas_v2.h" @@ -31,7 +31,7 @@ namespace KokkosKernels { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) -inline std::string get_cublas_version() { +inline std::string 
cublas_version_string() { // Print version std::stringstream ss; @@ -42,12 +42,12 @@ inline std::string get_cublas_version() { #endif #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) -inline std::string get_cusparse_version() { +inline std::string cusparse_version_string() { // Print version std::stringstream ss; ss << CUSPARSE_VER_MAJOR << "." << CUSPARSE_VER_MINOR << "." - << CUSPARSE_VER_PATCH << CUSPARSE_VER_BUILD; + << CUSPARSE_VER_PATCH << "." << CUSPARSE_VER_BUILD; return ss.str(); } From 1e0fb0249c317b1a7dde82dd23f9d896f38f7f38 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 3 Feb 2023 12:06:12 -0700 Subject: [PATCH 149/442] Fix/enhance backend issues on spadd perftest - Add support for SYCL, HIP backends - Always allow running on Serial by passing in no backend flags - before, this would fail because an invalid device_id was passed to Kokkos::initialize - Print the Kokkos configuration - Print out which backend is actually being run --- perf_test/sparse/KokkosSparse_spadd.cpp | 123 ++++++++++++++++-------- 1 file changed, 82 insertions(+), 41 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 13a7c26d2e..1fa0418a7a 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -32,12 +32,10 @@ #include #endif -#if defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) - struct Params { int use_cuda = 0; + int use_hip = 0; + int use_sycl = 0; int use_openmp = 0; int use_threads = 0; int use_mkl = 0; @@ -75,7 +73,6 @@ void run_experiment(const Params& params) { using entries_t = typename graph_t::entries_type::non_const_type; using values_t = typename crsMat_t::values_type::non_const_type; - std::cout << "************************************* \n"; std::cout << "************************************* \n"; crsMat_t A; crsMat_t B; @@ -335,10 +332,14 @@ void run_experiment(const Params& 
params) { void print_options() { std::cerr << "Options\n" << std::endl; - std::cerr - << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp " - "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" - << std::endl; + std::cerr << "\t[Required] BACKEND:\n" + << "\t\t'--threads [numThreads]' |\n" + << "\t\t'--openmp [numThreads]' |\n" + << "\t\t'--cuda [deviceIndex]' |\n" + << "\t\t'--hip [deviceIndex]' |\n" + << "\t\t'--sycl [deviceIndex]'\n\n" + << "\tIf no parallel backend is requested, Serial will be used " + "(if enabled)\n\n"; std::cerr << "\t[Optional] --amtx :: 1st input matrix" << std::endl; std::cerr << "\t[Optional] --bmtx :: 2nd input matrix" << std::endl; @@ -383,6 +384,12 @@ int parse_inputs(Params& params, int argc, char** argv) { params.use_openmp = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { params.use_cuda = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { + params.use_hip = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--mkl")) { params.use_mkl = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--cusparse")) { @@ -435,15 +442,26 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads - const int device_id = params.use_cuda - 1; + // Assumption is that use_openmp/use_threads variables are provided + // as numbers of threads + int num_threads = 1; + if (params.use_openmp) + num_threads = params.use_openmp; + else if (params.use_threads) + num_threads = params.use_threads; + int device_id = 0; + if (params.use_cuda) + device_id = 
params.use_cuda - 1; + else if (params.use_hip) + device_id = params.use_hip - 1; + else if (params.use_sycl) + device_id = params.use_sycl - 1; Kokkos::initialize(Kokkos::InitializationSettings() .set_num_threads(num_threads) .set_device_id(device_id)); - // Kokkos::print_configuration(std::cout); + Kokkos::print_configuration(std::cout); + std::cout << '\n'; // First, make sure that requested TPL (if any) is actually available #if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) @@ -457,10 +475,7 @@ int main(int argc, char** argv) { "To run cuSPARSE SpAdd, must enable the cuSPARSE TPL in cmake"); #endif - bool useOMP = params.use_openmp != 0; - bool useCUDA = params.use_cuda != 0; - - if (params.use_cusparse && !useCUDA) { + if (params.use_cusparse && !params.use_cuda) { throw std::invalid_argument( "To run cuSPARSE SpAdd, must supply the '--cuda ' flag"); } @@ -470,56 +485,82 @@ int main(int argc, char** argv) { "If running MKL, can't output the result to file"); } - bool useSerial = !useOMP && !useCUDA; + bool ran = false; - if (useOMP) { + if (params.use_openmp) { #if defined(KOKKOS_ENABLE_OPENMP) + std::cout << "Running on OpenMP backend.\n"; using crsMat_t = KokkosSparse::CrsMatrix; run_experiment(params); + ran = true; #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; #endif } - if (useCUDA) { + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + std::cout << "Running on Threads backend.\n"; + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); + ran = true; +#else + std::cout << "ERROR: Threads requested, but not available.\n"; + return 1; +#endif + } + if (params.use_cuda) { #if defined(KOKKOS_ENABLE_CUDA) + std::cout << "Running on Cuda backend.\n"; using crsMat_t = KokkosSparse::CrsMatrix; run_experiment(params); + ran = true; #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; #endif } - if (useSerial) { + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + std::cout << 
"Running on HIP backend.\n"; + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); + ran = true; +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + std::cout << "Running on SYCL backend.\n"; + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); + ran = true; +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + if (!ran) { + // lastly #if defined(KOKKOS_ENABLE_SERIAL) + std::cout << "Running on Serial backend.\n"; using crsMat_t = KokkosSparse::CrsMatrix; run_experiment(params); #else - std::cout << "ERROR: Serial device requested, but not available.\n"; + std::cout << "ERROR: Tried to run on Serial device (as no parallel " + "backends requested), but Serial is not enabled.\n"; return 1; #endif } Kokkos::finalize(); return 0; } - -#else -int main() { -#if !defined(KOKKOSKERNELS_INST_DOUBLE) - std::cout << " not defined KOKKOSKERNELS_INST_DOUBLE" << std::endl; -#endif - -#if !defined(KOKKOSKERNELS_INST_OFFSET_INT) - std::cout << " not defined KOKKOSKERNELS_INST_OFFSET_INT" << std::endl; - -#endif - -#if !defined(KOKKOSKERNELS_INST_ORDINAL_INT) - std::cout << " not defined KOKKOSKERNELS_INST_ORDINAL_INT" << std::endl; - -#endif -} -#endif From 2dff920639826d2b05653830f82ca13ec8c7c0f0 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 3 Feb 2023 12:58:17 -0700 Subject: [PATCH 150/442] Avoid errors about not finalizing Kokkos --- perf_test/sparse/KokkosSparse_spadd.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 1fa0418a7a..4b65878e26 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -496,6 +496,7 @@ int main(int argc, char** argv) { ran = true; #else std::cout << "ERROR: OpenMP requested, but not available.\n"; + Kokkos::finalize(); return 
1; #endif } @@ -508,6 +509,7 @@ int main(int argc, char** argv) { ran = true; #else std::cout << "ERROR: Threads requested, but not available.\n"; + Kokkos::finalize(); return 1; #endif } @@ -520,6 +522,7 @@ int main(int argc, char** argv) { ran = true; #else std::cout << "ERROR: CUDA requested, but not available.\n"; + Kokkos::finalize(); return 1; #endif } @@ -532,6 +535,7 @@ int main(int argc, char** argv) { ran = true; #else std::cout << "ERROR: HIP requested, but not available.\n"; + Kokkos::finalize(); return 1; #endif } @@ -545,6 +549,7 @@ int main(int argc, char** argv) { ran = true; #else std::cout << "ERROR: SYCL requested, but not available.\n"; + Kokkos::finalize(); return 1; #endif } @@ -558,6 +563,7 @@ int main(int argc, char** argv) { #else std::cout << "ERROR: Tried to run on Serial device (as no parallel " "backends requested), but Serial is not enabled.\n"; + Kokkos::finalize(); return 1; #endif } From 2cfc5082be800882a0f0676edcb8edfb3486f45c Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 16 Mar 2023 14:38:51 -0600 Subject: [PATCH 151/442] spadd perf test: use common infrastructure --- perf_test/sparse/KokkosSparse_spadd.cpp | 399 +++++++++--------------- 1 file changed, 141 insertions(+), 258 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 4b65878e26..f69d24d523 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -20,8 +20,12 @@ #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" #include "KokkosSparse_Utils_mkl.hpp" -#include "KokkosSparse_spadd.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include "KokkosSparse_spadd.hpp" + +using perf_test::CommonInputParams; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include @@ -32,15 +36,10 @@ #include #endif -struct Params { - int use_cuda = 0; - int use_hip = 0; - int use_sycl = 0; - int use_openmp = 0; - 
int use_threads = 0; - int use_mkl = 0; - int use_cusparse = 0; - bool sorted = true; +struct LocalParams { + bool use_mkl = false; + bool use_cusparse = false; + bool sorted = true; std::string amtx; std::string bmtx; std::string cmtx; @@ -53,17 +52,113 @@ struct Params { int numericRepeat = 1; // how many times to call numeric per overall run }; -template -void run_experiment(const Params& params) { +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --amtx :: 1st input matrix" << std::endl; + std::cerr << "\t[Optional] --bmtx :: 2nd input matrix" << std::endl; + std::cerr << "\t[Optional] --cmtx :: output matrix for C = A+B" + << std::endl; + std::cerr << "\t[Optional] --mkl :: run SpAdd from MKL" << std::endl; + std::cerr << "\t[Optional] --cusparse :: run SpAdd from cuSPARSE " + << std::endl; + std::cerr << "\t[Optional] --sorted :: sort rows of inputs, and run the " + "sorted algorithm" + << std::endl; + std::cerr << "\t[Optional] --unsorted :: run the unsorted algorithm" + << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + "spadd (symbolic + repeated numeric)" + << std::endl; + std::cerr << "\t[Optional] --numeric-repeat :: how many times to repeat " + "numeric per symbolic" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\nSettings for randomly generated A/B matrices" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; + std::cerr + << "\t[Optional] --nnz :: number of entries per row to generate" + << std::endl; + std::cerr << "\t[Optional] --bdiag :: generate B as a diagonal matrix" + << std::endl; +} + +int parse_inputs(LocalParams& params, int argc, char** argv) { + bool printHelp = false; + bool discard; + for (int i = 1; i < argc; ++i) 
{ + // if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { + // ++i; + if (perf_test::check_arg_bool(i, argc, argv, "--mkl", params.use_mkl)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--cusparse", + params.use_cusparse)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--sorted", + params.sorted)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--unsorted", + discard)) { + params.sorted = false; + } else if (perf_test::check_arg_str(i, argc, argv, "--amtx", params.amtx)) { + // A at C=AxB + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--bmtx", params.bmtx)) { + // B at C=AxB. + // if not provided, C = AxA will be performed. + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--cmtx", params.cmtx)) { + // if provided, C will be written to given file. + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--nnz", + params.nnzPerRow)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--bdiag", + params.bDiag)) { + } else if (perf_test::check_arg_int(i, argc, argv, "--repeat", + params.repeat)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--numeric-repeat", + params.numericRepeat)) { + // Reuse the symbolic step this many times. 
+ ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + if (printHelp) { + print_options(); + return 1; + } + return 0; +} + +template +void run_experiment(int argc, char** argv, CommonInputParams) { using namespace KokkosSparse; using namespace KokkosSparse::Experimental; - using size_type = typename crsMat_t::size_type; - using lno_t = typename crsMat_t::ordinal_type; - using scalar_t = typename crsMat_t::value_type; - using device_t = typename crsMat_t::device_type; - using exec_space = typename device_t::execution_space; - using mem_space = typename device_t::memory_space; + using mem_space = typename exec_space::memory_space; + using device_t = typename Kokkos::Device; + using size_type = default_size_type; + using lno_t = default_lno_t; + using scalar_t = default_scalar; + using crsMat_t = + KokkosSparse::CrsMatrix; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exec_space, mem_space, mem_space>; @@ -73,6 +168,30 @@ void run_experiment(const Params& params) { using entries_t = typename graph_t::entries_type::non_const_type; using values_t = typename crsMat_t::values_type::non_const_type; + LocalParams params; + if (parse_inputs(params, argc, argv)) return; + + // First, make sure that requested TPL (if any) is actually available +#if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + if (params.use_mkl) + throw std::invalid_argument( + "To run MKL SpAdd, must enable the MKL TPL in cmake"); +#endif +#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + if (params.use_cusparse) + throw std::invalid_argument( + "To run cuSPARSE SpAdd, must enable the cuSPARSE TPL in cmake"); +#else + if 
(params.use_cusparse && !std::is_same::value) + throw std::invalid_argument( + "To run cuSPARSE SpAdd, must select the Cuda backend"); +#endif + + if (params.cmtx.length() && params.use_mkl) { + throw std::invalid_argument( + "If running MKL, can't output the result to file"); + } + std::cout << "************************************* \n"; crsMat_t A; crsMat_t B; @@ -329,244 +448,8 @@ void run_experiment(const Params& params) { } } -void print_options() { - std::cerr << "Options\n" << std::endl; - - std::cerr << "\t[Required] BACKEND:\n" - << "\t\t'--threads [numThreads]' |\n" - << "\t\t'--openmp [numThreads]' |\n" - << "\t\t'--cuda [deviceIndex]' |\n" - << "\t\t'--hip [deviceIndex]' |\n" - << "\t\t'--sycl [deviceIndex]'\n\n" - << "\tIf no parallel backend is requested, Serial will be used " - "(if enabled)\n\n"; - - std::cerr << "\t[Optional] --amtx :: 1st input matrix" << std::endl; - std::cerr << "\t[Optional] --bmtx :: 2nd input matrix" << std::endl; - std::cerr << "\t[Optional] --cmtx :: output matrix for C = A+B" - << std::endl; - std::cerr << "\t[Optional] --mkl :: run SpAdd from MKL" << std::endl; - std::cerr << "\t[Optional] --cusparse :: run SpAdd from cuSPARSE " - << std::endl; - std::cerr << "\t[Optional] --sorted :: sort rows of inputs, and run the " - "sorted algorithm" - << std::endl; - std::cerr << "\t[Optional] --unsorted :: run the unsorted algorithm" - << std::endl; - std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " - "spadd (symbolic + repeated numeric)" - << std::endl; - std::cerr << "\t[Optional] --numeric-repeat :: how many times to repeat " - "numeric per symbolic" - << std::endl; - std::cerr << "\t[Optional] --verbose :: enable verbose output" - << std::endl; - std::cerr << "\nSettings for randomly generated A/B matrices" << std::endl; - std::cerr << "\t[Optional] --m :: number of rows to generate" - << std::endl; - std::cerr << "\t[Optional] --n :: number of cols to generate" - << std::endl; - std::cerr - << 
"\t[Optional] --nnz :: number of entries per row to generate" - << std::endl; - std::cerr - << "\t[Optional] --nnz :: number of entries per row to generate" - << std::endl; - std::cerr << "\t[Optional] --bdiag :: generate B as a diagonal matrix" - << std::endl; -} - -int parse_inputs(Params& params, int argc, char** argv) { - for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { - params.use_sycl = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(argv[++i]) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--mkl")) { - params.use_mkl = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--cusparse")) { - params.use_cusparse = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--sorted")) { - params.sorted = true; - } else if (0 == Test::string_compare_no_case(argv[i], "--unsorted")) { - params.sorted = false; - } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { - // A at C=AxB - params.amtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--bmtx")) { - // B at C=AxB. - // if not provided, C = AxA will be performed. - params.bmtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--cmtx")) { - // if provided, C will be written to given file. - // has to have ".bin", or ".crs" extension. 
- params.cmtx = argv[++i]; - } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { - params.m = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { - params.n = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--nnz")) { - params.nnzPerRow = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--bdiag")) { - params.bDiag = true; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { - // if provided, C will be written to given file. - // has to have ".bin", or ".crs" extension. - params.repeat = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--numeric-repeat")) { - // Reuse the symbolic step this many times. - params.numericRepeat = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { - params.verbose = true; - } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; - print_options(); - return 1; - } - } - return 0; -} - +#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment +#include "KokkosKernels_perf_test_instantiation.hpp" int main(int argc, char** argv) { - Params params; - - if (parse_inputs(params, argc, argv)) { - return 1; - } - // Assumption is that use_openmp/use_threads variables are provided - // as numbers of threads - int num_threads = 1; - if (params.use_openmp) - num_threads = params.use_openmp; - else if (params.use_threads) - num_threads = params.use_threads; - int device_id = 0; - if (params.use_cuda) - device_id = params.use_cuda - 1; - else if (params.use_hip) - device_id = params.use_hip - 1; - else if (params.use_sycl) - device_id = params.use_sycl - 1; - - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - Kokkos::print_configuration(std::cout); - std::cout << '\n'; - - // First, make sure that requested TPL (if any) is actually available -#if 
!defined(KOKKOSKERNELS_ENABLE_TPL_MKL) - if (params.use_mkl) - throw std::invalid_argument( - "To run MKL SpAdd, must enable the MKL TPL in cmake"); -#endif -#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - if (params.use_cusparse) - throw std::invalid_argument( - "To run cuSPARSE SpAdd, must enable the cuSPARSE TPL in cmake"); -#endif - - if (params.use_cusparse && !params.use_cuda) { - throw std::invalid_argument( - "To run cuSPARSE SpAdd, must supply the '--cuda ' flag"); - } - - if (params.cmtx.length() && params.use_mkl) { - throw std::invalid_argument( - "If running MKL, can't output the result to file"); - } - - bool ran = false; - - if (params.use_openmp) { -#if defined(KOKKOS_ENABLE_OPENMP) - std::cout << "Running on OpenMP backend.\n"; - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); - ran = true; -#else - std::cout << "ERROR: OpenMP requested, but not available.\n"; - Kokkos::finalize(); - return 1; -#endif - } - if (params.use_threads) { -#if defined(KOKKOS_ENABLE_THREADS) - std::cout << "Running on Threads backend.\n"; - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); - ran = true; -#else - std::cout << "ERROR: Threads requested, but not available.\n"; - Kokkos::finalize(); - return 1; -#endif - } - if (params.use_cuda) { -#if defined(KOKKOS_ENABLE_CUDA) - std::cout << "Running on Cuda backend.\n"; - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); - ran = true; -#else - std::cout << "ERROR: CUDA requested, but not available.\n"; - Kokkos::finalize(); - return 1; -#endif - } - if (params.use_hip) { -#if defined(KOKKOS_ENABLE_HIP) - std::cout << "Running on HIP backend.\n"; - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); - ran = true; -#else - std::cout << "ERROR: HIP requested, but not available.\n"; - Kokkos::finalize(); - return 1; -#endif - } - if (params.use_sycl) { -#if defined(KOKKOS_ENABLE_SYCL) - std::cout << "Running on SYCL backend.\n"; - using crsMat_t 
= - KokkosSparse::CrsMatrix; - run_experiment(params); - ran = true; -#else - std::cout << "ERROR: SYCL requested, but not available.\n"; - Kokkos::finalize(); - return 1; -#endif - } - if (!ran) { - // lastly -#if defined(KOKKOS_ENABLE_SERIAL) - std::cout << "Running on Serial backend.\n"; - using crsMat_t = - KokkosSparse::CrsMatrix; - run_experiment(params); -#else - std::cout << "ERROR: Tried to run on Serial device (as no parallel " - "backends requested), but Serial is not enabled.\n"; - Kokkos::finalize(); - return 1; -#endif - } - Kokkos::finalize(); - return 0; -} + return main_instantiation(argc, argv); +} // main From 60881471b1cabd824ecf54675157fb9f74c97301 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Sat, 18 Mar 2023 11:49:20 -0600 Subject: [PATCH 152/442] Remove unused variable (#1734) --- sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 3096278c21..5f555f926e 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -519,7 +519,6 @@ void spgemm_numeric_mkl( const_cast(colidxB.data()), const_cast(valuesB.data())); auto mklSpgemmHandle = handle->get_mkl_spgemm_handle(); - bool computedEntries = false; matrix_descr generalDescr; generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; generalDescr.mode = SPARSE_FILL_MODE_FULL; From 664bfc4d3e2144feaa279eb7cf66bb6b5adaee05 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Sat, 18 Mar 2023 11:49:38 -0600 Subject: [PATCH 153/442] Fix kk_generate_diagonally_dominant_sparse_matrix hang (#1689) * Fix kk_generate_diagonally_dominant_sparse_matrix hang Use bandwidth to cap the max entries per row, so that the row-filling loop doesn't run forever looking for a column that isn't already present. 
* Diag-dominant matrix generator: error if bandwidth too small If bandwidth is too small for the requested nnz and row_size_variance, error out with a detailed message. --- sparse/src/KokkosSparse_IOUtils.hpp | 41 ++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index d957cf4949..77934b4f3e 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -117,16 +117,42 @@ void kk_diagonally_dominant_sparseMatrix_generate( OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd, ScalarType diagDominance = 10 * Kokkos::ArithTraits::one()) { - rowPtr = new SizeType[nrows + 1]; - + rowPtr = new SizeType[nrows + 1]; OrdinalType elements_per_row = nnz / nrows; + // Set a hard limit to the actual entries in any one row, so that the + // loop to find a column not already taken will terminate quickly. 
+ OrdinalType max_elements_per_row = 0.7 * bandwidth; + OrdinalType requested_max_elements_per_row = + elements_per_row + 0.5 * row_size_variance; + if (requested_max_elements_per_row > max_elements_per_row) { + std::cerr + << "kk_diagonally_dominant_sparseMatrix_generate: given the bandwidth (" + << bandwidth << "),\n"; + std::cerr << " can insert a maximum of " << max_elements_per_row + << " entries per row (0.7*bandwidth).\n"; + std::cerr << " But given the requested average entries per row of " + << elements_per_row << " and variance of " << row_size_variance + << ",\n"; + std::cerr << " there should be up to " << requested_max_elements_per_row + << " entries per row.\n"; + std::cerr << " Increase the bandwidth, or decrease nnz and/or " + "row_size_variance.\n"; + throw std::invalid_argument( + "kk_diagonally_dominant_sparseMatrix_generate: requested too many " + "entries per row for the given bandwidth."); + } srand(13721); rowPtr[0] = 0; for (int row = 0; row < nrows; row++) { - int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; - if (varianz < 1) varianz = 1; - if (varianz > 0.75 * ncols) varianz = 0.75 * ncols; - rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz; + // variance is how many more (or less) entries this row has compared to the + // mean (elements_per_row). 
+ OrdinalType variance = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; + OrdinalType entries_in_row = elements_per_row + variance; + // Always have at least one entry (for the diagonal) + if (entries_in_row < 1) entries_in_row = 1; + if (entries_in_row > max_elements_per_row) + entries_in_row = max_elements_per_row; + rowPtr[row + 1] = rowPtr[row] + entries_in_row; if (rowPtr[row + 1] <= rowPtr[row]) // This makes sure that there is rowPtr[row + 1] = rowPtr[row] + 1; // at least one nonzero in the row } @@ -141,6 +167,9 @@ void kk_diagonally_dominant_sparseMatrix_generate( for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) { while (true) { OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; + // When bandwidth would extend past the columns of the matrix, wrap + // the entry around to the other side. This means the final matrix can + // actually have structural bandwidth close to ncols. while (pos < 0) pos += ncols; while (pos >= ncols) pos -= ncols; From 55f24857ee7b45f9be1d6c02ac3a009dde221911 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 20 Mar 2023 09:40:17 -0600 Subject: [PATCH 154/442] perf test utils: fix device ID parsing (#1739) Make the device ids (like "--cuda ") use zero-based numbering to be consistent with behavior of Kokkos and other perf tests. If a machine has one GPU, then "--cuda 0" should select it. Before, "--cuda 1" was necessary to select it, and "--cuda 0" would not enable Cuda at all.
--- perf_test/KokkosKernels_perf_test_utilities.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index b798d55a8e..cc7f70ccec 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -111,6 +111,9 @@ bool check_arg_str(int const i, int const argc, char** argv, char const* name, void parse_common_options(int& argc, char** argv, CommonInputParams& params) { // Skip the program name, start with argIdx=1 int argIdx = 1; + // Note: after parsing a GPU device ID, always add 1 to it. + // If e.g. params.use_cuda is 0, that means CUDA will not be used at all. + // But if it's N, then it means run on CUDA device N-1. while (argIdx < argc) { bool remove_flag = false; if (check_arg_int(argIdx, argc, argv, "--threads", params.use_threads)) { @@ -119,10 +122,13 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { params.use_openmp)) { remove_flag = true; } else if (check_arg_int(argIdx, argc, argv, "--cuda", params.use_cuda)) { + params.use_cuda++; remove_flag = true; } else if (check_arg_int(argIdx, argc, argv, "--hip", params.use_hip)) { + params.use_hip++; remove_flag = true; } else if (check_arg_int(argIdx, argc, argv, "--sycl", params.use_sycl)) { + params.use_sycl++; remove_flag = true; } From fd7f6e515333149e33038a4424146688a4418809 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 20 Mar 2023 10:35:17 -0600 Subject: [PATCH 155/442] cm_test_all_sandia: Add llvm/10.0.1 --- scripts/cm_test_all_sandia | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 866fc45a83..4e1a2254ab 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -779,6 +779,7 @@ elif [ "$MACHINE" = "solo" ]; then if [ "$SPOT_CHECK" = "True" ]; then COMPILERS=( "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ 
$GNU_WARNING_FLAGS" + "llvm/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then COMPILERS=("intel/19.0.5.281 $BASE_MODULE_LIST_INTEL,mkl/19.0.5.281 "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" From bf9ed2aee5fb5b45144e4a54f6039330a2eae32b Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 20 Mar 2023 12:12:35 -0600 Subject: [PATCH 156/442] ParIlut: create and destroy spgemm handle for each usage (#1736) * ParIlut: create and destroy spgemm handle for each usage This fixes memory errors on Cuda * Formatting --- .../impl/KokkosSparse_par_ilut_numeric_impl.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index aa8af73d69..cedc2dbd43 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -78,6 +78,11 @@ struct IlutWrap { const URowMapType& U_row_map, const UEntriesType& U_entries, const UValuesType& U_values, LURowMapType& LU_row_map, LUEntriesType& LU_entries, LUValuesType& LU_values) { + std::string myalg("SPGEMM_KK_MEMORY"); + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = + KokkosSparse::StringToSPGEMMAlgorithm(myalg); + kh.create_spgemm_handle(spgemm_algorithm); + const size_type nrows = ih.get_nrows(); KokkosSparse::Experimental::spgemm_symbolic( @@ -95,6 +100,8 @@ struct IlutWrap { // Need to sort LU CRS if on CUDA! 
sort_crs_matrix(LU_row_map, LU_entries, LU_values); + + kh.destroy_spgemm_handle(); } /** @@ -722,6 +729,8 @@ struct IlutWrap { RRowMapType& R_row_map, REntriesType& R_entries, RValuesType& R_values, LURowMapType& LU_row_map, LUEntriesType& LU_entries, LUValuesType& LU_values) { + scalar_t result; + multiply_matrices(kh, ih, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, LU_row_map, LU_entries, LU_values); @@ -737,8 +746,6 @@ struct IlutWrap { &kh, A_row_map, A_entries, A_values, 1., LU_row_map, LU_entries, LU_values, -1., R_row_map, R_entries, R_values); - scalar_t result; - auto policy = ih.get_default_team_policy(); Kokkos::parallel_reduce( @@ -858,11 +865,6 @@ struct IlutWrap { thandle.get_residual_norm_delta_stop(); const size_type max_iter = thandle.get_max_iter(); - std::string myalg("SPGEMM_KK_MEMORY"); - KokkosSparse::SPGEMMAlgorithm spgemm_algorithm = - KokkosSparse::StringToSPGEMMAlgorithm(myalg); - kh.create_spgemm_handle(spgemm_algorithm); - kh.create_spadd_handle(true /*we expect inputs to be sorted*/); // @@ -975,7 +977,6 @@ struct IlutWrap { ++itr; } - kh.destroy_spgemm_handle(); kh.destroy_spadd_handle(); } // end ilut_numeric From 2a5309b3983643ef3abeb3003b7eaeebc135f322 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 20 Mar 2023 12:36:20 -0600 Subject: [PATCH 157/442] Use concurrency() rather than impl_thread_pool_size() --- blas/unit_test/Test_Blas3_gemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index 9db7f987b9..179afcdecd 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -371,7 +371,7 @@ void test_gemm() { } } } - auto pool_size = TestExecSpace().impl_thread_pool_size(); + auto pool_size = TestExecSpace().concurrency(); if (pool_size >= 2) { Test::impl_test_stream_gemm_psge2( 53, 42, 17, 4.5, From 0e507ae3812c1b5b2a12ccb6f151ffc864aa32da Mon Sep 17 00:00:00 2001 From: Evan Harvey 
Date: Tue, 21 Mar 2023 07:31:46 -0600 Subject: [PATCH 158/442] openblas is now in standard modulepath --- scripts/cm_test_all_sandia | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 4e1a2254ab..102e3b098b 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -157,7 +157,6 @@ fi if [[ "$HOSTNAME" == *solo* ]]; then # Warning: very generic name MACHINE=solo - module use /projects/netpub/openblas/modulefiles module use /projects/netpub/clang/modulefiles fi From 1554ee7a8a0073d5fe380b38e7b68117e93f9dcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 16:23:48 +0100 Subject: [PATCH 159/442] Extract benchmark CMake code into a separate file - extract code into a separate file - use consistent formatting - remove clang-tidy configuration - use `target_include_directories` instead of `include_directories` --- CMakeLists.txt | 3 + cmake/kokkoskernels_benchmarks.cmake | 84 ++++++++++++++++++++++ perf_test/CMakeLists.txt | 100 ++------------------------- 3 files changed, 92 insertions(+), 95 deletions(-) create mode 100644 cmake/kokkoskernels_benchmarks.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index a89354a765..ff5bef95db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,9 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) include_directories(tpls/rajaperf/src) set(KokkosKernels_ENABLE_PERFTESTS ON CACHE BOOL "Whether to build tests including Perfsuite. 
Default: OFF" FORCE) ENDIF() + IF(KokkosKernels_ENABLE_BENCHMARK) + INCLUDE(cmake/kokkoskernels_benchmarks.cmake) + ENDIF() ENDIF () KOKKOSKERNELS_ADD_OPTION( diff --git a/cmake/kokkoskernels_benchmarks.cmake b/cmake/kokkoskernels_benchmarks.cmake new file mode 100644 index 0000000000..1bceb9ea81 --- /dev/null +++ b/cmake/kokkoskernels_benchmarks.cmake @@ -0,0 +1,84 @@ +IF(KOKKOSKERNELS_HAS_TRILINOS) + MESSAGE( + FATAL_ERROR + "Benchmarks are not supported when building as part of Trilinos") +ENDIF() + +FIND_PACKAGE(benchmark QUIET) + +IF(benchmark_FOUND) + MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") +ELSE() + MESSAGE(STATUS "No installed google benchmark found, fetching from GitHub") + INCLUDE(FetchContent) + SET(BENCHMARK_ENABLE_TESTING OFF) + + LIST(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") + + # Note: recent bug (google/benchmark#1441) is preventing us from using + # the latest benchmark release. + FetchContent_Declare( + googlebenchmark + URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz + URL_HASH MD5=14d14849e075af116143a161bc3b927b + ) + FetchContent_MakeAvailable(googlebenchmark) + LIST(POP_BACK CMAKE_MESSAGE_INDENT) + + TARGET_COMPILE_OPTIONS(benchmark PRIVATE -w) + TARGET_COMPILE_OPTIONS(benchmark_main PRIVATE -w) +ENDIF() + +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) + CMAKE_PARSE_ARGUMENTS( + BENCHMARK + "" + "" + "SOURCES" + ${ARGN} + ) + + IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + MESSAGE( + WARNING + "Unexpected arguments when adding a benchmark: " + ${BENCHMARK_UNPARSED_ARGUMENTS} + ) + ENDIF() + + SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) + + ADD_EXECUTABLE( + ${BENCHMARK_NAME} + ${CMAKE_SOURCE_DIR}/perf_test/BenchmarkMain.cpp ${BENCHMARK_SOURCES} + ) + TARGET_LINK_LIBRARIES( + ${BENCHMARK_NAME} + PRIVATE benchmark::benchmark Kokkos::kokkoskernels + ) + TARGET_INCLUDE_DIRECTORIES( + ${BENCHMARK_NAME} + SYSTEM PRIVATE 
${benchmark_SOURCE_DIR}/include + ) + + FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) + SET_SOURCE_FILES_PROPERTIES( + ${SOURCE_FILE} + PROPERTIES LANGUAGE CXX + ) + ENDFOREACH() + + STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + SET( + BENCHMARK_ARGS + --benchmark_counters_tabular=true + --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json + ) + + ADD_TEST( + NAME ${BENCHMARK_NAME} + COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} + ) +ENDFUNCTION() diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 64f4579679..a74e6043fc 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -53,102 +53,12 @@ if (KokkosKernels_ENABLE_PERFTESTS) endif() -IF(KokkosKernels_ENABLE_BENCHMARK) - - IF (KOKKOSKERNELS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") - ENDIF() - - find_package(benchmark QUIET) - - IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") - ELSE() - message(STATUS "No installed google benchmark found, fetching from GitHub") - include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) - - list(APPEND CMAKE_MESSAGE_INDENT " ") - #Note: recent bug (google/benchmark#1441) is preventing us from using - # the latest benchmark release. 
- FetchContent_Declare( - googlebenchmark - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b - ) - FetchContent_MakeAvailable(googlebenchmark) - list(POP_BACK CMAKE_MESSAGE_INDENT) - - include_directories(${benchmark_SOURCE_DIR}/include) - - # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() - - target_compile_options(benchmark PRIVATE -w) - target_compile_options(benchmark_main PRIVATE -w) - ENDIF() - - KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - - FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() - - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkoskernels - ) - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE CXX - ) - ENDFOREACH() - - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) - - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) - ENDFUNCTION() - - SET( - BENCHMARK_SOURCES - BenchmarkMain.cpp - blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp - blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp - blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp - ) - +if(KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + SOURCES + blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp + 
blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp + blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp ) - endif() From 0912b67accf8016ff70cfbcf774d6ccdcda92a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 17:25:30 +0100 Subject: [PATCH 160/442] Include google benchmark lib version in benchmark output --- CMakeLists.txt | 8 ++--- cmake/KokkosKernels_Version_Info.hpp.in | 9 +++--- cmake/kokkoskernels_benchmarks.cmake | 3 +- ...cmake => kokkoskernels_version_info.cmake} | 6 ++-- perf_test/Benchmark_Context.hpp | 32 +++++++++++-------- 5 files changed, 33 insertions(+), 25 deletions(-) rename cmake/{kokkoskernels_git_info.cmake => kokkoskernels_version_info.cmake} (94%) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff5bef95db..346a329d82 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -324,11 +324,11 @@ ELSE() ENDFOREACH() TARGET_INCLUDE_DIRECTORIES(kokkoskernels PUBLIC $) - ENDIF() - IF(NOT KOKKOSKERNELS_HAS_TRILINOS) - INCLUDE(cmake/kokkoskernels_git_info.cmake) - check_git_setup() + IF(KokkosKernels_ENABLE_BENCHMARK) + INCLUDE(cmake/kokkoskernels_version_info.cmake) + check_version_info() + ENDIF() ENDIF() # FIXME_SYCL waiting for compiler support diff --git a/cmake/KokkosKernels_Version_Info.hpp.in b/cmake/KokkosKernels_Version_Info.hpp.in index bf55183f01..62bcaed88c 100644 --- a/cmake/KokkosKernels_Version_Info.hpp.in +++ b/cmake/KokkosKernels_Version_Info.hpp.in @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOSKERNELS_GIT_VERSION_INFO_H -#define KOKKOSKERNELS_GIT_VERSION_INFO_H +#ifndef KOKKOSKERNELS_VERSION_INFO_HPP +#define KOKKOSKERNELS_VERSION_INFO_HPP #include @@ -27,9 +27,10 @@ constexpr std::string_view GIT_COMMIT_HASH = "@GIT_COMMIT_HASH@"; constexpr std::string_view GIT_CLEAN_STATUS = "@GIT_CLEAN_STATUS@"; constexpr std::string_view GIT_COMMIT_DESCRIPTION = R"message(@GIT_COMMIT_DESCRIPTION@)message"; -constexpr std::string_view GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; +constexpr std::string_view 
GIT_COMMIT_DATE = "@GIT_COMMIT_DATE@"; +constexpr std::string_view BENCHMARK_VERSION = "@BENCHMARK_VERSION@"; } // namespace Impl } // namespace KokkosKernels -#endif +#endif // KOKKOSKERNELS_VERSION_INFO_HPP diff --git a/cmake/kokkoskernels_benchmarks.cmake b/cmake/kokkoskernels_benchmarks.cmake index 1bceb9ea81..7bb262247d 100644 --- a/cmake/kokkoskernels_benchmarks.cmake +++ b/cmake/kokkoskernels_benchmarks.cmake @@ -17,9 +17,10 @@ ELSE() # Note: recent bug (google/benchmark#1441) is preventing us from using # the latest benchmark release. + SET(BENCHMARK_VERSION 1.6.2) FetchContent_Declare( googlebenchmark - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz + URL https://github.com/google/benchmark/archive/refs/tags/v${BENCHMARK_VERSION}.tar.gz URL_HASH MD5=14d14849e075af116143a161bc3b927b ) FetchContent_MakeAvailable(googlebenchmark) diff --git a/cmake/kokkoskernels_git_info.cmake b/cmake/kokkoskernels_version_info.cmake similarity index 94% rename from cmake/kokkoskernels_git_info.cmake rename to cmake/kokkoskernels_version_info.cmake index 86d126591a..33c7c222e6 100644 --- a/cmake/kokkoskernels_git_info.cmake +++ b/cmake/kokkoskernels_version_info.cmake @@ -87,12 +87,14 @@ FUNCTION(check_git_version) ENDIF() ENDFUNCTION() -FUNCTION(check_git_setup) +# Pass BENCHMARK_VERSION variable to configure benchmark library version +FUNCTION(check_version_info) add_custom_target( AlwaysCheckGit COMMAND ${CMAKE_COMMAND} -DRUN_CHECK_GIT_VERSION=1 -DKOKKOSKERNELS_TOP_SOURCE_DIR=${KOKKOSKERNELS_TOP_SOURCE_DIR} - -P ${CURRENT_LIST_DIR}/kokkoskernels_git_info.cmake + -DBENCHMARK_VERSION=${BENCHMARK_VERSION} + -P ${CURRENT_LIST_DIR}/kokkoskernels_version_info.cmake BYPRODUCTS ${post_configure_file}) add_dependencies(kokkoskernels AlwaysCheckGit) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index cf987c222c..7946126e57 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -67,19 +67,23 @@ 
void add_kokkos_configuration(bool verbose) { } } -inline void add_git_info() { - if (!KokkosKernels::Impl::GIT_BRANCH.empty()) { - benchmark::AddCustomContext("GIT_BRANCH", - std::string(KokkosKernels::Impl::GIT_BRANCH)); - benchmark::AddCustomContext( - "GIT_COMMIT_HASH", std::string(KokkosKernels::Impl::GIT_COMMIT_HASH)); - benchmark::AddCustomContext( - "GIT_CLEAN_STATUS", std::string(KokkosKernels::Impl::GIT_CLEAN_STATUS)); - benchmark::AddCustomContext( - "GIT_COMMIT_DESCRIPTION", - std::string(KokkosKernels::Impl::GIT_COMMIT_DESCRIPTION)); - benchmark::AddCustomContext( - "GIT_COMMIT_DATE", std::string(KokkosKernels::Impl::GIT_COMMIT_DATE)); +inline void add_version_info() { + using namespace KokkosKernels::Impl; + + if (!GIT_BRANCH.empty()) { + benchmark::AddCustomContext("GIT_BRANCH", std::string(GIT_BRANCH)); + benchmark::AddCustomContext("GIT_COMMIT_HASH", + std::string(GIT_COMMIT_HASH)); + benchmark::AddCustomContext("GIT_CLEAN_STATUS", + std::string(GIT_CLEAN_STATUS)); + benchmark::AddCustomContext("GIT_COMMIT_DESCRIPTION", + std::string(GIT_COMMIT_DESCRIPTION)); + benchmark::AddCustomContext("GIT_COMMIT_DATE", + std::string(GIT_COMMIT_DATE)); + } + if (!BENCHMARK_VERSION.empty()) { + benchmark::AddCustomContext("GOOGLE_BENCHMARK_VERSION", + std::string(BENCHMARK_VERSION)); } } @@ -88,7 +92,7 @@ void add_benchmark_context(bool verbose = false) { // Add Kokkos configuration to benchmark context data add_kokkos_configuration(verbose); - add_git_info(); + add_version_info(); } } // namespace KokkosKernelsBenchmark From d2f9e011342ee12d1b80f2180c7d300c629afe1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 17:38:08 +0100 Subject: [PATCH 161/442] Mark functions as inline where appropriate --- perf_test/Benchmark_Context.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 7946126e57..07ba07202f 100644 --- 
a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -33,7 +33,7 @@ namespace KokkosKernelsBenchmark { /// \brief Remove unwanted spaces and colon signs from input string. In case of /// invalid input it will return an empty string. -std::string remove_unwanted_characters(std::string str) { +inline std::string remove_unwanted_characters(std::string str) { auto from = str.find_first_not_of(" :"); auto to = str.find_last_not_of(" :"); @@ -47,7 +47,7 @@ std::string remove_unwanted_characters(std::string str) { /// \brief Extract all key:value pairs from kokkos configuration and add it to /// the benchmark context -void add_kokkos_configuration(bool verbose) { +inline void add_kokkos_configuration(bool verbose) { std::ostringstream msg; Kokkos::print_configuration(msg, verbose); KokkosKernels::print_configuration(msg); @@ -88,7 +88,7 @@ inline void add_version_info() { } /// \brief Gather all context information and add it to benchmark context data -void add_benchmark_context(bool verbose = false) { +inline void add_benchmark_context(bool verbose = false) { // Add Kokkos configuration to benchmark context data add_kokkos_configuration(verbose); From 252fbf8a2603dc7f9c523f71e431a38e1fcffbe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 17:41:27 +0100 Subject: [PATCH 162/442] Clarify comments for context helper functions --- perf_test/Benchmark_Context.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 07ba07202f..16a7d4c4e8 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -67,6 +67,8 @@ inline void add_kokkos_configuration(bool verbose) { } } +/// \brief Add Kokkos Kernels git info and google benchmark release to +/// benchmark context. 
inline void add_version_info() { using namespace KokkosKernels::Impl; @@ -87,11 +89,9 @@ inline void add_version_info() { } } -/// \brief Gather all context information and add it to benchmark context data +/// \brief Gather all context information and add it to benchmark context inline void add_benchmark_context(bool verbose = false) { - // Add Kokkos configuration to benchmark context data add_kokkos_configuration(verbose); - add_version_info(); } From 38789c2cc7521076496fb298f4c864e2eb479c42 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Wed, 22 Mar 2023 13:03:45 -0600 Subject: [PATCH 163/442] add ability to generate compile_commands.json for clangd --- .gitignore | 5 +++++ cm_generate_makefile.bash | 13 +++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index fa032cb2cb..d64726e92e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,8 @@ .project *.o TAGS + +#Clangd indexing +compile_commands.json +.cache/ +.vscode/ \ No newline at end of file diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 3eab04694a..939be4aab3 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -367,8 +367,8 @@ display_help_text() { echo "--disable-perftests: Do not build Kokkos Kernels performance tests" echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" - echo "--enable-perfsuite: build Kokkos Kernels performance tests with -RAJAPerf Suite" + echo "--enable-perfsuite: build Kokkos Kernels performance tests with RAJAPerf Suite" + echo "--export-compile-commands: export cmake compile_commands.json file" } @@ -382,6 +382,8 @@ KOKKOSKERNELS_DO_PERFTESTS=ON KOKKOSKERNELS_DO_PERFSUITE=OFF KOKKOSKERNELS_DO_EXAMPLES=ON +CMAKE_EXPORT_COMPILE_COMMANDS=OFF + #Build static libraries by default BUILD_SHARED_LIBRARIES=OFF @@ -543,6 +545,9 @@ do # This is the default KOKKOSKERNELS_DO_TESTS=ON ;; + 
--export-compile-commands) + CMAKE_EXPORT_COMPILE_COMMANDS=ON + ;; --enable-perfsuite) KOKKOSKERNELS_DO_PERFSUITE=ON ;; @@ -816,6 +821,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} 
${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} 
${KOKKOSKERNELS_PATH} From 943cfc6bbbb1864dd7dd5784e807ce9e6067fb0f Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Wed, 22 Mar 2023 13:04:51 -0600 Subject: [PATCH 164/442] add access to inv permutations to mdf handle --- sparse/src/KokkosSparse_mdf_handle.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index 6f6f2658be..189bccfb18 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -89,6 +89,7 @@ struct MDF_handle { } col_ind_type get_permutation() { return permutation; } + col_ind_type get_permutation_inv() { return permutation_inv; } void sort_factors() { KokkosSparse::sort_crs_matrix(L); From 5595b4a92284aa615f09dd3fdc18f325142c2228 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 14 Mar 2023 11:17:01 -0600 Subject: [PATCH 165/442] Leverage std library in BsrMatrix constructor --- sparse/src/KokkosSparse_BsrMatrix.hpp | 198 +++++++++++++++----------- 1 file changed, 111 insertions(+), 87 deletions(-) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index ea4e50a8fe..89d0fcc65a 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -451,12 +451,12 @@ class BsrMatrix { } } - /// \brief Constructor that copies raw arrays of host data in - /// coordinate format. + /// \brief Construct BsrMatrix from host data in COO format. /// - /// On input, each entry of the sparse matrix is stored in val[k], - /// with row index rows[k] and column index cols[k]. We assume that - /// the entries are sorted in increasing order by row index. + /// The COO matrix must already have a block structure. + /// Each entry k of the input sparse matrix has a value stored in val[k], + /// row index in rows[k] and column index in cols[k]. 
+ /// The COO data must be sorted by increasing row index /// /// This constructor is mainly useful for benchmarking or for /// reading the sparse matrix's data from a file. @@ -465,19 +465,19 @@ class BsrMatrix { /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. /// \param annz [in] The number of entries. - /// \param val [in] The entries. - /// \param rows [in] The row indices. rows[k] is the row index of + /// \param vals [in] The entries. + /// \param rows [in] The row indices. rows[k] is the row index of /// val[k]. - /// \param cols [in] The column indices. cols[k] is the column + /// \param cols [in] The column indices. cols[k] is the column /// index of val[k]. - /// \param blockdim [in] The block dimensions. + /// \param blockdim [in] The block size of the constructed BsrMatrix. /// \param pad [in] If true, pad the sparse matrix's storage with /// zeros in order to improve cache alignment and / or /// vectorization. /// /// The \c pad argument is currently not used. BsrMatrix(const std::string& label, OrdinalType nrows, OrdinalType ncols, - size_type annz, ScalarType* val, OrdinalType* rows, + size_type annz, ScalarType* vals, OrdinalType* rows, OrdinalType* cols, OrdinalType blockdim, bool pad = false) { (void)label; (void)pad; @@ -496,93 +496,117 @@ class BsrMatrix { assert((nrows % blockDim_ == 0) && "BsrMatrix: input CrsMatrix rows is not a multiple of block size"); } + if (annz % (blockDim_ * blockDim_)) { + throw std::runtime_error( + "BsrMatrix:: annz should be a multiple of the number of entries in a " + "block"); + } - numCols_ = ncols / blockDim_; - ordinal_type tmp_num_rows = nrows / blockDim_; - - // - // Wrap the raw pointers in unmanaged host Views - // Note that the inputs are in coordinate format. - // So unman_rows and unman_cols have the same type. 
- // - typename values_type::HostMirror unman_val(val, annz); - typename index_type::HostMirror unman_rows(rows, annz); - typename index_type::HostMirror unman_cols(cols, annz); - - typename row_map_type::non_const_type tmp_row_map( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "rowmap"), - tmp_num_rows + 1); - auto row_map_host = Kokkos::create_mirror_view(tmp_row_map); - Kokkos::deep_copy(row_map_host, 0); - - if (annz > 0) { - ordinal_type iblock = 0; - std::set set_blocks; - for (size_type ii = 0; ii <= annz; ++ii) { - if ((ii == annz) || ((unman_rows(ii) / blockDim_) > iblock)) { - // Flush the stored entries - row_map_host(iblock + 1) = set_blocks.size(); - if (ii == annz) break; - set_blocks.clear(); - iblock = unman_rows(ii) / blockDim_; - } - ordinal_type tmp_jblock = unman_cols(ii) / blockDim_; - set_blocks.insert(tmp_jblock); + using Coord = std::pair; // row, col + using CoordComp = std::function; // type that can order Coords + using Entry = std::pair; // (row, col), val + using Blocks = std::map, + CoordComp>; // map a block to its non-zeros, sorted + // by row, then col + + numCols_ = ncols / blockDim_; + ordinal_type numRows = nrows / blockDim_; + size_type numBlocks = annz / (blockDim_ * blockDim_); + + // device data + typename row_map_type::non_const_type row_map_device( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_device"), + numRows + 1); + index_type entries_device("entries_device", numBlocks); + Kokkos::resize(values, annz); + + // mirror views on host + auto row_map_host = Kokkos::create_mirror_view(row_map_device); + auto entries_host = Kokkos::create_mirror_view(entries_device); + auto values_host = Kokkos::create_mirror_view(values); + + auto coord_by_row_col = [](const Coord& a, const Coord& b) { + const auto& arow = std::get<0>(a); + const auto& brow = std::get<0>(b); + const auto& acol = std::get<1>(a); + const auto& bcol = std::get<1>(b); + if (arow < brow) { + return true; + } else if (arow > brow) { + return false; 
+ } else { + return acol < bcol; + } + }; + + auto entry_by_row_col = [coord_by_row_col](const Entry& a, const Entry& b) { + return coord_by_row_col(std::get<0>(a), std::get<0>(b)); + }; + + // organize all blocks and their entries + Blocks blocks(coord_by_row_col); + for (size_type i = 0; i < annz; ++i) { + const ordinal_type row = rows[i]; + const ordinal_type col = cols[i]; + const ScalarType val = vals[i]; + const Coord block = Coord(row / blockDim_, col / blockDim_); + const Entry entry(Coord(row, col), val); + + // add entry to the correct block + auto it = blocks.find(block); + if (it == blocks.end()) { + std::vector entries = {entry}; + entries.reserve(blockDim_ * blockDim_); + blocks[block] = std::move(entries); // new block with entry + } else { + it->second.push_back(entry); // add entry to block } } - for (size_type ii = 0; ii < annz; ++ii) - row_map_host(ii + 1) += row_map_host(ii); - - Kokkos::deep_copy(tmp_row_map, row_map_host); - - // Create temporary Views for row_map and entries - // because the StaticCrsGraph ctor requires View inputs - index_type tmp_entries("tmp_entries", row_map_host(tmp_num_rows)); - auto tmp_entries_host = Kokkos::create_mirror_view(tmp_entries); - - Kokkos::resize(values, row_map_host(tmp_num_rows) * blockDim_ * blockDim_); - auto values_host = Kokkos::create_mirror_view(values); - Kokkos::deep_copy(values_host, 0); - - if (annz > 0) { - //--- Fill tmp_entries - ordinal_type cur_block = 0; - std::set set_blocks; - for (size_type ii = 0; ii <= annz; ++ii) { - if ((ii == annz) || ((unman_rows(ii) / blockDim_) > cur_block)) { - // Flush the stored entries - ordinal_type ipos = row_map_host(cur_block); - for (auto jblock : set_blocks) tmp_entries_host(ipos++) = jblock; - if (ii == annz) break; - set_blocks.clear(); - cur_block = unman_rows(ii) / blockDim_; - } - ordinal_type tmp_jblock = unman_cols(ii) / blockDim_; - set_blocks.insert(tmp_jblock); + // write block data out to BSR format + ordinal_type row = 0; // current row 
we're in + size_t bi = 0; // how many blocks so far + for (auto& kv : blocks) { // iterating through blocks in row/col order + const Coord& block = kv.first; // block's position + auto& entries = kv.second; // non-zeros in the block + + if (OrdinalType(entries.size()) != blockDim_ * blockDim_) { + std::stringstream ss; + ss << "BsrMatrix: block " << block.first << "," << block.second + << " had only " << entries.size() << " non-zeros, expected " + << blockDim_ * blockDim_; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); } - //--- Fill numerical values - for (size_type ii = 0; ii < annz; ++ii) { - const auto ilocal = unman_rows(ii) % blockDim_; - const auto jblock = unman_cols(ii) / blockDim_; - const auto jlocal = unman_cols(ii) % blockDim_; - for (auto jj = row_map_host(jblock); jj < row_map_host(jblock + 1); - ++jj) { - if (tmp_entries_host(jj) == jblock) { - const auto shift = - jj * blockDim_ * blockDim_ + ilocal * blockDim_ + jlocal; - values_host(shift) = unman_val(ii); - break; - } - } + + // update row-map if block is in a new row + for (; row < block.first; ++row) { + row_map_host(row + 1) = bi; // `row` ends at bi + } + + // record column of block + entries_host(bi) = block.second; // block's column + + // add contiguous entries of block sorted by row/col + std::sort(entries.begin(), entries.end(), entry_by_row_col); + for (size_type ei = 0; ei < size_type(entries.size()); ++ei) { + values_host(bi * blockDim_ * blockDim_ + ei) = std::get<1>(entries[ei]); } + + // next block + ++bi; + } + // complete row map if last blocks are empty + for (; row < numRows; ++row) { + row_map_host(row + 1) = bi; } - Kokkos::deep_copy(tmp_entries, tmp_entries_host); + // move graph data to the requested device + Kokkos::deep_copy(row_map_device, row_map_host); + Kokkos::deep_copy(entries_device, entries_host); Kokkos::deep_copy(values, values_host); - // Initialize graph using the temp entries and row_map Views - graph = staticcrsgraph_type(tmp_entries, 
tmp_row_map); + graph = staticcrsgraph_type(entries_device, row_map_device); } /// \brief Constructor that accepts a row map, column indices, and From 11d442b51eb677cf5372538ee8dd44d5823cb111 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Fri, 24 Mar 2023 10:10:12 -0600 Subject: [PATCH 166/442] Deprecate Kokkos::Details::ArithTraits (#1748) * Deprecate Kokkos::Details::ArithTraits Use Kokkos::ArithTraits instead * Don't refer to "ArithTraits" inside Kokkos::Details --- batched/KokkosBatched_Util.hpp | 9 +- .../impl/KokkosBatched_AddRadial_Internal.hpp | 6 +- ...tched_ApplyHouseholder_Serial_Internal.hpp | 4 +- ...d_ApplyHouseholder_TeamVector_Internal.hpp | 8 +- ...hed_Eigendecomposition_Serial_Internal.hpp | 2 +- ...kkosBatched_Eigenvalue_Serial_Internal.hpp | 2 +- .../KokkosBatched_Francis_Serial_Internal.hpp | 4 +- .../KokkosBatched_Givens_Serial_Internal.hpp | 7 +- ...HessenbergQR_WithShift_Serial_Internal.hpp | 2 +- ...kosBatched_Householder_Serial_Internal.hpp | 7 +- ...atched_Householder_TeamVector_Internal.hpp | 7 +- .../impl/KokkosBatched_LU_Serial_Internal.hpp | 2 +- .../impl/KokkosBatched_LU_Team_Internal.hpp | 2 +- ...ftEigenvectorFromSchur_Serial_Internal.hpp | 2 +- .../impl/KokkosBatched_Normalize_Internal.hpp | 8 +- ...htEigenvectorFromSchur_Serial_Internal.hpp | 2 +- ...KokkosBatched_Schur2x2_Serial_Internal.hpp | 2 +- .../KokkosBatched_Schur_Serial_Internal.hpp | 2 +- .../KokkosBatched_Trmm_Serial_Internal.hpp | 8 +- .../impl/KokkosBatched_Vector_SIMD_Arith.hpp | 8 +- .../impl/KokkosBatched_Vector_SIMD_Math.hpp | 32 ++--- ...Batched_WilkinsonShift_Serial_Internal.hpp | 18 ++- batched/dense/src/KokkosBatched_Vector.hpp | 2 - .../dense/src/KokkosBatched_Vector_SIMD.hpp | 2 +- .../unit_test/Test_Batched_BatchedGemm.hpp | 2 +- .../unit_test/Test_Batched_SerialAxpy.hpp | 2 +- .../unit_test/Test_Batched_SerialGemm.hpp | 2 +- .../unit_test/Test_Batched_SerialGesv.hpp | 5 +- .../Test_Batched_SerialInverseLU.hpp | 2 +- 
.../dense/unit_test/Test_Batched_SerialLU.hpp | 2 +- .../unit_test/Test_Batched_SerialSolveLU.hpp | 2 +- .../unit_test/Test_Batched_SerialTrmm.hpp | 4 +- .../unit_test/Test_Batched_SerialTrsm.hpp | 2 +- .../unit_test/Test_Batched_SerialTrsv.hpp | 2 +- .../unit_test/Test_Batched_SerialTrtri.hpp | 4 +- .../dense/unit_test/Test_Batched_TeamAxpy.hpp | 2 +- .../dense/unit_test/Test_Batched_TeamGemm.hpp | 2 +- .../dense/unit_test/Test_Batched_TeamGesv.hpp | 5 +- .../unit_test/Test_Batched_TeamInverseLU.hpp | 2 +- .../dense/unit_test/Test_Batched_TeamLU.hpp | 2 +- .../unit_test/Test_Batched_TeamSolveLU.hpp | 2 +- .../dense/unit_test/Test_Batched_TeamTrsm.hpp | 2 +- .../dense/unit_test/Test_Batched_TeamTrsv.hpp | 2 +- .../unit_test/Test_Batched_TeamVectorAxpy.hpp | 2 +- .../unit_test/Test_Batched_TeamVectorGemm.hpp | 2 +- .../unit_test/Test_Batched_TeamVectorGesv.hpp | 5 +- .../unit_test/Test_Batched_TeamVectorQR.hpp | 2 +- ...atched_TeamVectorQR_WithColumnPivoting.hpp | 2 +- .../Test_Batched_TeamVectorSolveUTV.hpp | 2 +- .../Test_Batched_TeamVectorSolveUTV2.hpp | 2 +- .../unit_test/Test_Batched_TeamVectorUTV.hpp | 2 +- .../Test_Batched_VectorArithmatic.hpp | 8 +- .../unit_test/Test_Batched_VectorLogical.hpp | 2 +- .../unit_test/Test_Batched_VectorMath.hpp | 4 +- .../unit_test/Test_Batched_VectorMisc.hpp | 2 +- .../unit_test/Test_Batched_VectorRelation.hpp | 2 +- .../unit_test/Test_Batched_VectorView.hpp | 16 +-- .../impl/KokkosBatched_CG_TeamVector_Impl.hpp | 6 +- .../impl/KokkosBatched_CG_Team_Impl.hpp | 6 +- .../impl/KokkosBatched_GMRES_Serial_Impl.hpp | 4 +- .../KokkosBatched_GMRES_TeamVector_Impl.hpp | 4 +- .../impl/KokkosBatched_GMRES_Team_Impl.hpp | 4 +- .../impl/KokkosBatched_Spmv_Serial_Impl.hpp | 6 +- .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 6 +- .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 6 +- .../sparse/src/KokkosBatched_CrsMatrix.hpp | 19 ++- .../sparse/src/KokkosBatched_JacobiPrec.hpp | 13 +- .../src/KokkosBatched_Krylov_Handle.hpp | 2 +- 
batched/sparse/src/KokkosBatched_Spmv.hpp | 16 +-- .../unit_test/Test_Batched_SerialGMRES.hpp | 7 +- .../unit_test/Test_Batched_SerialSpmv.hpp | 2 +- .../sparse/unit_test/Test_Batched_TeamCG.hpp | 7 +- .../unit_test/Test_Batched_TeamGMRES.hpp | 7 +- .../unit_test/Test_Batched_TeamSpmv.hpp | 2 +- .../unit_test/Test_Batched_TeamVectorCG.hpp | 7 +- .../Test_Batched_TeamVectorGMRES.hpp | 7 +- .../unit_test/Test_Batched_TeamVectorSpmv.hpp | 2 +- blas/impl/KokkosBlas1_abs_impl.hpp | 8 +- blas/impl/KokkosBlas1_axpby_impl.hpp | 4 +- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 8 +- blas/impl/KokkosBlas1_axpby_spec.hpp | 8 +- blas/impl/KokkosBlas1_dot_impl.hpp | 2 +- blas/impl/KokkosBlas1_iamax_impl.hpp | 2 +- blas/impl/KokkosBlas1_mult_impl.hpp | 12 +- blas/impl/KokkosBlas1_nrm2_impl.hpp | 7 +- blas/impl/KokkosBlas1_nrm2w_impl.hpp | 7 +- blas/impl/KokkosBlas1_nrminf_impl.hpp | 4 +- blas/impl/KokkosBlas1_reciprocal_impl.hpp | 8 +- blas/impl/KokkosBlas1_scal_impl.hpp | 4 +- blas/impl/KokkosBlas1_scal_mv_impl.hpp | 8 +- blas/impl/KokkosBlas1_scal_spec.hpp | 6 +- blas/impl/KokkosBlas1_sum_impl.hpp | 2 +- blas/impl/KokkosBlas1_team_abs_spec.hpp | 4 +- blas/impl/KokkosBlas1_team_dot_spec.hpp | 2 +- blas/impl/KokkosBlas1_team_nrm2_spec.hpp | 6 +- blas/impl/KokkosBlas1_update_impl.hpp | 4 +- blas/impl/KokkosBlas1_update_spec.hpp | 12 +- blas/impl/KokkosBlas2_gemv_impl.hpp | 71 +++++----- ...osBlas2_serial_gemv_inner_multiple_dot.hpp | 2 +- blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp | 4 +- blas/impl/KokkosBlas3_gemm_impl.hpp | 16 +-- blas/impl/KokkosBlas3_trsm_impl.hpp | 4 +- blas/src/KokkosBlas1_axpby.hpp | 3 +- blas/src/KokkosBlas1_team_axpby.hpp | 4 +- blas/unit_test/Test_Blas1_abs.hpp | 4 +- blas/unit_test/Test_Blas1_asum.hpp | 2 +- blas/unit_test/Test_Blas1_iamax.hpp | 8 +- blas/unit_test/Test_Blas1_nrm1.hpp | 2 +- blas/unit_test/Test_Blas1_nrm2.hpp | 11 +- blas/unit_test/Test_Blas1_nrm2_squared.hpp | 4 +- blas/unit_test/Test_Blas1_nrminf.hpp | 9 +- 
blas/unit_test/Test_Blas1_reciprocal.hpp | 2 +- blas/unit_test/Test_Blas1_scal.hpp | 4 +- blas/unit_test/Test_Blas1_serial_setscal.hpp | 2 +- blas/unit_test/Test_Blas1_team_abs.hpp | 4 +- blas/unit_test/Test_Blas1_team_axpby.hpp | 2 +- blas/unit_test/Test_Blas1_team_nrm2.hpp | 5 +- blas/unit_test/Test_Blas1_team_scal.hpp | 4 +- blas/unit_test/Test_Blas1_team_setscal.hpp | 2 +- blas/unit_test/Test_Blas3_gemm.hpp | 8 +- blas/unit_test/Test_Blas3_trmm.hpp | 6 +- blas/unit_test/Test_Blas3_trsm.hpp | 4 +- blas/unit_test/Test_Blas_gesv.hpp | 4 +- blas/unit_test/Test_Blas_trtri.hpp | 4 +- common/src/KokkosKernels_SimpleUtils.hpp | 10 +- common/src/KokkosKernels_Utils.hpp | 2 +- common/src/Kokkos_ArithTraits.hpp | 14 +- common/src/Kokkos_InnerProductSpaceTraits.hpp | 32 ++--- common/unit_test/Test_Common_ArithTraits.hpp | 124 ++++++++---------- example/batched_solve/team_GMRES.cpp | 3 +- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 2 +- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 2 +- .../sparse/CG/KokkosBatched_Test_CG.cpp | 5 +- .../sparse/GMRES/KokkosBatched_Test_GMRES.cpp | 5 +- .../sparse/SPMV/KokkosBatched_SPMV_View.hpp | 2 +- .../sparse/SPMV/KokkosBatched_Test_SPMV.cpp | 2 +- .../KokkosBatched_Test_cusolverDn.cpp | 2 +- .../KokkosBatched_Test_cusolverSp.cpp | 2 +- perf_test/sparse/KokkosSparse_sptrsv_aux.hpp | 2 +- .../sparse/KokkosSparse_sptrsv_cholmod.cpp | 2 +- .../sparse/KokkosSparse_sptrsv_superlu.cpp | 2 +- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 2 +- perf_test/sparse/spmv/Kokkos_SPMV.hpp | 2 +- .../sparse/spmv/Kokkos_SPMV_Inspector.hpp | 2 +- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 34 ++--- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 47 ++++--- ...kkosSparse_getDiagCopyWithOffsets_impl.hpp | 2 +- .../impl/KokkosSparse_sor_sequential_impl.hpp | 10 +- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 8 +- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 9 +- sparse/impl/KokkosSparse_spmv_impl.hpp | 40 +++--- 
sparse/impl/KokkosSparse_spmv_spec.hpp | 4 +- sparse/impl/KokkosSparse_spmv_struct_impl.hpp | 27 ++-- sparse/impl/KokkosSparse_spmv_struct_spec.hpp | 6 +- sparse/impl/KokkosSparse_trsv_impl.hpp | 12 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 8 +- sparse/src/KokkosSparse_BsrMatrix.hpp | 10 +- sparse/src/KokkosSparse_IOUtils.hpp | 3 +- sparse/unit_test/Test_Sparse_Utils.hpp | 2 +- .../Test_Sparse_block_gauss_seidel.hpp | 14 +- sparse/unit_test/Test_Sparse_bspgemm.hpp | 2 +- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 45 ++++--- .../unit_test/Test_Sparse_replaceSumInto.hpp | 6 +- .../Test_Sparse_replaceSumIntoLonger.hpp | 8 +- .../unit_test/Test_Sparse_spgemm_jacobi.hpp | 2 +- sparse/unit_test/Test_Sparse_spiluk.hpp | 2 +- test_common/KokkosKernels_TestUtils.hpp | 10 +- 167 files changed, 579 insertions(+), 646 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index acfd5cab68..f6b08764c9 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -69,13 +69,12 @@ struct is_vector : public std::false_type {}; template struct is_same_mag_type { - static const bool is_specialized = - (Kokkos::Details::ArithTraits::is_specialized && - Kokkos::Details::ArithTraits::is_specialized); + static const bool is_specialized = (Kokkos::ArithTraits::is_specialized && + Kokkos::ArithTraits::is_specialized); static const bool is_mag_type_same = - std::is_same::mag_type, - typename Kokkos::Details::ArithTraits::mag_type>::value; + std::is_same::mag_type, + typename Kokkos::ArithTraits::mag_type>::value; static const bool value = is_specialized && is_mag_type_same; }; diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp index cd90657e30..24ecafe0a0 100644 --- a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp @@ -38,8 +38,7 @@ struct SerialAddRadialInternal { #endif for 
(int i = 0; i < m; ++i) { // const auto a_real = RealPart(A[i*as]); - const auto a_real = - Kokkos::Details::ArithTraits::real(A[i * as]); + const auto a_real = Kokkos::ArithTraits::real(A[i * as]); A[i * as] += ValueType(minus_abs_tiny) * ValueType(a_real < 0); A[i * as] += ValueType(abs_tiny) * ValueType(a_real >= 0); } @@ -62,8 +61,7 @@ struct TeamAddRadialInternal { Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { // const auto a_real = RealPart(A[i*as]); - const auto a_real = - Kokkos::Details::ArithTraits::real(A[i * as]); + const auto a_real = Kokkos::ArithTraits::real(A[i * as]); A[i * as] += ValueType(minus_abs_tiny) * ValueType(a_real < 0); A[i * as] += ValueType(abs_tiny) * ValueType(a_real >= 0); }); diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index da8e989b47..611e9440b5 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -56,7 +56,7 @@ struct SerialApplyLeftHouseholderInternal { for (int j = 0; j < n; ++j) { value_type tmp = a1t[j * a1ts]; for (int i = 0; i < m; ++i) - tmp += Kokkos::Details::ArithTraits::conj(u2[i * u2s]) * + tmp += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; w1t[j] = tmp * inv_tau; // /= (*tau); } @@ -109,7 +109,7 @@ struct SerialApplyRightHouseholderInternal { for (int j = 0; j < n; ++j) for (int i = 0; i < m; ++i) A2[i * as0 + j * as1] -= - w1[i] * Kokkos::Details::ArithTraits::conj(u2[j * u2s]); + w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); return 0; } diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp index 4d513fcf3d..2754818fbf 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp +++ 
b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp @@ -59,7 +59,7 @@ struct TeamVectorApplyLeftHouseholderInternal { Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(member, m), [&](const int &i, value_type &val) { - val += Kokkos::Details::ArithTraits::conj(u2[i * u2s]) * + val += Kokkos::ArithTraits::conj(u2[i * u2s]) * A2[i * as0 + j * as1]; }, tmp); @@ -146,8 +146,7 @@ struct TeamVectorApplyRightHouseholderInternal { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, m), [&](const int &i) { A2[i * as0 + j * as1] -= - w1[i] * Kokkos::Details::ArithTraits::conj( - u2[j * u2s]); + w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); }); }); } else { @@ -156,8 +155,7 @@ struct TeamVectorApplyRightHouseholderInternal { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, m), [&](const int &i) { A2[i * as0 + j * as1] -= - w1[i] * Kokkos::Details::ArithTraits::conj( - u2[j * u2s]); + w1[i] * Kokkos::ArithTraits::conj(u2[j * u2s]); }); }); } diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index f89b76e162..c857de19c2 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -72,7 +72,7 @@ struct SerialEigendecompositionInternal { "Serial eigendecomposition on device and/or without LAPACK " "is not implemented yet"); // typedef RealType real_type; - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; // const real_type one(1), zero(0), tol = 1e2*ats::epsilon(); // //const Kokkos::pair identity(one, zero); diff --git a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp index ed2e442342..ae4cf10634 100644 --- a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp +++ 
b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp @@ -68,7 +68,7 @@ struct SerialEigenvalueInternal { const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const real_type zero(0), nan(ats::nan()), tol = 1e2 * ats::epsilon(); const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration; diff --git a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp index ba4bc0ed9c..21587f4481 100644 --- a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp @@ -82,9 +82,9 @@ struct SerialFrancisInternal { } else { const value_type val = H[(m - 1) * hs]; const auto dist_lambda1 = - Kokkos::Details::ArithTraits::abs(lambda1.real() - val); + Kokkos::ArithTraits::abs(lambda1.real() - val); const auto dist_lambda2 = - Kokkos::Details::ArithTraits::abs(lambda2.real() - val); + Kokkos::ArithTraits::abs(lambda2.real() - val); const value_type lambda = dist_lambda1 < dist_lambda2 ? lambda1.real() : lambda2.real(); s = 2 * lambda; diff --git a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp index f20e754010..4d80c6a250 100644 --- a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp @@ -54,13 +54,12 @@ struct SerialGivensInternal { } else { // here we do not care overflow caused by the division although it is // probable.... 
- r = Kokkos::Details::ArithTraits::sqrt(chi1 * chi1 + - chi2 * chi2); + r = Kokkos::ArithTraits::sqrt(chi1 * chi1 + chi2 * chi2); cs = chi1 / r; sn = chi2 / r; - if (Kokkos::Details::ArithTraits::abs(chi1) > - Kokkos::Details::ArithTraits::abs(chi2) && + if (Kokkos::ArithTraits::abs(chi1) > + Kokkos::ArithTraits::abs(chi2) && cs < zero) { cs = -cs; sn = -sn; diff --git a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp index eba6cdfc59..3d2b75e64d 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp @@ -37,7 +37,7 @@ struct SerialHessenbergQR_WithShiftInternal { /* */ ValueType *HH, const int hs0, const int hs1, const ValueType shift, /* */ Kokkos::pair *GG, const bool request_schur) { typedef ValueType value_type; - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; const int hs = hs0 + hs1; const value_type zero(0), one(1); diff --git a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp index bf2bd7d954..05654a2f37 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp @@ -35,7 +35,7 @@ struct SerialLeftHouseholderInternal { /* */ ValueType* x2, const int x2s, /* */ ValueType* tau) { typedef ValueType value_type; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; const mag_type zero(0); const mag_type half(0.5); @@ -58,11 +58,10 @@ struct SerialLeftHouseholderInternal { } /// compute magnitude of chi1, equal to norm2 of chi1 - const mag_type norm_chi1 = - Kokkos::Details::ArithTraits::abs(*chi1); + const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// 
compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::Details::ArithTraits::sqrt( + const mag_type norm_x = Kokkos::ArithTraits::sqrt( norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp index 40cc0714e3..64fe24fa31 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -36,7 +36,7 @@ struct TeamVectorLeftHouseholderInternal { /* */ ValueType *x2, const int x2s, /* */ ValueType *tau) { typedef ValueType value_type; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; const mag_type zero(0); const mag_type half(0.5); @@ -64,11 +64,10 @@ struct TeamVectorLeftHouseholderInternal { } /// compute magnitude of chi1, equal to norm2 of chi1 - const mag_type norm_chi1 = - Kokkos::Details::ArithTraits::abs(*chi1); + const mag_type norm_chi1 = Kokkos::ArithTraits::abs(*chi1); /// compute 2 norm of x using norm_chi1 and norm_x2 - const mag_type norm_x = Kokkos::Details::ArithTraits::sqrt( + const mag_type norm_x = Kokkos::ArithTraits::sqrt( norm_x2_square + norm_chi1 * norm_chi1); /// compute alpha diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp index 4b9c215aba..e6b34d8f1b 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp @@ -62,7 +62,7 @@ KOKKOS_INLINE_FUNCTION int SerialLU_Internal::invoke( if (tiny != 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; const auto alpha11_real = - Kokkos::Details::ArithTraits::real(alpha11_reference); + Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); 
alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp index a5a033b451..cbc811de5e 100644 --- a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp @@ -68,7 +68,7 @@ KOKKOS_INLINE_FUNCTION int TeamLU_Internal::invoke( if (member.team_rank() == 0) { ValueType &alpha11_reference = A[p * as0 + p * as1]; const auto alpha11_real = - Kokkos::Details::ArithTraits::real(alpha11_reference); + Kokkos::ArithTraits::real(alpha11_reference); alpha11_reference += minus_abs_tiny * ValueType(alpha11_real < 0); alpha11_reference += abs_tiny * ValueType(alpha11_real >= 0); } diff --git a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp index 9a4ce3378d..ea87217a37 100644 --- a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp @@ -52,7 +52,7 @@ struct SerialLeftEigenvectorFromSchurInternal { /* */ ValueType *w, const int *blks) { typedef ValueType value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; typedef Kokkos::complex complex_type; diff --git a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp index e2a7016422..42adf8eeba 100644 --- a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp @@ -31,7 +31,7 @@ struct SerialNormalizeInternal { /* */ ValueType *KOKKOS_RESTRICT v, const int vs) { typedef ValueType value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; mag_type norm(0); @@ -42,7 
+42,7 @@ struct SerialNormalizeInternal { const auto v_at_i = v[i * vs]; norm += ats::real(v_at_i * ats::conj(v_at_i)); } - norm = Kokkos::Details::ArithTraits::sqrt(norm); + norm = Kokkos::ArithTraits::sqrt(norm); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif @@ -58,7 +58,7 @@ struct SerialNormalizeInternal { /* */ RealType *KOKKOS_RESTRICT vi, const int vis) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; mag_type norm(0); @@ -70,7 +70,7 @@ struct SerialNormalizeInternal { const auto vi_at_i = vi[i * vis]; norm += vr_at_i * vr_at_i + vi_at_i * vi_at_i; } - norm = Kokkos::Details::ArithTraits::sqrt(norm); + norm = Kokkos::ArithTraits::sqrt(norm); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif diff --git a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp index 2eeb3ccbed..4716506064 100644 --- a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp @@ -52,7 +52,7 @@ struct SerialRightEigenvectorFromSchurInternal { /* */ ValueType *w, const int *blks) { typedef ValueType value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; typedef Kokkos::complex complex_type; diff --git a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp index 9e305186df..22a599ed58 100644 --- a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp @@ -37,7 +37,7 @@ struct SerialSchur2x2Internal { Kokkos::complex* lambda2, bool* is_complex) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + 
typedef Kokkos::ArithTraits ats; const real_type zero(0), one(1), half(0.5), minus_one(-1); /// compute G = [ gamma -sigma; /// sigma gamma ]; diff --git a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp index 2ff19975fc..c7f35d5c4f 100644 --- a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp @@ -76,7 +76,7 @@ struct SerialSchurInternal { const bool restart = false, const int user_max_iteration = -1) { typedef RealType real_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const real_type /* one(1), */ zero(0), tol = 1e2 * ats::epsilon(); const int max_iteration = user_max_iteration < 0 ? 300 : user_max_iteration; if (wlen < m * 5) diff --git a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp index e08089593a..3e4024974b 100644 --- a/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp @@ -77,7 +77,7 @@ SerialTrmmInternalLeftLower::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = am; int right_n = bn; // echo-TODO: See about coniditionally setting conjOp at compile time. @@ -162,7 +162,7 @@ SerialTrmmInternalRightLower::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = bm; int right_n = an; // echo-TODO: See about coniditionally setting conjOp at compile time. 
@@ -248,7 +248,7 @@ SerialTrmmInternalLeftUpper::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = am; int right_n = bn; // echo-TODO: See about coniditionally setting conjOp at compile time. @@ -330,7 +330,7 @@ SerialTrmmInternalRightUpper::invoke( const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, /**/ ValueType *KOKKOS_RESTRICT B, const int bs0, const int bs1) { const ScalarType one(1.0), zero(0.0); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int left_m = bm; int right_n = an; // echo-TODO: See about coniditionally setting conjOp at compile time. diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp index 94f662a0a8..f87492ea5a 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp @@ -152,7 +152,7 @@ template KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator++(Vector, l> &a, int) { Vector, l> a0 = a; - a = a + typename Kokkos::Details::ArithTraits::mag_type(1); + a = a + typename Kokkos::ArithTraits::mag_type(1); return a0; } @@ -160,7 +160,7 @@ template KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( T, l) operator++(Vector, l> &a) { - a = a + typename Kokkos::Details::ArithTraits::mag_type(1); + a = a + typename Kokkos::ArithTraits::mag_type(1); return a; } @@ -355,7 +355,7 @@ template KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_TYPE(T, l) operator--(Vector, l> &a, int) { Vector, l> a0 = a; - a = a - typename Kokkos::Details::ArithTraits::mag_type(1); + a = a - typename Kokkos::ArithTraits::mag_type(1); return a0; } @@ -363,7 +363,7 @@ template 
KOKKOS_FORCEINLINE_FUNCTION static KOKKOSKERNELS_SIMD_ARITH_RETURN_REFERENCE_TYPE( T, l) operator--(Vector, l> &a) { - a = a - typename Kokkos::Details::ArithTraits::mag_type(1); + a = a - typename Kokkos::ArithTraits::mag_type(1); return a; } diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp index 19f4fcb54f..69bbb53c6b 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp @@ -32,7 +32,7 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) sqrt(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -48,7 +48,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) cbrt(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -64,7 +64,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -80,7 +80,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) log10(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -96,7 +96,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION 
static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) exp(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -112,7 +112,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T0, l) pow(const Vector, l> &a, const Vector, l> &b) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_TYPE(T0, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sin(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -156,7 +156,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cos(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -172,7 +172,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tan(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -188,7 +188,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) sinh(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if 
defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -204,7 +204,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) cosh(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -220,7 +220,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) tanh(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -236,7 +236,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) asin(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -252,7 +252,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) acos(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -268,7 +268,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) atan(const Vector, l> &a) { - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep @@ -284,7 +284,7 @@ KOKKOS_INLINE_FUNCTION static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) template KOKKOS_INLINE_FUNCTION 
static KOKKOSKERNELS_SIMD_MATH_RETURN_FLOAT_TYPE(T, l) atan2(const Vector, l> &a, const Vector, l> &b) { - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Vector, l> r_val; #if defined(KOKKOS_ENABLE_PRAGMA_IVDEP) #pragma ivdep diff --git a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp index 6b90b6a962..0d3a9b3df9 100644 --- a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp @@ -52,18 +52,16 @@ struct SerialWilkinsonShiftInternal { if (v < 0) { // complex - const value_type sqrt_v = - Kokkos::Details::ArithTraits::sqrt(-v); - *lambda1 = Kokkos::complex(p, sqrt_v); - *lambda2 = Kokkos::complex(p, -sqrt_v); - *is_complex = true; + const value_type sqrt_v = Kokkos::ArithTraits::sqrt(-v); + *lambda1 = Kokkos::complex(p, sqrt_v); + *lambda2 = Kokkos::complex(p, -sqrt_v); + *is_complex = true; } else { // real - const value_type sqrt_v = - Kokkos::Details::ArithTraits::sqrt(v); - *lambda1 = Kokkos::complex(p + sqrt_v); - *lambda2 = Kokkos::complex(p - sqrt_v); - *is_complex = false; + const value_type sqrt_v = Kokkos::ArithTraits::sqrt(v); + *lambda1 = Kokkos::complex(p + sqrt_v); + *lambda2 = Kokkos::complex(p - sqrt_v); + *is_complex = false; } return 0; } diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index 30b8677d17..23fd62655a 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -251,7 +251,6 @@ struct MagnitudeScalarType>, l>> { // arith traits overload for vector types namespace Kokkos { -namespace Details { // do not use Vector alone as other can use the name. 
@@ -337,7 +336,6 @@ class ArithTraits< } }; -} // namespace Details } // namespace Kokkos #endif diff --git a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index e938198467..e27419e7c2 100644 --- a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -36,7 +36,7 @@ class Vector, l> { public: using type = Vector, l>; using value_type = T; - using mag_type = typename Kokkos::Details::ArithTraits::mag_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; enum : int { vector_length = l }; diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index c4e09d6e68..ac38da8270 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -37,7 +37,7 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, using transB = typename ParamTagType::transB; using batchLayout = typename ParamTagType::batchLayout; using view_layout = typename ViewType::array_layout; - using ats = Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; int ret = 0; auto algo_type = batchedGemmHandle->get_kernel_algo_type(); diff --git a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp index ebefbbabd2..2bde3f7fad 100644 --- a/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialAxpy.hpp @@ -65,7 +65,7 @@ void impl_test_batched_axpy(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; typedef typename ViewType::const_value_type const_value_type; typedef typename alphaViewType::const_value_type alpha_const_value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); diff 
--git a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index c8f745b006..8304657849 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -81,7 +81,7 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using value_type = typename ViewType::value_type; - using ats = Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; /// randomized input testing views ScalarType alpha = ScalarType(1.5); diff --git a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp index f487515a7c..3b17d81d48 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialGesv.hpp @@ -71,10 +71,9 @@ template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using NormViewType = Kokkos::View; diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index c59d1aed1b..d3cbd6c024 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -140,7 +140,7 @@ template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views AViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_SerialLU.hpp 
b/batched/dense/unit_test/Test_Batched_SerialLU.hpp index 335b4ee9bf..23b72893b2 100644 --- a/batched/dense/unit_test/Test_Batched_SerialLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialLU.hpp @@ -61,7 +61,7 @@ struct Functor_TestBatchedSerialLU { template void impl_test_batched_lu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp index dd39be3dd1..48e8e5dead 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSolveLU.hpp @@ -139,7 +139,7 @@ struct Functor_TestBatchedSerialSolveLU { template void impl_test_batched_solvelu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp index 7082d183a5..af38e62e4d 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrmm.hpp @@ -61,7 +61,7 @@ struct VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -150,7 +150,7 @@ void impl_test_batched_trmm(const int N, const int nRows, const int nCols, const char* trans) { typedef typename ViewType::value_type value_type; typedef typename DeviceType::execution_space 
execution_space; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ScalarType alpha(1.0); ScalarType beta(0.0); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp index f109c44e0b..c0ef098652 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsm.hpp @@ -75,7 +75,7 @@ template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.0); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp index 9dc003dd19..f05a6f7fa5 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrsv.hpp @@ -74,7 +74,7 @@ template void impl_test_batched_trsv(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.5); diff --git a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp index ee0982efd4..8f4ae64b7e 100644 --- a/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialTrtri.hpp @@ -63,7 +63,7 @@ struct VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -143,7 +143,7 @@ template ats; + typedef Kokkos::ArithTraits ats; ScalarType alpha(1.0); ScalarType beta(0.0); diff --git 
a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp index 873b244bd8..7941fc0284 100644 --- a/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamAxpy.hpp @@ -77,7 +77,7 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename ViewType::value_type value_type; typedef typename ViewType::const_value_type const_value_type; typedef typename alphaViewType::const_value_type alpha_const_value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp index c60552827e..9023a009af 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGemm.hpp @@ -90,7 +90,7 @@ void impl_test_batched_teamgemm(const int N, const int matAdim1, using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; using value_type = typename ViewType::value_type; - using ats = Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); diff --git a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp index 2ecdb60bcf..89f67e2731 100644 --- a/batched/dense/unit_test/Test_Batched_TeamGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamGesv.hpp @@ -83,10 +83,9 @@ template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename 
Kokkos::ArithTraits::mag_type; using NormViewType = Kokkos::View; diff --git a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp index 4a0d60ba2f..8657de9856 100644 --- a/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamInverseLU.hpp @@ -161,7 +161,7 @@ template void impl_test_batched_inverselu(const int N, const int BlkSize) { typedef typename AViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views AViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamLU.hpp b/batched/dense/unit_test/Test_Batched_TeamLU.hpp index f7ac07ce46..04e191b9cb 100644 --- a/batched/dense/unit_test/Test_Batched_TeamLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamLU.hpp @@ -69,7 +69,7 @@ struct Functor_TestBatchedTeamLU { template void impl_test_batched_lu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp index 77b3d697a8..41287f9b52 100644 --- a/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamSolveLU.hpp @@ -155,7 +155,7 @@ struct Functor_TestBatchedTeamSolveLU { template void impl_test_batched_solvelu(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ViewType a0("a0", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp index 
63effa103c..2f7781745d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsm.hpp @@ -85,7 +85,7 @@ template void impl_test_batched_trsm(const int N, const int BlkSize, const int NumCols) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.0); diff --git a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp index 4d20bbc7cf..bb00b78736 100644 --- a/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamTrsv.hpp @@ -83,7 +83,7 @@ template void impl_test_batched_trsv(const int N, const int BlkSize) { typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; /// randomized input testing views ScalarType alpha(1.5); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp index 83d47edf8f..5ea8a80717 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorAxpy.hpp @@ -78,7 +78,7 @@ void impl_test_batched_axpy(const int N, const int BlkSize, const int N_team) { typedef typename ViewType::value_type value_type; typedef typename ViewType::const_value_type const_value_type; typedef typename alphaViewType::const_value_type alpha_const_value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; ViewType X0("x0", N, BlkSize), X1("x1", N, BlkSize), Y0("y0", N, BlkSize), Y1("y1", N, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp index d3aa42c49a..327f28353e 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp +++ 
b/batched/dense/unit_test/Test_Batched_TeamVectorGemm.hpp @@ -85,7 +85,7 @@ void impl_test_batched_teamvectorgemm(const int N, const int matAdim1, using transB = typename ParamTagType::transB; using execution_space = typename DeviceType::execution_space; using value_type = typename ViewType::value_type; - using ats = Kokkos::Details::ArithTraits; + using ats = Kokkos::ArithTraits; /// randomized input testing views ScalarType alpha = ScalarType(1.5), beta = ScalarType(3.0); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp index 8392e1b9fc..2026f2f81d 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorGesv.hpp @@ -84,10 +84,9 @@ template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using NormViewType = Kokkos::View; diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp index d7e237094d..e6133be92a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp @@ -110,7 +110,7 @@ template void impl_test_batched_qr(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const value_type one(1); /// randomized input testing views MatrixViewType a("a", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 648ae43566..1abd45f93d 100644 --- 
a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -119,7 +119,7 @@ template void impl_test_batched_qr_with_columnpivoting(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input testing views MatrixViewType a("a", N, BlkSize, BlkSize); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp index 9d080e6e48..29496c1b87 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV.hpp @@ -132,7 +132,7 @@ template void impl_test_batched_solve_utv(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input testing views MatrixViewType r("r", N, BlkSize, 3); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp index 0d52d0d0e4..45d6093f2a 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorSolveUTV2.hpp @@ -136,7 +136,7 @@ template void impl_test_batched_solve_utv2(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input testing views MatrixViewType r("r", N, BlkSize, 3); diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp index f61cf2729a..527c93e059 100644 --- 
a/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorUTV.hpp @@ -166,7 +166,7 @@ template void impl_test_batched_utv(const int N, const int BlkSize) { typedef typename MatrixViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; // const value_type one(1); /// randomized input testing views MatrixViewType r("r", N, BlkSize, 3); diff --git a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp index 0c0c80f7b8..1006325f94 100644 --- a/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorArithmatic.hpp @@ -52,10 +52,10 @@ void impl_test_complex_real_imag_value() { a[k].imag() = k * 5 + 4; } - const auto a_real = Kokkos::Details::ArithTraits::real(a); - const auto a_imag = Kokkos::Details::ArithTraits::imag(a); + const auto a_real = Kokkos::ArithTraits::real(a); + const auto a_imag = Kokkos::ArithTraits::imag(a); - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); for (int k = 0; k < vector_length; ++k) { EXPECT_NEAR(a[k].real(), a_real[k], eps); @@ -71,7 +71,7 @@ void impl_test_batched_vector_arithmatic() { typedef typename vector_type::value_type value_type; const int vector_length = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_type a, b, c; diff --git a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp index a740bac9dd..9393afd77b 100644 --- a/batched/dense/unit_test/Test_Batched_VectorLogical.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorLogical.hpp @@ -45,7 +45,7 @@ void impl_test_batched_vector_logical() { typedef ValueType value_type; const int vector_length = 
VectorLength; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_int_type a, b; diff --git a/batched/dense/unit_test/Test_Batched_VectorMath.hpp b/batched/dense/unit_test/Test_Batched_VectorMath.hpp index da0556fc0f..d2aa9eb7bc 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMath.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMath.hpp @@ -46,7 +46,7 @@ void impl_test_batched_vector_math() { typedef typename vector_type::value_type value_type; const int vector_length = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_type a, b, aref, bref; @@ -136,7 +136,7 @@ int test_batched_vector_math() { // template // int test_complex_pow() { -// typedef Kokkos::Details::ArithTraits > ats; +// typedef Kokkos::ArithTraits > ats; // typedef typename ats::mag_type mag_type; // const mag_type eps = 1.0e3 * ats::epsilon(); diff --git a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp index e465af5417..70d0e10cd2 100644 --- a/batched/dense/unit_test/Test_Batched_VectorMisc.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorMisc.hpp @@ -46,7 +46,7 @@ void impl_test_batched_vector_misc() { typedef typename vector_type::value_type value_type; const int vector_length = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; vector_type a, b, c; diff --git a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp index bf6c76d1ec..54eb2938e5 100644 --- a/batched/dense/unit_test/Test_Batched_VectorRelation.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorRelation.hpp @@ -46,7 +46,7 @@ void impl_test_batched_vector_relation() { typedef typename vector_type::value_type value_type; const int 
vector_length = vector_type::vector_length; - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; // typedef typename ats::mag_type mag_type; vector_type a, b; diff --git a/batched/dense/unit_test/Test_Batched_VectorView.hpp b/batched/dense/unit_test/Test_Batched_VectorView.hpp index a5b752b3d1..9c0e9845d9 100644 --- a/batched/dense/unit_test/Test_Batched_VectorView.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorView.hpp @@ -67,7 +67,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0 / vl, i1, i2, i3, i4, i5, i6, i7)[i0 % vl], @@ -79,7 +79,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1 / vl, i2, i3, i4, i5, i6, i7)[i1 % vl], @@ -91,7 +91,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2 / vl, i3, i4, i5, i6, i7)[i2 % vl], @@ -103,7 +103,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP 
EXPECT_NEAR_KK(a.access(i0, i1, i2, i3 / vl, i4, i5, i6, i7)[i3 % vl], @@ -115,7 +115,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4 / vl, i5, i6, i7)[i4 % vl], @@ -127,7 +127,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5 / vl, i6, i7)[i5 % vl], @@ -139,7 +139,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6 / vl, i7)[i6 % vl], @@ -151,7 +151,7 @@ void impl_verify_vector_view( const SimdViewAccess >& b) { typedef typename VectorViewType::value_type vector_type; constexpr int vl = vector_type::vector_length; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const typename ats::mag_type eps = 1.0e3 * ats::epsilon(); TEST_LOOP EXPECT_NEAR_KK(a.access(i0, i1, i2, i3, i4, i5, i6, i7 / vl)[i7 % vl], diff --git a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index 030e452249..c11ad96959 100644 --- a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -43,7 +43,7 @@ 
KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( const VectorViewType& _X, const KrylovHandleType& handle, const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; const size_t maximum_iteration = handle.get_max_iteration(); @@ -179,7 +179,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( typename VectorViewType::array_layout, typename VectorViewType::execution_space::scratch_memory_space>; using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; @@ -201,7 +201,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( const int last_matrix = handle.last_index(member.league_rank()); using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; diff --git a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index aa528322ad..bf2f1d2e86 100644 --- a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -41,7 +41,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( const VectorViewType& _X, const KrylovHandle& handle, const TMPViewType& _TMPView, const TMPNormViewType& _TMPNormView) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; size_t maximum_iteration = handle.get_max_iteration(); @@ -177,7 +177,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( typename 
VectorViewType::array_layout, typename VectorViewType::execution_space::scratch_memory_space>; using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; @@ -199,7 +199,7 @@ KOKKOS_INLINE_FUNCTION int TeamCG::invoke( const int last_matrix = handle.last_index(member.league_rank()); using ScratchPadNormViewType = Kokkos::View< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type**, typename VectorViewType::execution_space::scratch_memory_space>; diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index 071b2d6634..923b67c105 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -45,9 +45,9 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, const KrylovHandleType& handle, const int GMRES_id) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; + typedef Kokkos::ArithTraits ATM; using SerialCopy1D = SerialCopy; using SerialCopy2D = SerialCopy; diff --git a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index e76d8c4239..a7219ecc91 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -48,9 +48,9 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename 
Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; + typedef Kokkos::ArithTraits ATM; using TeamVectorCopy1D = TeamVectorCopy; diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 15cb7bdca9..bb8f446f07 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -47,9 +47,9 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( const KrylovHandleType& handle, const ArnoldiViewType& _ArnoldiView, const TMPViewType& _TMPView) { typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; + typedef Kokkos::ArithTraits ATM; using TeamCopy1D = TeamCopy; diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index cafdc602a0..b7527d923c 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -215,11 +215,11 @@ struct SerialSpmv { template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& alpha, const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) @@ -277,7 +277,7 @@ struct SerialSpmv { #endif return SerialSpmvInternal::template invoke< - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< 
typename ValuesViewType::non_const_value_type>::mag_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 5c0edbd390..516aded68e 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -403,11 +403,11 @@ struct TeamVectorSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& alpha, const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) @@ -466,7 +466,7 @@ struct TeamVectorSpmv { return TeamVectorSpmvInternal::template invoke< MemberType, - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index fb9f44e8b0..d8282d0aeb 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -254,11 +254,11 @@ struct TeamSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& alpha, const ValuesViewType& values, const IntView& row_ptr, const 
IntView& colIndices, const xViewType& X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type& beta, const yViewType& Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) @@ -317,7 +317,7 @@ struct TeamSpmv { return TeamSpmvInternal::template invoke< MemberType, - typename Kokkos::Details::ArithTraits< + typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, diff --git a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp index 4ceddeea44..92acc91a9e 100644 --- a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp +++ b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp @@ -29,9 +29,8 @@ namespace KokkosBatched { template class CrsMatrix { public: - using ScalarType = typename ValuesViewType::non_const_value_type; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using ScalarType = typename ValuesViewType::non_const_value_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; private: ValuesViewType values; @@ -82,10 +81,9 @@ class CrsMatrix { typename XViewType, typename YViewType> KOKKOS_INLINE_FUNCTION void apply( const MemberType &member, const XViewType &X, const YViewType &Y, - MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), - MagnitudeType beta = - Kokkos::Details::ArithTraits::zero()) const { - if (beta == Kokkos::Details::ArithTraits::zero()) { + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + if (beta == Kokkos::ArithTraits::zero()) { if (member.team_size() == 1 && n_operators == 8) KokkosBatched::TeamVectorSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( @@ -109,10 +107,9 @@ class CrsMatrix { template KOKKOS_INLINE_FUNCTION void apply( const XViewType &X, const YViewType &Y, - 
MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), - MagnitudeType beta = - Kokkos::Details::ArithTraits::zero()) const { - if (beta == Kokkos::Details::ArithTraits::zero()) + MagnitudeType alpha = Kokkos::ArithTraits::one(), + MagnitudeType beta = Kokkos::ArithTraits::zero()) const { + if (beta == Kokkos::ArithTraits::zero()) KokkosBatched::SerialSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( alpha, values, row_ptr, colIndices, X, beta, Y); diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 1185ec94d4..728bb2d921 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -29,9 +29,8 @@ namespace KokkosBatched { template class JacobiPrec { public: - using ScalarType = typename ValuesViewType::non_const_value_type; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using ScalarType = typename ValuesViewType::non_const_value_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; private: ValuesViewType diag_values; @@ -55,8 +54,8 @@ class JacobiPrec { template KOKKOS_INLINE_FUNCTION void computeInverse(const MemberType &member) const { - auto one = Kokkos::Details::ArithTraits::one(); - auto epsilon = Kokkos::Details::ArithTraits::epsilon(); + auto one = Kokkos::ArithTraits::one(); + auto epsilon = Kokkos::ArithTraits::epsilon(); int tooSmall = 0; if (std::is_same::value) { for (int i = 0; i < n_operators; ++i) @@ -118,8 +117,8 @@ class JacobiPrec { } KOKKOS_INLINE_FUNCTION void computeInverse() const { - auto one = Kokkos::Details::ArithTraits::one(); - auto epsilon = Kokkos::Details::ArithTraits::epsilon(); + auto one = Kokkos::ArithTraits::one(); + auto epsilon = Kokkos::ArithTraits::epsilon(); int tooSmall = 0; for (int i = 0; i < n_operators; ++i) diff --git a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp 
b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index 89fbd79e04..9992742dd8 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -87,7 +87,7 @@ class KrylovHandle { batched_size(_batched_size), N_team(_N_team), monitor_residual(_monitor_residual) { - tolerance = Kokkos::Details::ArithTraits::epsilon(); + tolerance = Kokkos::ArithTraits::epsilon(); max_tolerance = 1e-30; if (std::is_same::value) max_tolerance = 1e-50; if (monitor_residual) { diff --git a/batched/sparse/src/KokkosBatched_Spmv.hpp b/batched/sparse/src/KokkosBatched_Spmv.hpp index 9debd0bc27..da70acb6bb 100644 --- a/batched/sparse/src/KokkosBatched_Spmv.hpp +++ b/batched/sparse/src/KokkosBatched_Spmv.hpp @@ -75,11 +75,11 @@ struct SerialSpmv { template KOKKOS_INLINE_FUNCTION static int invoke( - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &X, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, const yViewType &Y); }; @@ -139,11 +139,11 @@ struct TeamSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, const yViewType &y); }; @@ -205,11 +205,11 @@ struct TeamVectorSpmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename 
Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, const yViewType &y); }; @@ -276,11 +276,11 @@ struct Spmv { typename yViewType, int dobeta> KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, - const typename Kokkos::Details::ArithTraits< + const typename Kokkos::ArithTraits< typename ValuesViewType::non_const_value_type>::mag_type &beta, const yViewType &y) { int r_val = 0; diff --git a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp index c8833d27df..45b6a71f99 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialGMRES.hpp @@ -109,7 +109,7 @@ template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -125,9 +125,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = 
Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp index 0aeb69fbc5..338a93d0eb 100644 --- a/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_SerialSpmv.hpp @@ -86,7 +86,7 @@ template void impl_test_batched_spmv(const int N, const int BlkSize) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; diff --git a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp index d6aa0cc949..41fa682bdd 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamCG.hpp @@ -95,7 +95,7 @@ template void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -110,9 +110,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp index a0ef9bdd4f..2b7ab73790 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamGMRES.hpp @@ -133,7 +133,7 @@ template void impl_test_batched_GMRES(const int N, const int BlkSize, const 
int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -149,9 +149,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp index 2efc3e9786..5c077f75ed 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamSpmv.hpp @@ -111,7 +111,7 @@ template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp index d326d1429d..abadf27953 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorCG.hpp @@ -97,7 +97,7 @@ template void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -112,9 +112,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename 
ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp index d62e814e91..f4f208a829 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorGMRES.hpp @@ -133,7 +133,7 @@ template void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; @@ -149,9 +149,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; + using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp index d54f3c20e4..67d944b159 100644 --- a/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp +++ b/batched/sparse/unit_test/Test_Batched_TeamVectorSpmv.hpp @@ -119,7 +119,7 @@ template void impl_test_batched_spmv(const int N, const int BlkSize, const int N_team) { typedef typename ValuesViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; const int nnz = (BlkSize - 2) * 3 + 2 * 2; diff --git 
a/blas/impl/KokkosBlas1_abs_impl.hpp b/blas/impl/KokkosBlas1_abs_impl.hpp index a1b86cffb7..d23ba1d7ed 100644 --- a/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/blas/impl/KokkosBlas1_abs_impl.hpp @@ -32,7 +32,7 @@ template struct MV_Abs_Functor { typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -70,7 +70,7 @@ template struct MV_AbsSelf_Functor { typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -100,7 +100,7 @@ template struct V_Abs_Functor { typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; XV X_; @@ -130,7 +130,7 @@ template struct V_AbsSelf_Functor { typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index f6242c1514..00fc445ec9 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -54,7 +54,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; XV m_x; YV m_y; @@ -188,7 +188,7 @@ struct Axpby_Functor { typedef typename YV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; XV m_x; YV m_y; diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 38fe86ecd8..4ef3201163 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -45,7 +45,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; XMV m_x; @@ -288,7 +288,7 @@ 
struct Axpby_MV_Functor { typedef typename YMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; XMV m_x; @@ -502,7 +502,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; XMV m_x; YMV m_y; @@ -730,7 +730,7 @@ struct Axpby_MV_Unroll_Functor { typedef typename YMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; XMV m_x; YMV m_y; diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index a02c34eca3..6561163fd1 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -225,8 +225,8 @@ struct Axpby ATA; - typedef Kokkos::Details::ArithTraits ATB; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; static void axpby(const AV& alpha, const XMV& X, const BV& beta, const YMV& Y) { @@ -327,8 +327,8 @@ struct Axpby ATA; - typedef Kokkos::Details::ArithTraits ATB; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; static void axpby(const AV& alpha, const XV& X, const BV& beta, const YV& Y) { static_assert(Kokkos::is_view::value, diff --git a/blas/impl/KokkosBlas1_dot_impl.hpp b/blas/impl/KokkosBlas1_dot_impl.hpp index cfcd0020ef..56d964ca07 100644 --- a/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_impl.hpp @@ -56,7 +56,7 @@ struct DotFunctor { } KOKKOS_INLINE_FUNCTION void init(value_type& update) const { - update = Kokkos::Details::ArithTraits::zero(); + update = Kokkos::ArithTraits::zero(); } KOKKOS_INLINE_FUNCTION void join(value_type& update, diff --git a/blas/impl/KokkosBlas1_iamax_impl.hpp b/blas/impl/KokkosBlas1_iamax_impl.hpp index 855a503422..369084aacb 100644 --- a/blas/impl/KokkosBlas1_iamax_impl.hpp +++ b/blas/impl/KokkosBlas1_iamax_impl.hpp @@ -82,7 +82,7 @@ struct V_Iamax_Functor { template void V_Iamax_Invoke(const RV& r, 
const XV& X) { using execution_space = typename XV::execution_space; - using AT = Kokkos::Details::ArithTraits; + using AT = Kokkos::ArithTraits; using mag_type = typename AT::mag_type; const SizeType numRows = static_cast(X.extent(0)); diff --git a/blas/impl/KokkosBlas1_mult_impl.hpp b/blas/impl/KokkosBlas1_mult_impl.hpp index 988b5cf534..754cce4d12 100644 --- a/blas/impl/KokkosBlas1_mult_impl.hpp +++ b/blas/impl/KokkosBlas1_mult_impl.hpp @@ -39,7 +39,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; const size_type m_n; typename CMV::const_value_type m_c; @@ -107,7 +107,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; typename CV::const_value_type m_c; CV m_C; @@ -152,8 +152,8 @@ void V_Mult_Generic(typename CV::const_value_type& c, const CV& C, const BV& B) { using Kokkos::ALL; using Kokkos::subview; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATC; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATC; typedef typename CV::execution_space execution_space; const SizeType numRows = C.extent(0); @@ -197,8 +197,8 @@ template void MV_Mult_Generic(typename CMV::const_value_type& c, const CMV& C, typename AV::const_value_type& ab, const AV& A, const BMV& B) { - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATC; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATC; typedef typename CMV::execution_space execution_space; if (C.extent(1) == 1) { diff --git a/blas/impl/KokkosBlas1_nrm2_impl.hpp b/blas/impl/KokkosBlas1_nrm2_impl.hpp index 32f4660f18..c852447f7b 100644 --- a/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -39,7 +39,7 @@ struct V_Nrm2_Functor { typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; typename 
XV::const_type m_x; @@ -80,8 +80,7 @@ struct V_Nrm2_Functor { KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = - Kokkos::Details::ArithTraits::sqrt( - update); + Kokkos::ArithTraits::sqrt(update); } }; @@ -96,7 +95,7 @@ struct Nrm2_MV_Functor { typedef typename RV::non_const_value_type rvalue_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; using TeamMem = typename Kokkos::TeamPolicy::member_type; diff --git a/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 69667bf838..770846599f 100644 --- a/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -40,7 +40,7 @@ struct V_Nrm2w_Functor { typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; typename XV::const_type m_x, m_w; @@ -83,8 +83,7 @@ struct V_Nrm2w_Functor { KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = - Kokkos::Details::ArithTraits::sqrt( - update); + Kokkos::ArithTraits::sqrt(update); } }; @@ -93,7 +92,7 @@ struct Nrm2w_MV_Functor { typedef typename RV::non_const_value_type rvalue_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; using TeamMem = typename Kokkos::TeamPolicy::member_type; diff --git a/blas/impl/KokkosBlas1_nrminf_impl.hpp b/blas/impl/KokkosBlas1_nrminf_impl.hpp index c42aff8ba2..adbe5feb82 100644 --- a/blas/impl/KokkosBlas1_nrminf_impl.hpp +++ b/blas/impl/KokkosBlas1_nrminf_impl.hpp @@ 
-38,7 +38,7 @@ struct V_NrmInf_Functor { typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename IPT::mag_type value_type; typename XV::const_type m_x; @@ -72,7 +72,7 @@ struct V_NrmInf_Functor { template void V_NrmInf_Invoke(const RV& r, const XV& X) { typedef typename XV::execution_space execution_space; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; const SizeType numRows = static_cast(X.extent(0)); diff --git a/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/blas/impl/KokkosBlas1_reciprocal_impl.hpp index 1db1b9100d..6bef6080f0 100644 --- a/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -32,7 +32,7 @@ template struct MV_Reciprocal_Functor { typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -71,7 +71,7 @@ template struct MV_ReciprocalSelf_Functor { typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -102,7 +102,7 @@ template struct V_Reciprocal_Functor { typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; XV X_; @@ -132,7 +132,7 @@ template struct V_ReciprocalSelf_Functor { typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV R_; diff --git a/blas/impl/KokkosBlas1_scal_impl.hpp b/blas/impl/KokkosBlas1_scal_impl.hpp index 21974f1f7e..b1b3bd0264 100644 --- a/blas/impl/KokkosBlas1_scal_impl.hpp +++ 
b/blas/impl/KokkosBlas1_scal_impl.hpp @@ -46,7 +46,7 @@ template struct V_Scal_Functor { typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV m_r; XV m_x; @@ -103,7 +103,7 @@ struct V_Scal_Functor { typedef typename RV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RV m_r; XV m_x; diff --git a/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/blas/impl/KokkosBlas1_scal_mv_impl.hpp index f43101bd10..aded2fd19c 100644 --- a/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -47,7 +47,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV R_; @@ -129,7 +129,7 @@ struct MV_Scal_Functor { typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; RMV m_r; @@ -200,7 +200,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; RMV m_r; XMV m_x; @@ -261,7 +261,7 @@ struct MV_Scal_Unroll_Functor { typedef typename RMV::execution_space execution_space; typedef SizeType size_type; - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; RMV m_r; XMV m_x; diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index 82bf4709b7..8d85f5f1e9 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -113,7 +113,7 @@ struct Scal { typedef typename XV::non_const_value_type AV; typedef typename XV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; + typedef Kokkos::ArithTraits ATA; static void scal(const RV& R, const AV& alpha, const XV& X) { static_assert(Kokkos::is_view::value, @@ -172,7 +172,7 @@ struct Scal struct Scal { typedef typename XMV::size_type size_type; - typedef 
Kokkos::Details::ArithTraits ATA; + typedef Kokkos::ArithTraits ATA; static void scal(const RMV& R, const AV& av, const XMV& X) { static_assert(Kokkos::is_view::value, @@ -232,7 +232,7 @@ struct Scal { typedef typename XMV::non_const_value_type AV; typedef typename XMV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; + typedef Kokkos::ArithTraits ATA; static void scal(const RMV& R, const AV& alpha, const XMV& X) { static_assert(Kokkos::is_view::value, diff --git a/blas/impl/KokkosBlas1_sum_impl.hpp b/blas/impl/KokkosBlas1_sum_impl.hpp index 08dc4e36e4..20f88e6845 100644 --- a/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/blas/impl/KokkosBlas1_sum_impl.hpp @@ -40,7 +40,7 @@ struct V_Sum_Functor { typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename RV::non_const_value_type value_type; typename XV::const_type m_x; diff --git a/blas/impl/KokkosBlas1_team_abs_spec.hpp b/blas/impl/KokkosBlas1_team_abs_spec.hpp index 82418fe7d1..bcd9545738 100644 --- a/blas/impl/KokkosBlas1_team_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_team_abs_spec.hpp @@ -35,7 +35,7 @@ struct team_abs_tpl_spec_avail { template ::value> struct TeamAbs { - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X); @@ -43,7 +43,7 @@ struct TeamAbs { template struct TeamAbs { - typedef Kokkos::Details::ArithTraits ATS; + typedef Kokkos::ArithTraits ATS; static KOKKOS_INLINE_FUNCTION void team_abs(const TeamType& team, const RV& R, const XV& X) { diff --git a/blas/impl/KokkosBlas1_team_dot_spec.hpp b/blas/impl/KokkosBlas1_team_dot_spec.hpp index c141694926..041920d109 100644 --- a/blas/impl/KokkosBlas1_team_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_team_dot_spec.hpp @@ -53,7 +53,7 @@ struct TeamDot { static 
KOKKOS_INLINE_FUNCTION dot_type team_dot(const TeamType& team, const XV& X, const YV& Y) { - dot_type result = 0.0; // Kokkos::Details::ArithTraitszero(); + dot_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, N), diff --git a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp index 4ed19ef5df..ef050cb73b 100644 --- a/blas/impl/KokkosBlas1_team_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_team_nrm2_spec.hpp @@ -40,7 +40,7 @@ struct TeamNrm2 { typedef Kokkos::Details::InnerProductSpaceTraits< typename XV::non_const_value_type> IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, const XV& X); @@ -53,11 +53,11 @@ struct TeamNrm2 { typedef Kokkos::Details::InnerProductSpaceTraits< typename XV::non_const_value_type> IPT; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; static KOKKOS_INLINE_FUNCTION mag_type team_nrm2(const TeamType& team, const XV& X) { - mag_type result = 0.0; // Kokkos::Details::ArithTraitszero(); + mag_type result = 0.0; // Kokkos::ArithTraitszero(); int N = X.extent(0); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, N), diff --git a/blas/impl/KokkosBlas1_update_impl.hpp b/blas/impl/KokkosBlas1_update_impl.hpp index 5866764faf..99c80f8d3c 100644 --- a/blas/impl/KokkosBlas1_update_impl.hpp +++ b/blas/impl/KokkosBlas1_update_impl.hpp @@ -45,7 +45,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; const typename XMV::non_const_value_type alpha_; @@ -215,7 +215,7 @@ template ATS; + typedef Kokkos::ArithTraits ATS; const size_type numCols; const typename XV::non_const_value_type alpha_; diff --git a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index 78a6d9aa09..0b33e5224e 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ 
b/blas/impl/KokkosBlas1_update_spec.hpp @@ -117,9 +117,9 @@ struct Update { template struct Update { typedef typename XMV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATB; - typedef Kokkos::Details::ArithTraits ATC; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; + typedef Kokkos::ArithTraits ATC; static void update(const typename XMV::non_const_value_type& alpha, const XMV& X, @@ -222,9 +222,9 @@ struct Update { template struct Update { typedef typename XV::size_type size_type; - typedef Kokkos::Details::ArithTraits ATA; - typedef Kokkos::Details::ArithTraits ATB; - typedef Kokkos::Details::ArithTraits ATC; + typedef Kokkos::ArithTraits ATA; + typedef Kokkos::ArithTraits ATB; + typedef Kokkos::ArithTraits ATC; static void update(const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, diff --git a/blas/impl/KokkosBlas2_gemv_impl.hpp b/blas/impl/KokkosBlas2_gemv_impl.hpp index 7d7403c14b..730f88602a 100644 --- a/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -180,7 +180,7 @@ struct SingleLevelTransposeGEMV { KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i, value_type y_cur) const { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; using KAT = ArithTraits; const auto x_i = x_(i); @@ -238,9 +238,9 @@ void singleLevelGemv(const typename AViewType::execution_space& space, // depend on that or its implementation details. Instead, we reuse // an instantiation of the non-transpose case for alpha=0. 
if (A.extent(0) == 0 && (tr != 'N' && tr != 'n')) { - if (beta == Kokkos::Details::ArithTraits::zero()) { - Kokkos::deep_copy(y, Kokkos::Details::ArithTraits::zero()); - } else if (beta != Kokkos::Details::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta != Kokkos::ArithTraits::one()) { // "Fake out" a scal() by using the non-transpose alpha=0, // general beta case. This assumes that the functor doesn't // check dimensions. @@ -255,12 +255,11 @@ void singleLevelGemv(const typename AViewType::execution_space& space, } if (tr == 'N' || tr == 'n') { - if (alpha == Kokkos::Details::ArithTraits::zero()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (alpha == Kokkos::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, - Kokkos::Details::ArithTraits::zero()); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 using functor_type = @@ -269,14 +268,14 @@ void singleLevelGemv(const typename AViewType::execution_space& space, functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } - } else if (alpha == Kokkos::Details::ArithTraits::one()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelNontransposeGEMV; @@ -290,13 +289,13 @@ void singleLevelGemv(const typename 
AViewType::execution_space& space, Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); } } else { // alpha != 0 and alpha != 1 - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelNontransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelNontransposeGEMV; @@ -311,12 +310,11 @@ void singleLevelGemv(const typename AViewType::execution_space& space, } } } else if (tr == 'T' || tr == 't') { // transpose, no conjugate - if (alpha == Kokkos::Details::ArithTraits::zero()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (alpha == Kokkos::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, - Kokkos::Details::ArithTraits::zero()); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 using functor_type = @@ -326,15 +324,15 @@ void singleLevelGemv(const typename AViewType::execution_space& space, Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } - } else if (alpha == Kokkos::Details::ArithTraits::one()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelTransposeGEMV; @@ -350,14 
+348,14 @@ void singleLevelGemv(const typename AViewType::execution_space& space, functor); } } else { // alpha != 0 and alpha != 1 - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelTransposeGEMV; @@ -374,12 +372,11 @@ void singleLevelGemv(const typename AViewType::execution_space& space, } } } else if (tr == 'C' || tr == 'c' || tr == 'H' || tr == 'h') { // conj xpose - if (alpha == Kokkos::Details::ArithTraits::zero()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (alpha == Kokkos::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, - Kokkos::Details::ArithTraits::zero()); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + } else if (beta == Kokkos::ArithTraits::one()) { // Do nothing (y := 1 * y) } else { // beta != 0 && beta != 1 using functor_type = @@ -389,15 +386,15 @@ void singleLevelGemv(const typename AViewType::execution_space& space, Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); } - } else if (alpha == Kokkos::Details::ArithTraits::one()) { - if (beta == Kokkos::Details::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::one()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelTransposeGEMV; @@ 
-413,14 +410,14 @@ void singleLevelGemv(const typename AViewType::execution_space& space, functor); } } else { // alpha != 0 and alpha != 1 - if (beta == Kokkos::Details::ArithTraits::zero()) { + if (beta == Kokkos::ArithTraits::zero()) { using functor_type = SingleLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_reduce("KokkosBlas::gemv[SingleLevelTranspose]", range, functor); - } else if (beta == Kokkos::Details::ArithTraits::one()) { + } else if (beta == Kokkos::ArithTraits::one()) { using functor_type = SingleLevelTransposeGEMV; @@ -604,7 +601,7 @@ struct TwoLevelTransposeGEMV { public: KOKKOS_INLINE_FUNCTION void operator()(const member_type& team) const { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; using KAT_A = ArithTraits; using KAT_Y = ArithTraits; @@ -668,7 +665,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space, using team_policy_type = Kokkos::TeamPolicy; using range_policy_type = Kokkos::RangePolicy; - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; using KAT = ArithTraits; using YKAT = ArithTraits; @@ -746,7 +743,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space, } else { if (alpha == KAT::zero() && beta == KAT::zero()) { // Fill y with zeros - Kokkos::deep_copy(y, Kokkos::Details::ArithTraits::zero()); + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); } else if (alpha == KAT::zero() && beta == KAT::one()) { // Do nothing (y := 1 * y) } else if (tr == 'T') { diff --git a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp index 31975dfa9e..aa7efc9122 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp @@ -31,7 +31,7 @@ struct OpID { struct OpConj { template KOKKOS_INLINE_FUNCTION ValueType operator()(ValueType v) const { - using KAT = Kokkos::Details::ArithTraits; + using KAT = 
Kokkos::ArithTraits; return KAT::conj(v); } }; diff --git a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 462ac0c744..f54a1dd68c 100644 --- a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -43,8 +43,8 @@ struct DotBasedGEMM { using size_A = typename AV::size_type; using scalar_C = typename CV::non_const_value_type; using size_C = typename CV::size_type; - using AVT = Kokkos::Details::ArithTraits; - using CVT = Kokkos::Details::ArithTraits; + using AVT = Kokkos::ArithTraits; + using CVT = Kokkos::ArithTraits; const scalar_A alpha; const scalar_C beta; diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index f1f89b9908..b0271ad23d 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -65,7 +65,7 @@ template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -115,7 +115,7 @@ struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -159,7 +159,7 @@ template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -209,7 +209,7 @@ struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -258,7 +258,7 @@ 
template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -308,7 +308,7 @@ struct impl_deep_copy_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void copy(const TeamHandle& team, const ViewTypeScratch& A_scr, @@ -356,7 +356,7 @@ template struct impl_update_matrix_block { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void update(const TeamHandle& team, const value_type& beta, @@ -417,7 +417,7 @@ template { typedef typename ViewType::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; KOKKOS_INLINE_FUNCTION static void update(const TeamHandle& team, const value_type& beta, diff --git a/blas/impl/KokkosBlas3_trsm_impl.hpp b/blas/impl/KokkosBlas3_trsm_impl.hpp index 9700b62e67..87cac8b86a 100644 --- a/blas/impl/KokkosBlas3_trsm_impl.hpp +++ b/blas/impl/KokkosBlas3_trsm_impl.hpp @@ -40,7 +40,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const int as0, const int as1, /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); @@ -79,7 +79,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, const int as0, const int as1, /**/ ValueType* KOKKOS_RESTRICT B, const int bs0, const int bs1) { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; const ScalarType one(1.0), zero(0.0); diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 
ff037e59d1..45b82e8fcd 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -92,8 +92,7 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { template void axpy(const AV& a, const XMV& X, const YMV& Y) { - axpby(a, X, - Kokkos::Details::ArithTraits::one(), + axpby(a, X, Kokkos::ArithTraits::one(), Y); } diff --git a/blas/src/KokkosBlas1_team_axpby.hpp b/blas/src/KokkosBlas1_team_axpby.hpp index 165683df01..374bc42390 100644 --- a/blas/src/KokkosBlas1_team_axpby.hpp +++ b/blas/src/KokkosBlas1_team_axpby.hpp @@ -37,9 +37,7 @@ axpy(const TeamType& team, const typename XVector::non_const_value_type& a, const XVector& x, const YVector& y) { KokkosBlas::Experimental::axpby( team, a, x, - Kokkos::Details::ArithTraits< - typename YVector::non_const_value_type>::one(), - y); + Kokkos::ArithTraits::one(), y); } } // namespace Experimental diff --git a/blas/unit_test/Test_Blas1_abs.hpp b/blas/unit_test/Test_Blas1_abs.hpp index ff91087613..6ed2f9dbb3 100644 --- a/blas/unit_test/Test_Blas1_abs.hpp +++ b/blas/unit_test/Test_Blas1_abs.hpp @@ -24,7 +24,7 @@ template void impl_test_abs(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef Kokkos::View< ScalarA * [2], @@ -97,7 +97,7 @@ template void impl_test_abs_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; diff --git a/blas/unit_test/Test_Blas1_asum.hpp b/blas/unit_test/Test_Blas1_asum.hpp index 624bfc9d09..b1d617061b 100644 --- a/blas/unit_test/Test_Blas1_asum.hpp +++ b/blas/unit_test/Test_Blas1_asum.hpp @@ -23,7 +23,7 @@ namespace Test { template void impl_test_asum(int N) { typedef typename ViewTypeA::value_type 
ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef Kokkos::ArithTraits MAT; typedef Kokkos::View< diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index ced1759301..2b9885e30f 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -23,7 +23,7 @@ namespace Test { template void impl_test_iamax(int N) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename AT::mag_type mag_type; using size_type = typename ViewTypeA::size_type; @@ -42,7 +42,7 @@ void impl_test_iamax(int N) { typename ViewTypeA::const_type c_a = a; - mag_type expected_result = Kokkos::Details::ArithTraits::min(); + mag_type expected_result = Kokkos::ArithTraits::min(); size_type expected_max_loc = 0; for (int i = 0; i < N; i++) { mag_type val = AT::abs(h_a(i)); @@ -114,7 +114,7 @@ void impl_test_iamax(int N) { template void impl_test_iamax_mv(int N, int K) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename AT::mag_type mag_type; typedef typename ViewTypeA::size_type size_type; @@ -145,7 +145,7 @@ void impl_test_iamax_mv(int N, int K) { size_type* expected_max_loc = new size_type[K]; for (int j = 0; j < K; j++) { - expected_result[j] = Kokkos::Details::ArithTraits::min(); + expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { mag_type val = AT::abs(h_a(i, j)); if (val > expected_result[j]) { diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index b64aab9c3c..e17f8b988a 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -66,7 +66,7 @@ void impl_test_nrm1(int N) { template void impl_test_nrm1_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef 
Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef typename AT::mag_type mag_type; typedef Kokkos::ArithTraits MAT; diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index d17c9af505..b7444b76df 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -23,7 +23,7 @@ namespace Test { template void impl_test_nrm2(int N) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; ViewTypeA a("A", N); @@ -45,8 +45,8 @@ void impl_test_nrm2(int N) { for (int i = 0; i < N; i++) { expected_result += AT::abs(h_a(i)) * AT::abs(h_a(i)); } - expected_result = Kokkos::Details::ArithTraits::sqrt( - expected_result); + expected_result = + Kokkos::ArithTraits::sqrt(expected_result); typename AT::mag_type nonconst_result = KokkosBlas::nrm2(a); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); @@ -58,7 +58,7 @@ void impl_test_nrm2(int N) { template void impl_test_nrm2_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; @@ -90,8 +90,7 @@ void impl_test_nrm2_mv(int N, int K) { expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); } expected_result[j] = - Kokkos::Details::ArithTraits::sqrt( - expected_result[j]); + Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; diff --git a/blas/unit_test/Test_Blas1_nrm2_squared.hpp b/blas/unit_test/Test_Blas1_nrm2_squared.hpp index ebebd57b9a..7bfb46446f 100644 --- a/blas/unit_test/Test_Blas1_nrm2_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2_squared.hpp @@ -23,7 +23,7 @@ namespace Test { template void impl_test_nrm2_squared(int N) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef Kokkos::View< ScalarA * [2], @@ -68,7 +68,7 @@ void impl_test_nrm2_squared(int N) { template void impl_test_nrm2_squared_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 8da5550afa..9a8a79c115 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -23,7 +23,7 @@ namespace Test { template void impl_test_nrminf(int N) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; ViewTypeA a("A", N); @@ -42,7 +42,7 @@ void impl_test_nrminf(int N) { double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; typename AT::mag_type expected_result = - Kokkos::Details::ArithTraits::min(); + Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) if (AT::abs(h_a(i)) > expected_result) expected_result = AT::abs(h_a(i)); @@ -58,7 +58,7 @@ void impl_test_nrminf(int N) { template void impl_test_nrminf_mv(int N, int K) { typedef typename ViewTypeA::non_const_value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; @@ -85,8 +85,7 @@ void impl_test_nrminf_mv(int N, int K) { typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { - expected_result[j] = - Kokkos::Details::ArithTraits::min(); + expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { if (AT::abs(h_a(i, j)) > expected_result[j]) expected_result[j] = AT::abs(h_a(i, j)); diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index 49dd1c6119..687aacf1d9 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -25,7 +25,7 @@ template void impl_test_reciprocal(int N) { using ScalarA = typename ViewTypeA::value_type; using ScalarB = typename ViewTypeB::value_type; - using AT = Kokkos::Details::ArithTraits; + using AT = Kokkos::ArithTraits; using MagnitudeA = typename AT::mag_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 1c572073a5..4c414ea735 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -25,7 +25,7 @@ template void impl_test_scal(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; ScalarA a(3); typename AT::mag_type eps = AT::epsilon() * 1000; @@ -76,7 +76,7 @@ template void 
impl_test_scal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; diff --git a/blas/unit_test/Test_Blas1_serial_setscal.hpp b/blas/unit_test/Test_Blas1_serial_setscal.hpp index bb33aa451a..80a0561d60 100644 --- a/blas/unit_test/Test_Blas1_serial_setscal.hpp +++ b/blas/unit_test/Test_Blas1_serial_setscal.hpp @@ -99,7 +99,7 @@ template ats; + typedef Kokkos::ArithTraits ats; /// radomized input testing views const ScalarType alpha = 11.1; diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index 318f04c58e..8cb8d9cf43 100644 --- a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -39,7 +39,7 @@ void impl_test_team_abs(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef Kokkos::View< ScalarA * [2], @@ -141,7 +141,7 @@ void impl_test_team_abs_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index e776085a66..3e071e7537 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -193,7 +193,7 @@ void impl_test_team_axpby_mv(int N, int K) { Kokkos::View r("Dot::Result", K); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp 
b/blas/unit_test/Test_Blas1_team_nrm2.hpp index 8ac35e5cbc..05d4970bcd 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -35,7 +35,7 @@ void impl_test_team_nrm2(int N, int K) { const team_policy policy(K, Kokkos::AUTO); typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; @@ -64,8 +64,7 @@ void impl_test_team_nrm2(int N, int K) { for (int i = 0; i < N; i++) expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); expected_result[j] = - Kokkos::Details::ArithTraits::sqrt( - expected_result[j]); + Kokkos::ArithTraits::sqrt(expected_result[j]); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; diff --git a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index a33d5cd930..5d9f298d06 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -39,7 +39,7 @@ void impl_test_team_scal(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef Kokkos::View< ScalarA * [2], @@ -157,7 +157,7 @@ void impl_test_team_scal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; typedef multivector_layout_adapter vfA_type; typedef multivector_layout_adapter vfB_type; diff --git a/blas/unit_test/Test_Blas1_team_setscal.hpp b/blas/unit_test/Test_Blas1_team_setscal.hpp index fd30cc5bfb..ff593d3eeb 100644 --- a/blas/unit_test/Test_Blas1_team_setscal.hpp +++ b/blas/unit_test/Test_Blas1_team_setscal.hpp @@ -111,7 +111,7 @@ template ats; + typedef Kokkos::ArithTraits ats; /// radomized input testing views const ScalarType alpha = 11.1; diff --git 
a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index a210806929..adfe99f866 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -35,7 +35,7 @@ struct gemm_VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -142,7 +142,7 @@ struct DiffGEMM { ViewTypeC C, C2; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; KOKKOS_INLINE_FUNCTION @@ -177,7 +177,7 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; double machine_eps = APT::epsilon(); @@ -265,7 +265,7 @@ void impl_test_stream_gemm(const int M, const int N, const int K, using ViewTypeB = Kokkos::View; using ViewTypeC = Kokkos::View; using ScalarC = typename ViewTypeC::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; const char tA[] = {"N"}; diff --git a/blas/unit_test/Test_Blas3_trmm.hpp b/blas/unit_test/Test_Blas3_trmm.hpp index f52dd8dd54..188999c5e0 100644 --- a/blas/unit_test/Test_Blas3_trmm.hpp +++ b/blas/unit_test/Test_Blas3_trmm.hpp @@ -56,7 +56,7 @@ struct trmm_VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename 
APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -102,7 +102,7 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, const char* diag, int M, int N, Scalar alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; double machine_eps = APT::epsilon(); @@ -118,7 +118,7 @@ void impl_test_trmm(const char* side, const char* uplo, const char* trans, // printf("KokkosBlas::trmm test for alpha %g, %c %c %c %c, M %d, N %d, eps // %g, ViewType: %s\n", - // Kokkos::Details::ArithTraits::real(alpha),side[0],uplo[0],trans[0],diag[0],M,N,eps,typeid(ViewTypeA).name()); + // Kokkos::ArithTraits::real(alpha),side[0],uplo[0],trans[0],diag[0],M,N,eps,typeid(ViewTypeA).name()); typename ViewTypeA::HostMirror host_A = Kokkos::create_mirror_view(A); typename ViewTypeB::HostMirror host_B_actual = Kokkos::create_mirror_view(B); diff --git a/blas/unit_test/Test_Blas3_trsm.hpp b/blas/unit_test/Test_Blas3_trsm.hpp index 79859aa24a..5edd175652 100644 --- a/blas/unit_test/Test_Blas3_trsm.hpp +++ b/blas/unit_test/Test_Blas3_trsm.hpp @@ -56,7 +56,7 @@ struct trsm_VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -104,7 +104,7 @@ void impl_test_trsm(const char* side, const char* uplo, const char* trans, typename ViewTypeA::value_type alpha) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; double machine_eps = APT::epsilon(); diff 
--git a/blas/unit_test/Test_Blas_gesv.hpp b/blas/unit_test/Test_Blas_gesv.hpp index 207a06db07..81c94b9109 100644 --- a/blas/unit_test/Test_Blas_gesv.hpp +++ b/blas/unit_test/Test_Blas_gesv.hpp @@ -36,7 +36,7 @@ template void impl_test_gesv(const char* mode, const char* padding, int N) { typedef typename Device::execution_space execution_space; typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -141,7 +141,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) { typedef typename Device::execution_space execution_space; typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits ats; + typedef Kokkos::ArithTraits ats; Kokkos::Random_XorShift64_Pool rand_pool(13718); diff --git a/blas/unit_test/Test_Blas_trtri.hpp b/blas/unit_test/Test_Blas_trtri.hpp index 518b96495f..0bebb9edf0 100644 --- a/blas/unit_test/Test_Blas_trtri.hpp +++ b/blas/unit_test/Test_Blas_trtri.hpp @@ -55,7 +55,7 @@ struct VanillaGEMM { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; @@ -101,7 +101,7 @@ int impl_test_trtri(int bad_diag_idx, const char* uplo, const char* diag, const int M, const int N) { using execution_space = typename ViewTypeA::device_type::execution_space; using ScalarA = typename ViewTypeA::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; double machine_eps = APT::epsilon(); diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index baefbe8c35..27a9d4ebe8 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ 
b/common/src/KokkosKernels_SimpleUtils.hpp @@ -22,7 +22,7 @@ #define KOKKOSKERNELS_MACRO_MIN(x, y) ((x) < (y) ? (x) : (y)) #define KOKKOSKERNELS_MACRO_MAX(x, y) ((x) < (y) ? (y) : (x)) #define KOKKOSKERNELS_MACRO_ABS(x) \ - Kokkos::Details::ArithTraits::type>::abs(x) + Kokkos::ArithTraits::type>::abs(x) namespace KokkosKernels { @@ -38,7 +38,7 @@ class SquareRootFunctor { KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const { typedef typename ViewType::value_type value_type; - theView_(i) = Kokkos::Details::ArithTraits::sqrt(theView_(i)); + theView_(i) = Kokkos::ArithTraits::sqrt(theView_(i)); } private: @@ -219,7 +219,7 @@ inline void kk_reduce_view2(size_t num_elements, view_t arr, } template ::mag_type> struct IsIdenticalFunctor { view_type1 view1; @@ -232,7 +232,7 @@ struct IsIdenticalFunctor { KOKKOS_INLINE_FUNCTION void operator()(const size_t &i, size_t &is_equal) const { typedef typename view_type2::non_const_value_type val_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; typedef typename KAT::mag_type mag_type; const mag_type val_diff = KAT::abs(view1(i) - view2(i)); @@ -266,7 +266,7 @@ bool kk_is_identical_view(view_type1 view1, view_type2 view2, eps_type eps) { } template ::mag_type> struct IsRelativelyIdenticalFunctor { view_type1 view1; diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index fd04bd2529..2a4b749f92 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -837,7 +837,7 @@ template void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(vector, Kokkos::Details::ArithTraits::zero()); + Kokkos::deep_copy(vector, Kokkos::ArithTraits::zero()); } template diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 98ac27f1c9..31744f7a8f 100644 --- 
a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -18,7 +18,7 @@ #define KOKKOS_ARITHTRAITS_HPP /// \file Kokkos_ArithTraits.hpp -/// \brief Declaration and definition of Kokkos::Details::ArithTraits +/// \brief Declaration and definition of Kokkos::ArithTraits #include #include @@ -195,7 +195,6 @@ KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, } // namespace namespace Kokkos { -namespace Details { // Macro to automate the wrapping of Kokkos Mathematical Functions // in the ArithTraits struct for real floating point types, hopefully @@ -2043,13 +2042,12 @@ struct [[deprecated]] ArithTraits { }; #endif // HAVE_KOKKOS_QD -} // namespace Details +namespace Details { +template +using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = + ::Kokkos::ArithTraits; -// Promote ArithTraits into Kokkos namespace. At some point, we -// will remove it from the Details namespace completely. We leave -// it there for now, because a lot of code depends on it being -// there. -using Details::ArithTraits; +} // namespace Details } // namespace Kokkos #endif // KOKKOS_ARITHTRAITS_HPP diff --git a/common/src/Kokkos_InnerProductSpaceTraits.hpp b/common/src/Kokkos_InnerProductSpaceTraits.hpp index 072125115c..c2bc475c45 100644 --- a/common/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/common/src/Kokkos_InnerProductSpaceTraits.hpp @@ -105,7 +105,7 @@ namespace Details { /// /// \section Kokkos_IPST_new Adding a specialization for a new type T /// -/// You must first add a specialization of ArithTraits. Please +/// You must first add a specialization of Kokkos::ArithTraits. Please /// note that if CUDA does not support using T in device functions, /// then you must not mark norm() or dot() as device functions /// in your specialization. (Simply omit the KOKKOS_FORCEINLINE_FUNCTION @@ -119,14 +119,14 @@ class InnerProductSpaceTraits { typedef T val_type; //! The type returned by norm(x) for a value x of type val_type. 
- typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; //! The type returned by dot(x,y) for values x and y of type val_type. typedef val_type dot_type; //! The "norm" (absolute value or magnitude) of a value x of type val_type. static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } /// \brief The "dot product" of two values x and y of type val_type. /// @@ -146,11 +146,11 @@ class InnerProductSpaceTraits { template <> struct InnerProductSpaceTraits { typedef long double val_type; - typedef ArithTraits::mag_type mag_type; + typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -160,11 +160,11 @@ template class InnerProductSpaceTraits> { public: typedef Kokkos::complex val_type; - typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static KOKKOS_FORCEINLINE_FUNCTION mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static KOKKOS_FORCEINLINE_FUNCTION dot_type dot(const val_type& x, const val_type& y) { @@ -179,11 +179,11 @@ class InnerProductSpaceTraits> { template struct InnerProductSpaceTraits> { typedef std::complex val_type; - typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return std::conj(x) * y; @@ -200,11 +200,11 @@ struct InnerProductSpaceTraits> { template <> struct InnerProductSpaceTraits<__float128> { typedef 
__float128 val_type; - typedef typename ArithTraits::mag_type mag_type; + typedef typename Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -223,17 +223,17 @@ struct InnerProductSpaceTraits<__float128> { // functions. It should be possible to use Kokkos' support for // aggregate types to implement device function support for dd_real // and qd_real, but we have not done this yet (as of 07 Jan 2014). -// Hence, the class methods of the ArithTraits specializations for +// Hence, the class methods of the Kokkos::ArithTraits specializations for // dd_real and qd_real are not marked as device functions. #ifdef HAVE_KOKKOS_QD template <> struct InnerProductSpaceTraits { typedef dd_real val_type; - typedef ArithTraits::mag_type mag_type; + typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; @@ -241,11 +241,11 @@ struct InnerProductSpaceTraits { template <> struct InnerProductSpaceTraits { typedef qd_real val_type; - typedef ArithTraits::mag_type mag_type; + typedef Kokkos::ArithTraits::mag_type mag_type; typedef val_type dot_type; static mag_type norm(const val_type& x) { - return ArithTraits::abs(x); + return Kokkos::ArithTraits::abs(x); } static dot_type dot(const val_type& x, const val_type& y) { return x * y; } }; diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 29d0498055..8aa963b2ab 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -15,15 +15,15 @@ //@HEADER /// \file ArithTraitsTest.hpp -/// \brief Templated test for 
Kokkos::Details::ArithTraits +/// \brief Templated test for Kokkos::ArithTraits /// /// This header file is an implementation detail of the tests for -/// Kokkos::Details::ArithTraits. Users must not rely on it existing, +/// Kokkos::ArithTraits. Users must not rely on it existing, /// or on its contents. This header file should not be /// installed with Kokkos' other header files. /// /// On the other hand, this header file does give examples of how to -/// use Kokkos::Details::ArithTraits, so it may be useful for users to +/// use Kokkos::ArithTraits, so it may be useful for users to /// read it. #ifndef KOKKOS_ARITHTRAITSTEST_HPP @@ -51,7 +51,7 @@ #endif namespace { -// Whether Kokkos::Details::ArithTraits implements +// Whether Kokkos::ArithTraits implements // transcendental functions. These include sqrt, pow, log, and // log10. template @@ -92,8 +92,8 @@ struct HasTranscendentals { } // namespace /// \class ArithTraitsTesterBase -/// \brief Base class providing tests for Kokkos::Details::ArithTraits -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Base class providing tests for Kokkos::ArithTraits +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization, and which can be executed on the parallel /// device. /// \tparam DeviceType A Kokkos parallel device type. @@ -107,8 +107,8 @@ struct HasTranscendentals { /// types. /// /// This class provides a Kokkos reduction operator for testing -/// Kokkos::Details::ArithTraits. This test works for any type -/// ScalarType for which Kokkos::Details::ArithTraits has a +/// Kokkos::ArithTraits. This test works for any type +/// ScalarType for which Kokkos::ArithTraits has a /// specialization, and which can be executed on the parallel device. 
/// /// The tests include those suitable for execution on the parallel @@ -162,7 +162,7 @@ class ArithTraitsTesterBase { KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // not using this argument int success = 1; @@ -273,7 +273,7 @@ class ArithTraitsTesterBase { /// /// \return \c 1 if all the tests pass, else \c 0. int testHost(std::ostream& out) const { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; using std::endl; int success = 1; @@ -378,7 +378,7 @@ class ArithTraitsTesterBase { /// \brief Base class of ArithTraitsTester that exercises /// transcendental functions, if and only if ArithTraits /// implements them. -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// implements transcendental functions, along with the requirements /// imposed by ArithTraitsTesterBase. /// \tparam DeviceType A Kokkos parallel device type. 
@@ -441,7 +441,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - // typedef Kokkos::Details::ArithTraits AT; + // typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -462,7 +462,7 @@ class ArithTraitsTesterTranscendentalBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - // typedef Kokkos::Details::ArithTraits AT; + // typedef Kokkos::ArithTraits AT; int success = 1; if (HasTranscendentals::value) { @@ -495,20 +495,16 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_INLINE_FUNCTION bool equal(const ScalarType& a, const ScalarType& b) const { - if (b != Kokkos::Details::ArithTraits::zero()) { + if (b != Kokkos::ArithTraits::zero()) { if (a > b) - return (a - b) / b < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (a - b) / b < 2 * Kokkos::ArithTraits::epsilon(); else - return (b - a) / b < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (b - a) / b < 2 * Kokkos::ArithTraits::epsilon(); } else { if (a > b) - return (a - b) < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (a - b) < 2 * Kokkos::ArithTraits::epsilon(); else - return (b - a) < - 2 * Kokkos::Details::ArithTraits::epsilon(); + return (b - a) < 2 * Kokkos::ArithTraits::epsilon(); } } @@ -524,7 +520,7 @@ class ArithTraitsTesterTranscendentalBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -733,7 +729,7 @@ class ArithTraitsTesterTranscendentalBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int success = 1; if (!HasTranscendentals::value) { @@ -946,7 +942,7 @@ class 
ArithTraitsTesterTranscendentalBase }; /// \class ArithTraitsTesterComplexBase -/// \brief Execute Kokkos::Details::ArithTraits tests relevant to +/// \brief Execute Kokkos::ArithTraits tests relevant to /// complex numbers (whether or not \c ScalarType is itself a /// complex-valued type). /// @@ -958,8 +954,7 @@ class ArithTraitsTesterTranscendentalBase /// complex, but the specific tests that are run will depend on /// ScalarType. template ::is_complex> + const int is_complex = Kokkos::ArithTraits::is_complex> class ArithTraitsTesterComplexBase : public ArithTraitsTesterTranscendentalBase { private: @@ -1009,7 +1004,7 @@ class ArithTraitsTesterComplexBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1048,7 +1043,7 @@ class ArithTraitsTesterComplexBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int success = 1; // Apparently, std::numeric_limits::is_signed is 1 only for real @@ -1095,7 +1090,7 @@ class ArithTraitsTesterComplexBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1103,7 +1098,7 @@ class ArithTraitsTesterComplexBase FAILURE(); } typedef typename AT::mag_type mag_type; - const mag_type one = Kokkos::Details::ArithTraits::one(); + const mag_type one = Kokkos::ArithTraits::one(); // This presumes that ScalarType, being a complex number, has a // constructor which takes two mag_type arguments. 
@@ -1129,7 +1124,7 @@ class ArithTraitsTesterComplexBase protected: virtual int testHostImpl(std::ostream& out) const { using std::endl; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; int success = 1; if (!AT::is_complex) { @@ -1137,7 +1132,7 @@ class ArithTraitsTesterComplexBase FAILURE(); } typedef typename AT::mag_type mag_type; - const mag_type one = Kokkos::Details::ArithTraits::one(); + const mag_type one = Kokkos::ArithTraits::one(); // This presumes that ScalarType, being a complex number, has a // constructor which takes two mag_type arguments. @@ -1173,7 +1168,7 @@ class ArithTraitsTesterComplexBase /// \tparam DeviceType A Kokkos parallel device type. /// /// Kokkos reduction operator for testing those attributes of -/// Kokkos::Details::ArithTraits relevant to floating-point types. +/// Kokkos::ArithTraits relevant to floating-point types. /// /// The tests include those suitable for execution on the parallel /// device (operator()) and those suitable for execution on the host @@ -1181,17 +1176,14 @@ class ArithTraitsTesterComplexBase /// executions of the test. All redundant executions must return /// '1' (passed). template ::is_exact> + const int is_exact = Kokkos::ArithTraits::is_exact> class ArithTraitsTesterFloatingPointBase : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> { + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { private: //! The base class of this class. typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> base_type; public: @@ -1217,13 +1209,11 @@ class ArithTraitsTesterFloatingPointBase template class ArithTraitsTesterFloatingPointBase : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> { + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { private: //! 
The base class of this class. typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> base_type; public: @@ -1238,7 +1228,7 @@ class ArithTraitsTesterFloatingPointBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1284,7 +1274,7 @@ class ArithTraitsTesterFloatingPointBase protected: virtual int testHostImpl(std::ostream& out) const { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; using std::endl; int success = 1; @@ -1338,13 +1328,11 @@ class ArithTraitsTesterFloatingPointBase template class ArithTraitsTesterFloatingPointBase : public ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> { + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> { private: //! The base class of this class. 
typedef ArithTraitsTesterComplexBase< - ScalarType, DeviceType, - Kokkos::Details::ArithTraits::is_complex> + ScalarType, DeviceType, Kokkos::ArithTraits::is_complex> base_type; public: @@ -1359,7 +1347,7 @@ class ArithTraitsTesterFloatingPointBase KOKKOS_INLINE_FUNCTION void operator()(size_type iwork, value_type& dst) const { TRACE(); - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; (void)iwork; // forestall compiler warning for unused variable int success = 1; @@ -1380,7 +1368,7 @@ class ArithTraitsTesterFloatingPointBase protected: virtual int testHostImpl(std::ostream& out) const { - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; using std::endl; int success = 1; @@ -1399,8 +1387,8 @@ class ArithTraitsTesterFloatingPointBase }; /// \class ArithTraitsTester -/// \brief Tests for Kokkos::Details::ArithTraits -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Tests for Kokkos::ArithTraits +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization, and which can be executed on the parallel /// device. /// \tparam DeviceType A Kokkos parallel device type. @@ -1415,9 +1403,9 @@ class ArithTraitsTesterFloatingPointBase /// for host functions do use run-time polymorphism. /// /// This class (through its base class) provides a Kokkos reduction -/// operator for testing Kokkos::Details::ArithTraits. This test +/// operator for testing Kokkos::ArithTraits. This test /// works for any type ScalarType for which -/// Kokkos::Details::ArithTraits has a specialization, and which can +/// Kokkos::ArithTraits has a specialization, and which can /// be executed on the parallel device. /// /// The tests include those suitable for execution on the parallel @@ -1438,8 +1426,8 @@ class ArithTraitsTester KOKKOS_INLINE_FUNCTION ArithTraitsTester() {} }; -/// \brief Run the Kokkos::Details::ArithTraits tests on the parallel device. 
-/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Run the Kokkos::ArithTraits tests on the parallel device. +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization, and which can be executed on the parallel /// device. /// \tparam DeviceType A Kokkos parallel device type. @@ -1457,17 +1445,15 @@ int testArithTraitsOnDevice(std::ostream& out, const int verbose) { functor_type(), success); if (success) { if (verbose) - out << Kokkos::Details::ArithTraits::name() << " passed" - << endl; + out << Kokkos::ArithTraits::name() << " passed" << endl; } else { - out << Kokkos::Details::ArithTraits::name() << " FAILED" - << endl; + out << Kokkos::ArithTraits::name() << " FAILED" << endl; } return success; } -/// \brief Run the Kokkos::Details::ArithTraits tests on the host. -/// \tparam ScalarType Any type for which Kokkos::Details::ArithTraits +/// \brief Run the Kokkos::ArithTraits tests on the host. +/// \tparam ScalarType Any type for which Kokkos::ArithTraits /// has a specialization. /// \tparam DeviceType A Kokkos parallel device type. /// @@ -1482,16 +1468,14 @@ int testArithTraitsOnHost(std::ostream& out, const int verbose) { if (localSuccess) { if (verbose) - out << Kokkos::Details::ArithTraits::name() << " passed" - << endl; + out << Kokkos::ArithTraits::name() << " passed" << endl; } else { - out << Kokkos::Details::ArithTraits::name() << " FAILED" - << endl; + out << Kokkos::ArithTraits::name() << " FAILED" << endl; } return localSuccess; } -/// \brief Run the Kokkos::Details::ArithTraits tests for all (valid) +/// \brief Run the Kokkos::ArithTraits tests for all (valid) /// scalar types, on the given parallel device. /// \tparam DeviceType A Kokkos parallel device type. 
/// @@ -1586,7 +1570,7 @@ int runAllArithTraitsDeviceTests(std::ostream& out, const int verbose) { return success && curSuccess; } -/// \brief Run the Kokkos::Details::ArithTraits tests for all scalar +/// \brief Run the Kokkos::ArithTraits tests for all scalar /// types, on the host. /// \tparam DeviceType A Kokkos parallel device type. /// diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp index 057902b6f2..b543ddaad6 100644 --- a/example/batched_solve/team_GMRES.cpp +++ b/example/batched_solve/team_GMRES.cpp @@ -236,8 +236,7 @@ int main(int /*argc*/, char ** /*argv*/) { using Layout = typename AMatrixValueView::array_layout; using EXSP = typename AMatrixValueView::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp index 5d6bf72450..314439b6c0 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -245,7 +245,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Kokkos::Timer timer; /// diff --git a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp index 950674e39e..3f15ca0b2d 100644 --- a/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/dense/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -178,7 +178,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; 
Kokkos::Timer timer; /// diff --git a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp index 40c0ad8f3d..5bf6061fe4 100644 --- a/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp +++ b/perf_test/batched/sparse/CG/KokkosBatched_Test_CG.cpp @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Kokkos::Timer timer; /// @@ -220,8 +220,7 @@ int main(int argc, char *argv[]) { using Layout = typename AMatrixValueViewLL::array_layout; using EXSP = typename AMatrixValueViewLL::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp index 6c6e88b8e0..c0ce8f0bd4 100644 --- a/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp +++ b/perf_test/batched/sparse/GMRES/KokkosBatched_Test_GMRES.cpp @@ -50,7 +50,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; /// /// input arguments parsing @@ -250,8 +250,7 @@ int main(int argc, char *argv[]) { using Layout = typename AMatrixValueViewLL::array_layout; using EXSP = typename AMatrixValueViewLL::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; + using MagnitudeType = typename Kokkos::ArithTraits::mag_type; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp index 35efd40a16..17b8ad6d3e 100644 --- 
a/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_SPMV_View.hpp @@ -23,7 +23,7 @@ struct BSPMV_Functor_View { typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; typedef typename AMatrix::non_const_value_type entries_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type* alpha; const AMatrix m_A_values; diff --git a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp index 81d828c51d..06ea55e303 100644 --- a/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp +++ b/perf_test/batched/sparse/SPMV/KokkosBatched_Test_SPMV.cpp @@ -126,7 +126,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; Kokkos::Timer timer; /// diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp index 546cc84cab..2294c23805 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverDn.cpp @@ -163,7 +163,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; /// /// input arguments parsing diff --git a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp index d1a21b3053..808e235edc 100644 --- a/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp +++ b/perf_test/batched/sparse/cusolver/KokkosBatched_Test_cusolverSp.cpp @@ -381,7 +381,7 @@ int main(int argc, char *argv[]) { #endif Kokkos::print_configuration(std::cout); - // typedef 
Kokkos::Details::ArithTraits ats; + // typedef Kokkos::ArithTraits ats; /// /// input arguments parsing diff --git a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp index c4deec656f..65120a8827 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_aux.hpp @@ -66,7 +66,7 @@ bool check_errors(mag_t tol, crsmat_t &Mtx, scalar_view_t rhs, using lno_t = typename entries_view_t::non_const_value_type; using values_view_t = typename crsmat_t::values_type::non_const_type; using scalar_t = typename values_view_t::value_type; - using STS = Kokkos::Details::ArithTraits; + using STS = Kokkos::ArithTraits; using execution_space = typename scalar_view_t::execution_space; diff --git a/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp b/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp index 87afbba79a..5de4e6be00 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_cholmod.cpp @@ -153,7 +153,7 @@ template int test_sptrsv_perf(std::vector tests, std::string &filename, bool u_in_csr, bool invert_diag, bool invert_offdiag, int block_size, int loop) { - using STS = Kokkos::Details::ArithTraits; + using STS = Kokkos::ArithTraits; using mag_type = typename STS::mag_type; // using cholmod_int_type = long; diff --git a/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp b/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp index cc9d698554..659874a32c 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_superlu.cpp @@ -308,7 +308,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, int relax_size, int block_size, int loop) { using ordinal_type = int; using size_type = int; - using STS = Kokkos::Details::ArithTraits; + using STS = Kokkos::ArithTraits; using mag_type = typename STS::mag_type; // Default spaces diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp 
b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index 7ddd6957a9..7301d5e741 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -52,7 +52,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, bool invert_offdiag, bool u_in_csr, int loop) { using ordinal_type = int; using size_type = int; - using STS = Kokkos::Details::ArithTraits; + using STS = Kokkos::ArithTraits; using mag_type = typename STS::mag_type; // Default spaces diff --git a/perf_test/sparse/spmv/Kokkos_SPMV.hpp b/perf_test/sparse/spmv/Kokkos_SPMV.hpp index a79e49b764..6668511c4a 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV.hpp @@ -25,7 +25,7 @@ struct SPMV_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type alpha; AMatrix m_A; diff --git a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp index 14ab6f8ebe..4e099e6f96 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp @@ -28,7 +28,7 @@ struct SPMV_Inspector_Functor { typedef typename AMatrix::non_const_size_type size_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type alpha; AMatrix m_A; diff --git a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index ec6cb6f02d..501e71e3e7 100644 --- a/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -247,16 +247,16 @@ class ClusterGaussSeidel { nnz_scalar_t _omega; - Team_PSGS( - 
const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, - const_scalar_nnz_view_t adj_vals_, x_value_array_type Xvector_, - y_value_array_type Yvector_, nnz_lno_t color_set_begin_, - nnz_lno_t color_set_end_, nnz_lno_persistent_work_view_t color_adj_, - nnz_lno_persistent_work_view_t cluster_offsets_, - nnz_lno_persistent_work_view_t cluster_verts_, - scalar_persistent_work_view_t inverse_diagonal_, - nnz_lno_t clusters_per_team_, - nnz_scalar_t omega_ = Kokkos::Details::ArithTraits::one()) + Team_PSGS(const_lno_row_view_t xadj_, const_lno_nnz_view_t adj_, + const_scalar_nnz_view_t adj_vals_, x_value_array_type Xvector_, + y_value_array_type Yvector_, nnz_lno_t color_set_begin_, + nnz_lno_t color_set_end_, + nnz_lno_persistent_work_view_t color_adj_, + nnz_lno_persistent_work_view_t cluster_offsets_, + nnz_lno_persistent_work_view_t cluster_verts_, + scalar_persistent_work_view_t inverse_diagonal_, + nnz_lno_t clusters_per_team_, + nnz_scalar_t omega_ = Kokkos::ArithTraits::one()) : _xadj(xadj_), _adj(adj_), _adj_vals(adj_vals_), @@ -691,7 +691,7 @@ class ClusterGaussSeidel { _diagonals(diagonals_), num_total_rows(num_total_rows_), rows_per_team(rows_per_team_), - one(Kokkos::Details::ArithTraits::one()) {} + one(Kokkos::ArithTraits::one()) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t row_id) const { @@ -781,12 +781,12 @@ class ClusterGaussSeidel { } template - void apply( - x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, - bool init_zero_x_vector = false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - bool apply_forward = true, bool apply_backward = true, - bool /*update_y_vector*/ = true) { + void apply(x_value_array_type x_lhs_output_vec, + y_value_array_type y_rhs_input_vec, + bool init_zero_x_vector = false, int numIter = 1, + nnz_scalar_t omega = Kokkos::ArithTraits::one(), + bool apply_forward = true, bool apply_backward = true, + bool /*update_y_vector*/ = true) { auto gsHandle = 
get_gs_handle(); size_type nnz = entries.extent(0); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 75f827a84d..e4cfb4b047 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -260,21 +260,20 @@ class PointGaussSeidel { // long rows. nnz_lno_t _long_row_par; - Team_PSGS( - row_lno_persistent_work_view_t xadj_, - nnz_lno_persistent_work_view_t adj_, - scalar_persistent_work_view_t adj_vals_, - scalar_persistent_work_view2d_t Xvector_, - scalar_persistent_work_view2d_t Yvector_, nnz_lno_t color_set_begin, - nnz_lno_t color_set_end, - scalar_persistent_work_view_t permuted_inverse_diagonal_, - pool_memory_space pms, nnz_lno_t _num_max_vals_in_l1 = 0, - nnz_lno_t _num_max_vals_in_l2 = 0, - nnz_scalar_t omega_ = Kokkos::Details::ArithTraits::one(), - - nnz_lno_t block_size_ = 1, nnz_lno_t team_work_size_ = 1, - size_t shared_memory_size_ = 16, int suggested_team_size_ = 1, - int vector_size_ = 1) + Team_PSGS(row_lno_persistent_work_view_t xadj_, + nnz_lno_persistent_work_view_t adj_, + scalar_persistent_work_view_t adj_vals_, + scalar_persistent_work_view2d_t Xvector_, + scalar_persistent_work_view2d_t Yvector_, + nnz_lno_t color_set_begin, nnz_lno_t color_set_end, + scalar_persistent_work_view_t permuted_inverse_diagonal_, + pool_memory_space pms, nnz_lno_t _num_max_vals_in_l1 = 0, + nnz_lno_t _num_max_vals_in_l2 = 0, + nnz_scalar_t omega_ = Kokkos::ArithTraits::one(), + + nnz_lno_t block_size_ = 1, nnz_lno_t team_work_size_ = 1, + size_t shared_memory_size_ = 16, int suggested_team_size_ = 1, + int vector_size_ = 1) : _xadj(xadj_), _adj(adj_), _adj_vals(adj_vals_), @@ -1283,7 +1282,7 @@ class PointGaussSeidel { rows_per_team(rows_per_team_), block_size(block_size_), block_matrix_size(block_matrix_size_), - one(Kokkos::Details::ArithTraits::one()) {} + one(Kokkos::ArithTraits::one()) {} KOKKOS_INLINE_FUNCTION void operator()(const nnz_lno_t& 
row_id) const { @@ -1489,7 +1488,7 @@ class PointGaussSeidel { void block_apply( x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, bool init_zero_x_vector = false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), + nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) { auto gsHandle = this->get_gs_handle(); @@ -1613,7 +1612,7 @@ class PointGaussSeidel { void point_apply( x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, bool init_zero_x_vector = false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), + nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) { auto gsHandle = get_gs_handle(); @@ -1690,12 +1689,12 @@ class PointGaussSeidel { } template - void apply( - x_value_array_type x_lhs_output_vec, y_value_array_type y_rhs_input_vec, - bool init_zero_x_vector = false, int numIter = 1, - nnz_scalar_t omega = Kokkos::Details::ArithTraits::one(), - bool apply_forward = true, bool apply_backward = true, - bool update_y_vector = true) { + void apply(x_value_array_type x_lhs_output_vec, + y_value_array_type y_rhs_input_vec, + bool init_zero_x_vector = false, int numIter = 1, + nnz_scalar_t omega = Kokkos::ArithTraits::one(), + bool apply_forward = true, bool apply_backward = true, + bool update_y_vector = true) { auto gsHandle = get_gs_handle(); if (gsHandle->is_numeric_called() == false) { this->initialize_numeric(); diff --git a/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp b/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp index b1ec07e768..91145335f5 100644 --- a/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp +++ b/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp @@ -75,7 +75,7 @@ struct CrsMatrixGetDiagCopyWithOffsetsFunctor { /// \param lclRow [in] The current (local) 
row of the sparse matrix. KOKKOS_INLINE_FUNCTION void operator()(const LO& lclRow) const { const offset_type INV = KokkosSparse::OrdinalTraits::invalid(); - const scalar_type ZERO = Kokkos::Details::ArithTraits::zero(); + const scalar_type ZERO = Kokkos::ArithTraits::zero(); // If the row lacks a stored diagonal entry, then its value is zero. D_(lclRow) = ZERO; diff --git a/sparse/impl/KokkosSparse_sor_sequential_impl.hpp b/sparse/impl/KokkosSparse_sor_sequential_impl.hpp index fd3b88fb4b..3ca5ee08bf 100644 --- a/sparse/impl/KokkosSparse_sor_sequential_impl.hpp +++ b/sparse/impl/KokkosSparse_sor_sequential_impl.hpp @@ -77,7 +77,7 @@ void gaussSeidel(const LocalOrdinal numRows, const LocalOrdinal numCols, const OffsetType b_stride, RangeScalar* const X, const OffsetType x_stride, const MatrixScalar* const D, const MatrixScalar omega, const char direction[]) { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; typedef LocalOrdinal LO; const OffsetType theNumRows = static_cast(numRows); const OffsetType theNumCols = static_cast(numCols); @@ -247,7 +247,7 @@ void reorderedGaussSeidel( const MatrixScalar* const D, const LocalOrdinal* const rowInd, const LocalOrdinal numRowInds, // length of rowInd const MatrixScalar omega, const char direction[]) { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; typedef LocalOrdinal LO; const OffsetType theNumRows = static_cast(numRows); const OffsetType theNumCols = static_cast(numCols); @@ -323,7 +323,7 @@ void reorderedGaussSeidel( for (LO ii = 0; ii < numRowInds; ++ii) { LO i = rowInd[ii]; for (OffsetType c = 0; c < theNumCols; ++c) { - x_temp[c] = Kokkos::Details::ArithTraits::zero(); + x_temp[c] = Kokkos::ArithTraits::zero(); } for (OffsetType k = ptr[i]; k < ptr[i + 1]; ++k) { const LO j = ind[k]; @@ -344,7 +344,7 @@ void reorderedGaussSeidel( for (LO ii = numRowInds - 1; ii != 0; --ii) { LO i = rowInd[ii]; for (OffsetType c = 0; c < theNumCols; ++c) { - x_temp[c] = 
Kokkos::Details::ArithTraits::zero(); + x_temp[c] = Kokkos::ArithTraits::zero(); } for (OffsetType k = ptr[i]; k < ptr[i + 1]; ++k) { const LO j = ind[k]; @@ -362,7 +362,7 @@ void reorderedGaussSeidel( const LO ii = 0; LO i = rowInd[ii]; for (OffsetType c = 0; c < theNumCols; ++c) { - x_temp[c] = Kokkos::Details::ArithTraits::zero(); + x_temp[c] = Kokkos::ArithTraits::zero(); } for (OffsetType k = ptr[i]; k < ptr[i + 1]; ++k) { const LO j = ind[k]; diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 1cd4241eae..abf44589f7 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -528,7 +528,7 @@ struct BSR_GEMV_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. typedef typename AMatrix::non_const_ordinal_type ordinal_type; @@ -816,7 +816,7 @@ struct BSR_GEMV_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. typedef typename AMatrix::non_const_ordinal_type ordinal_type; @@ -1143,7 +1143,7 @@ struct BSR_GEMM_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. 
typedef typename AMatrix::non_const_ordinal_type ordinal_type; @@ -1449,7 +1449,7 @@ struct BSR_GEMM_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; //! Nonconst version of the type of column indices in the sparse matrix. typedef typename AMatrix::non_const_ordinal_type ordinal_type; diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 13b88b3271..ef5c2e0684 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -198,12 +198,9 @@ struct SPMV_MV_BSRMATRIX::is_complex) - method = Method::Fallback; - if (Kokkos::Details::ArithTraits::is_complex) - method = Method::Fallback; - if (Kokkos::Details::ArithTraits::is_complex) - method = Method::Fallback; + if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; + if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; + if (Kokkos::ArithTraits::is_complex) method = Method::Fallback; // can't use tensor cores outside GPU if (!KokkosKernels::Impl::kk_is_gpu_exec_space< typename AMatrix::execution_space>()) diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index e9eb301b23..6a82977e02 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -58,7 +58,7 @@ struct SPMV_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; typedef typename YVector::non_const_value_type coefficient_type; typedef typename YVector::non_const_value_type y_value_type; @@ -118,7 +118,7 @@ struct SPMV_Functor 
{ typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const value_type alpha; AMatrix m_A; @@ -515,7 +515,7 @@ static void spmv_beta_transpose(typename YVector::const_value_type& alpha, if (execution_space().concurrency() == 1) { /// serial impl typedef typename AMatrix::non_const_value_type value_type; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; const size_type* KOKKOS_RESTRICT row_map_ptr = A.graph.row_map.data(); const ordinal_type* KOKKOS_RESTRICT col_idx_ptr = A.graph.entries.data(); const value_type* KOKKOS_RESTRICT values_ptr = A.values.data(); @@ -701,8 +701,7 @@ struct SPMV_MV_Transpose_Functor { for (ordinal_type iEntry = 0; iEntry < row_length; iEntry++) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) + conjugate ? Kokkos::ArithTraits::conj(row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); @@ -744,10 +743,9 @@ struct SPMV_MV_Transpose_Functor { Kokkos::ThreadVectorRange(dev, row_length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate - ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate ? Kokkos::ArithTraits::conj( + row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); if (doalpha != 1) { @@ -821,7 +819,7 @@ struct SPMV_MV_LayoutLeft_Functor { #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero(); + sum[k] = Kokkos::ArithTraits::zero(); } const auto row = m_A.rowConst(iRow); @@ -834,9 +832,9 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::parallel_for( Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate ? 
Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll @@ -911,7 +909,7 @@ struct SPMV_MV_LayoutLeft_Functor { #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero(); + sum[k] = Kokkos::ArithTraits::zero(); } const auto row = m_A.rowConst(iRow); @@ -923,8 +921,7 @@ struct SPMV_MV_LayoutLeft_Functor { for (ordinal_type iEntry = 0; iEntry < row.length; iEntry++) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) + conjugate ? Kokkos::ArithTraits::conj(row.value(iEntry)) : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -968,9 +965,9 @@ struct SPMV_MV_LayoutLeft_Functor { Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); lsum += val * m_x(row.colidx(iEntry), 0); }, sum); @@ -1004,8 +1001,7 @@ struct SPMV_MV_LayoutLeft_Functor { y_value_type sum = y_value_type(); for (ordinal_type iEntry = 0; iEntry < row.length; iEntry++) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) + conjugate ? 
Kokkos::ArithTraits::conj(row.value(iEntry)) : row.value(iEntry); sum += val * m_x(row.colidx(iEntry), 0); } @@ -1488,7 +1484,7 @@ void spmv_alpha_mv(const char mode[], const typename YVector::non_const_value_type& beta, const YVector& y) { typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { spmv_alpha_beta_mv(mode, alpha, A, x, diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index d196265b23..329e7b93e3 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -200,7 +200,7 @@ struct SPMV KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { if (beta != KAT::one()) { @@ -240,7 +240,7 @@ struct SPMV_MV KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { spmv_alpha_mv(mode, alpha, A, x, beta, y); diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index ac3da5e45f..8f217e05aa 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -37,7 +37,7 @@ struct SPMV_Struct_Transpose_Functor { typedef typename AMatrix::non_const_value_type value_type; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; typedef typename YVector::non_const_value_type coefficient_type; typedef typename YVector::non_const_value_type y_value_type; @@ -102,7 +102,7 @@ struct SPMV_Struct_Functor { typedef typename KokkosSparse::SparseRowViewConst row_view_const; typedef typename Kokkos::TeamPolicy team_policy; typedef typename team_policy::member_type team_member; - typedef Kokkos::Details::ArithTraits ATV; + typedef Kokkos::ArithTraits ATV; typedef Kokkos::View > shared_ordinal_1d; @@ -979,10 +979,9 @@ struct 
SPMV_MV_Struct_Transpose_Functor { Kokkos::ThreadVectorRange(dev, row_length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate - ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate ? Kokkos::ArithTraits::conj( + row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); if (doalpha != 1) { @@ -1054,7 +1053,7 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { #pragma unroll #endif for (int k = 0; k < UNROLL; ++k) { - sum[k] = Kokkos::Details::ArithTraits::zero(); + sum[k] = Kokkos::ArithTraits::zero(); } const auto row = m_A.rowConst(iRow); @@ -1062,9 +1061,9 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { Kokkos::parallel_for( Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); const ordinal_type ind = row.colidx(iEntry); #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL @@ -1139,9 +1138,9 @@ struct SPMV_MV_Struct_LayoutLeft_Functor { Kokkos::ThreadVectorRange(dev, row.length), [&](ordinal_type iEntry, y_value_type& lsum) { const A_value_type val = - conjugate ? Kokkos::Details::ArithTraits::conj( - row.value(iEntry)) - : row.value(iEntry); + conjugate + ? 
Kokkos::ArithTraits::conj(row.value(iEntry)) + : row.value(iEntry); lsum += val * m_x(row.colidx(iEntry), 0); }, sum); @@ -1465,7 +1464,7 @@ void spmv_alpha_mv_struct(const char mode[], const typename YVector::non_const_value_type& beta, const YVector& y) { typedef typename YVector::non_const_value_type coefficient_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; if (beta == KAT::zero()) { spmv_alpha_beta_mv_struct( diff --git a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp index fde9bf4dcf..7ade8e2536 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp @@ -201,9 +201,9 @@ struct SPMV_STRUCT& structure, const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { if (beta != KAT::one()) { @@ -242,7 +242,7 @@ struct SPMV_MV_STRUCT KAT; + typedef Kokkos::ArithTraits KAT; if (alpha == KAT::zero()) { spmv_alpha_mv_struct(mode, alpha, A, x, diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index f87ea5da82..fbbd547e34 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -72,7 +72,7 @@ void lowerTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); // const local_ordinal_type numCols = A.numCols (); @@ -190,7 +190,7 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, typename CrsMatrixType::row_map_type ptr = A.graph.row_map; typename 
CrsMatrixType::index_type ind = A.graph.entries; typename CrsMatrixType::values_type val = A.values; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; // If local_ordinal_type is unsigned and numRows is 0, the loop // below will have entirely the wrong number of iterations. @@ -425,7 +425,7 @@ void upperTriSolveCscUnitDiagConj(RangeMultiVectorType X, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); @@ -486,7 +486,7 @@ void upperTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); @@ -600,7 +600,7 @@ void lowerTriSolveCscUnitDiagConj(RangeMultiVectorType X, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); @@ -638,7 +638,7 @@ void lowerTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, local_ordinal_type; typedef typename CrsMatrixType::values_type::non_const_value_type matrix_scalar_type; - typedef Kokkos::Details::ArithTraits STS; + typedef Kokkos::ArithTraits STS; const local_ordinal_type numRows = A.numRows(); const local_ordinal_type numCols = A.numCols(); diff --git a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index a64d7f76a0..00fdcd2442 100644 --- a/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp 
+++ b/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -82,7 +82,7 @@ class TwostageGaussSeidel { using internal_vector_view_t = typename TwoStageGaussSeidelHandleType::vector_view_t; - using ST = Kokkos::Details::ArithTraits; + using ST = Kokkos::ArithTraits; using mag_t = typename ST::mag_type; private: @@ -407,7 +407,7 @@ class TwostageGaussSeidel { // functor for storing both valuesL & valuesU (with parallel_for) KOKKOS_INLINE_FUNCTION void operator()(const Tag_valuesLU &, const ordinal_t i) const { - const_scalar_t one = Kokkos::Details::ArithTraits::one(); + const_scalar_t one = Kokkos::ArithTraits::one(); ordinal_t nnzL = row_map(i); ordinal_t nnzU = row_map2(i); ordinal_t nnzLa = 0; @@ -851,8 +851,8 @@ class TwostageGaussSeidel { bool init_zero_x_vector = false, int numIter = 1, scalar_t omega = ST::one(), bool apply_forward = true, bool apply_backward = true, bool /*update_y_vector*/ = true) { - const_scalar_t one = Kokkos::Details::ArithTraits::one(); - const_scalar_t zero = Kokkos::Details::ArithTraits::zero(); + const_scalar_t one = Kokkos::ArithTraits::one(); + const_scalar_t zero = Kokkos::ArithTraits::zero(); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS double tic; Kokkos::Timer timer; diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index ea4e50a8fe..2765bc116d 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -156,12 +156,12 @@ struct BsrRowView { } /// \brief Return offset into colidx_ for the requested block idx - /// If none found, return Kokkos::Details::ArithTraits::max + /// If none found, return Kokkos::ArithTraits::max /// \param idx_to_match [in] local block idx within block-row KOKKOS_INLINE_FUNCTION ordinal_type findRelBlockOffset(const ordinal_type idx_to_match, bool /*is_sorted*/ = false) const { - ordinal_type offset = Kokkos::Details::ArithTraits::max(); + ordinal_type offset = Kokkos::ArithTraits::max(); for (ordinal_type blk_offset = 0; 
blk_offset < length; ++blk_offset) { ordinal_type idx = colidx_[blk_offset]; if (idx == idx_to_match) { @@ -292,14 +292,14 @@ struct BsrRowViewConst { } /// \brief Return offset into colidx_ for the requested block idx - /// If none found, return Kokkos::Details::ArithTraits::max + /// If none found, return Kokkos::ArithTraits::max /// \param idx_to_match [in] local block idx within block-row KOKKOS_INLINE_FUNCTION ordinal_type findRelBlockOffset(const ordinal_type& idx_to_match, bool /*is_sorted*/ = false) const { typedef typename std::remove_cv::type non_const_ordinal_type; non_const_ordinal_type offset = - Kokkos::Details::ArithTraits::max(); + Kokkos::ArithTraits::max(); for (non_const_ordinal_type blk_offset = 0; blk_offset < length; ++blk_offset) { ordinal_type idx = colidx_[blk_offset]; @@ -979,7 +979,7 @@ class BsrMatrix { // + 1] (not global offset) colidx_ and values_ are already offset to the // beginning of blockrow rowi auto blk_offset = row_view.findRelBlockOffset(cols[i], is_sorted); - if (blk_offset != Kokkos::Details::ArithTraits::max()) { + if (blk_offset != Kokkos::ArithTraits::max()) { ordinal_type offset_into_vals = i * block_size * block_size; // stride == 1 assumed between elements diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index 77934b4f3e..c5f024f4f6 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -177,8 +177,7 @@ void kk_diagonally_dominant_sparseMatrix_generate( entriesInRow.insert(pos); colInd[k] = pos; values[k] = 100.0 * rand() / RAND_MAX - 50.0; - total_values += - Kokkos::Details::ArithTraits::abs(values[k]); + total_values += Kokkos::ArithTraits::abs(values[k]); break; } } diff --git a/sparse/unit_test/Test_Sparse_Utils.hpp b/sparse/unit_test/Test_Sparse_Utils.hpp index 73320e9358..cbd81e9b08 100644 --- a/sparse/unit_test/Test_Sparse_Utils.hpp +++ b/sparse/unit_test/Test_Sparse_Utils.hpp @@ -118,7 +118,7 @@ bool is_same_matrix(crsMat_t 
output_mat_actual, crsMat_t output_mat_reference) { return false; } - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; eps_type eps = std::is_same::value ? 3.7e-3 : 1e-7; diff --git a/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp index 4c445f439f..11830e0224 100644 --- a/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp @@ -75,7 +75,7 @@ int run_block_gauss_seidel_1( GSApplyType apply_type = Test::symmetric, bool skip_symbolic = false, bool skip_numeric = false, size_t shmem_size = 32128, typename mtx_t::value_type omega = - Kokkos::Details::ArithTraits::one()) { + Kokkos::ArithTraits::one()) { typedef typename mtx_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type lno_view_t; typedef typename graph_t::entries_type lno_nnz_view_t; @@ -156,7 +156,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_view_t; typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; @@ -243,7 +243,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; typedef Kokkos::View scalar_view2d_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; @@ -289,8 +289,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, for (lno_t j = 0; j < nv; j++) { sum += solution_host(j, i) * solution_host(j, i); } - initial_norms[i] = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(sum)); + initial_norms[i] = Kokkos::ArithTraits::sqrt( + 
Kokkos::ArithTraits::abs(sum)); } for (const auto gs_algorithm : params.gs_algorithms) { @@ -322,8 +322,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, scalar_t diff = x_host(r, c) - solution_host(r, c); sum += diff * diff; } - mag_t result_res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(sum)); + mag_t result_res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(sum)); EXPECT_LT(result_res, params.tolerance * initial_norms[c]); } } diff --git a/sparse/unit_test/Test_Sparse_bspgemm.hpp b/sparse/unit_test/Test_Sparse_bspgemm.hpp index b760e7e69c..58a2a18b8a 100644 --- a/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -123,7 +123,7 @@ bool is_same_block_matrix(bsrMat_t output_mat_actual, return false; } - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; eps_type eps = std::is_same::value ? 
3e-2 : 5e-7; diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 7960a1a9bc..358205b713 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -153,7 +153,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; srand(245); lno_t numCols = numRows; crsMat_t input_mat = @@ -177,8 +177,8 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, int apply_count = 3; // test symmetric, forward, backward scalar_view_t x_vector( Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); - const scalar_t one = Kokkos::Details::ArithTraits::one(); - const scalar_t zero = Kokkos::Details::ArithTraits::zero(); + const scalar_t one = Kokkos::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; @@ -242,7 +242,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, typedef Kokkos::View scalar_view2d_t; typedef Kokkos::View host_scalar_view2d_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; crsMat_t input_mat = @@ -270,11 +270,11 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, for (lno_t j = 0; j < nv; j++) { sum += solution_x(j, i) * solution_x(j, i); } - initial_norms[i] = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(sum)); + initial_norms[i] = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(sum)); } int apply_count = 3; // test symmetric, forward, 
backward - const scalar_t zero = Kokkos::Details::ArithTraits::zero(); + const scalar_t zero = Kokkos::ArithTraits::zero(); //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; @@ -289,8 +289,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -312,8 +312,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -332,8 +332,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -350,8 +350,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, scalar_t diff = x_host(j, i) - solution_x(j, i); diffDot += diff * diff; } - mag_t res = Kokkos::Details::ArithTraits::sqrt( - Kokkos::Details::ArithTraits::abs(diffDot)); + mag_t res = Kokkos::ArithTraits::sqrt( + Kokkos::ArithTraits::abs(diffDot)); EXPECT_LT(res, initial_norms[i]); } } @@ -361,8 +361,8 @@ template void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { - const scalar_t zero = Kokkos::Details::ArithTraits::zero(); - const scalar_t one = 
Kokkos::Details::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); + const scalar_t one = Kokkos::ArithTraits::one(); srand(245); typedef typename device::execution_space exec_space; typedef @@ -419,10 +419,9 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, // Copy solution back Kokkos::deep_copy(x, x_host); // Check against gold solution - scalar_t xSq = KokkosBlas::dot(x, x); - scalar_t solnDot = KokkosBlas::dot(x, xgold); - double scaledSolutionDot = - Kokkos::Details::ArithTraits::abs(solnDot / xSq); + scalar_t xSq = KokkosBlas::dot(x, x); + scalar_t solnDot = KokkosBlas::dot(x, xgold); + double scaledSolutionDot = Kokkos::ArithTraits::abs(solnDot / xSq); EXPECT_TRUE(0.99 < scaledSolutionDot); } @@ -533,7 +532,7 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename crsMat_t::index_type::non_const_type entries_view_t; typedef typename crsMat_t::row_map_type::non_const_type rowmap_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; const scalar_t one = Kokkos::ArithTraits::one(); srand(245); std::vector rowmap = {0}; @@ -630,7 +629,7 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename Kokkos::Details::ArithTraits::mag_type mag_t; + typedef typename Kokkos::ArithTraits::mag_type mag_t; const scalar_t one = Kokkos::ArithTraits::one(); size_type nnz = nnzPerRow * numRows; crsMat_t input_mat = diff --git a/sparse/unit_test/Test_Sparse_replaceSumInto.hpp b/sparse/unit_test/Test_Sparse_replaceSumInto.hpp index af61aeb320..f8427dc925 100644 --- a/sparse/unit_test/Test_Sparse_replaceSumInto.hpp +++ b/sparse/unit_test/Test_Sparse_replaceSumInto.hpp @@ -50,7 +50,7 @@ class ModifyEvenNumberedRows 
{ ordinal_type cols[1]; value_type vals[1]; - const value_type ONE = Kokkos::Details::ArithTraits::one(); + const value_type ONE = Kokkos::ArithTraits::one(); const value_type THREE = ONE + ONE + ONE; cols[0] = lclRow; @@ -97,7 +97,7 @@ bool checkWhetherEvenNumberedRowsWereModified(const CrsMatrixType& A, typedef typename CrsMatrixType::value_type SC; typedef typename CrsMatrixType::ordinal_type LO; - const SC ONE = Kokkos::Details::ArithTraits::one(); + const SC ONE = Kokkos::ArithTraits::one(); const SC TWO = ONE + ONE; const SC THREE = ONE + ONE + ONE; @@ -135,7 +135,7 @@ void testOneCase(bool& /*success*/, // Teuchos::FancyOStream& out, std::ostream& out, const CrsMatrixType& A, const bool replace, const bool sorted, const bool atomic) { - using Kokkos::Details::ArithTraits; + using Kokkos::ArithTraits; typedef typename CrsMatrixType::value_type value_type; // Teuchos::OSTab tab0 (out); diff --git a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp index 76bbfe37a9..98affff57d 100644 --- a/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp +++ b/sparse/unit_test/Test_Sparse_replaceSumIntoLonger.hpp @@ -47,7 +47,7 @@ class ModifyEntries { KOKKOS_FUNCTION void operator()(const ordinal_type& lclRow, ordinal_type& numModified) const { - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; typedef typename KAT::mag_type mag_type; const scalar_type ONE = KAT::one(); @@ -171,7 +171,7 @@ void checkWhetherEntriesWereModified( // using Teuchos::RCP; typedef typename CrsMatrixType::value_type value_type; typedef typename CrsMatrixType::ordinal_type ordinal_type; - typedef Kokkos::Details::ArithTraits KAT; + typedef Kokkos::ArithTraits KAT; // If debug is false, we capture all output in an // std::ostringstream, and don't print it unless the test fails @@ -281,7 +281,7 @@ void testOneCaseImpl(bool& /*success*/, std::ostream& out, // Restore original values. 
auto val_h = Kokkos::create_mirror_view(A.values); - const scalar_type ONE = Kokkos::Details::ArithTraits::one(); + const scalar_type ONE = Kokkos::ArithTraits::one(); scalar_type curVal = ONE; for (ordinal_type k = 0; k < A.numCols(); ++k, curVal += ONE) { val_h[k] = curVal; @@ -388,7 +388,7 @@ void testAllSizes(bool& success, typedef typename matrix_type::value_type value_type; typedef typename matrix_type::ordinal_type ordinal_type; typedef typename matrix_type::size_type size_type; - const value_type ONE = Kokkos::Details::ArithTraits::one(); + const value_type ONE = Kokkos::ArithTraits::one(); // Teuchos::OSTab tab0 (out); out << "maxNumEnt: " << maxNumEnt << endl; diff --git a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp index cfdf13a709..25a5d155a7 100644 --- a/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm_jacobi.hpp @@ -165,7 +165,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) { return false; } - typedef typename Kokkos::Details::ArithTraits< + typedef typename Kokkos::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; eps_type eps = std::is_same::value ? 
2 * 1e-3 : 1e-7; diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 12065781f1..840a4702a0 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -50,7 +50,7 @@ void run_test_spiluk() { typedef Kokkos::View RowMapType; typedef Kokkos::View EntriesType; typedef Kokkos::View ValuesType; - typedef Kokkos::Details::ArithTraits AT; + typedef Kokkos::ArithTraits AT; const size_type nrows = 9; const size_type nnz = 21; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 761c919aac..fe68d68d07 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -106,8 +106,8 @@ struct multivector_layout_adapter { template void EXPECT_NEAR_KK(Scalar1 val1, Scalar2 val2, Scalar3 tol, std::string msg = "") { - typedef Kokkos::Details::ArithTraits AT1; - typedef Kokkos::Details::ArithTraits AT3; + typedef Kokkos::ArithTraits AT1; + typedef Kokkos::ArithTraits AT3; EXPECT_LE((double)AT1::abs(val1 - val2), (double)AT3::abs(tol)) << msg; } @@ -116,8 +116,8 @@ void EXPECT_NEAR_KK_REL(Scalar1 val1, Scalar2 val2, Scalar3 tol, std::string msg = "") { typedef typename std::remove_reference::type hv1_type; typedef typename std::remove_reference::type hv2_type; - const auto ahv1 = Kokkos::Details::ArithTraits::abs(val1); - const auto ahv2 = Kokkos::Details::ArithTraits::abs(val2); + const auto ahv1 = Kokkos::ArithTraits::abs(val1); + const auto ahv2 = Kokkos::ArithTraits::abs(val2); EXPECT_NEAR_KK(val1, val2, tol * Kokkos::max(ahv1, ahv2), msg); } @@ -205,7 +205,7 @@ struct SharedVanillaGEMM { typedef Kokkos::View SubviewTypeB; - typedef Kokkos::Details::ArithTraits APT; + typedef Kokkos::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; ScalarC beta; From c773957165d1058794bde140ceea99b608c805a4 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Fri, 24 Mar 2023 08:03:35 -0600 Subject: 
[PATCH 167/442] Add calls to KokkosBlas Dot and Axpy for team batched kernels when m==1 --- .../dense/impl/KokkosBatched_Axpy_Impl.hpp | 19 +++ .../dense/impl/KokkosBatched_Copy_Impl.hpp | 142 ++++++++++++++++++ .../dense/impl/KokkosBatched_Dot_Internal.hpp | 36 +++++ .../unit_test/Test_Batched_TeamVectorQR.hpp | 2 +- ...atched_TeamVectorQR_WithColumnPivoting.hpp | 2 +- 5 files changed, 199 insertions(+), 2 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index 232ef5278c..beaef112f3 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -19,6 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_team_axpby.hpp" namespace KokkosBatched { @@ -177,6 +178,7 @@ struct TeamVectorAxpyInternal { /// /// Serial Impl /// =========== + template KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, const XViewType& X, @@ -212,6 +214,9 @@ KOKKOS_INLINE_FUNCTION int SerialAxpy::invoke(const alphaViewType& alpha, } #endif + // No need to check if X.extent(0)==1 in the serial case as we don't + // parallelize the kernel anyway. 
+ return SerialAxpyInternal::template invoke< typename alphaViewType::non_const_value_type, typename XViewType::non_const_value_type>( @@ -259,6 +264,13 @@ KOKKOS_INLINE_FUNCTION int TeamAxpy::invoke( } #endif + if (X.extent(0) == 1) { + KokkosBlas::Experimental::axpy( + member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamAxpyInternal::template invoke< MemberType, typename alphaViewType::non_const_value_type, typename XViewType::non_const_value_type>( @@ -307,6 +319,13 @@ KOKKOS_INLINE_FUNCTION int TeamVectorAxpy::invoke( } #endif + if (X.extent(0) == 1) { + KokkosBlas::Experimental::axpy( + member, alpha.data()[0], Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamVectorAxpyInternal::invoke< MemberType, typename alphaViewType::non_const_value_type, typename XViewType::non_const_value_type, diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index 110735ca13..2f0be4b661 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -47,6 +47,25 @@ template <> template KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); + return 1; + } +#endif return SerialCopyInternal::invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); @@ -56,6 +75,25 @@ template <> template KOKKOS_INLINE_FUNCTION int SerialCopy::invoke( const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x %d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), (int)B.extent(1)); + return 1; + } +#endif return SerialCopyInternal::invoke(A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); @@ -93,6 +131,32 @@ struct TeamCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(0) == 1) { + return TeamCopy::invoke( + member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); + } return TeamCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); @@ -105,6 +169,32 @@ struct TeamCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(1) == 1) { + return TeamCopy::invoke( + member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); + } return TeamCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); @@ -143,6 +233,32 @@ struct TeamVectorCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(0) == 1) { + return TeamVectorCopy::invoke( + member, Kokkos::subview(A, 0, Kokkos::ALL), + Kokkos::subview(B, 0, Kokkos::ALL)); + } return TeamVectorCopyInternal::invoke(member, A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_0(), B.stride_1()); @@ -155,6 +271,32 @@ struct TeamVectorCopy { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const AViewType &A, const BViewType &B) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::copy: BViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, + "KokkosBatched::copy: AViewType must have rank 2."); + static_assert(BViewType::rank == 2, + "KokkosBatched::copy: BViewType must have rank 2."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != B.extent(0) || A.extent(1) != B.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::copy: Dimensions of A and B do not match: A: %d x " + "%d, " + "B: %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)B.extent(0), + (int)B.extent(1)); + return 1; + } +#endif + if (A.extent(1) == 1) { + return TeamVectorCopy::invoke( + member, Kokkos::subview(A, Kokkos::ALL, 0), + Kokkos::subview(B, Kokkos::ALL, 0)); + } return TeamVectorCopyInternal::invoke(member, A.extent(1), A.extent(0), A.data(), A.stride_1(), A.stride_0(), B.data(), B.stride_0(), B.stride_1()); diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index c50da7a3d4..a6a7673e7b 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -19,6 +19,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_team_dot.hpp" namespace KokkosBatched { @@ -162,6 +163,7 @@ struct TeamVectorDotInternal { /// /// Serial Impl /// =========== + template <> struct SerialDot { template @@ -256,6 +258,7 @@ struct SerialDot { /// /// Team Impl /// =============== + template struct TeamDot { template @@ -295,6 +298,14 @@ struct TeamDot { return 1; } #endif + + if (X.extent(1) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, Kokkos::ALL, 0), + Kokkos::subview(Y, Kokkos::ALL, 0)); + return 0; + } + return TeamDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename NormViewType::non_const_value_type>( @@ -341,6 +352,14 @@ struct TeamDot { return 1; } #endif + + if (X.extent(0) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename 
NormViewType::non_const_value_type>( @@ -352,6 +371,7 @@ struct TeamDot { /// /// TeamVector Impl /// =============== + template struct TeamVectorDot { template @@ -391,6 +411,14 @@ struct TeamVectorDot { return 1; } #endif + + if (X.extent(1) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, Kokkos::ALL, 0), + Kokkos::subview(Y, Kokkos::ALL, 0)); + return 0; + } + return TeamVectorDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename NormViewType::non_const_value_type>( @@ -437,6 +465,14 @@ struct TeamVectorDot { return 1; } #endif + + if (X.extent(0) == 1) { + dot(0) = KokkosBlas::Experimental::dot( + member, Kokkos::subview(X, 0, Kokkos::ALL), + Kokkos::subview(Y, 0, Kokkos::ALL)); + return 0; + } + return TeamVectorDotInternal::template invoke< MemberType, typename XViewType::non_const_value_type, typename NormViewType::non_const_value_type>( diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp index d7e237094d..a591ef0ced 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR.hpp @@ -78,7 +78,7 @@ struct Functor_TestBatchedTeamVectorQR { member.team_barrier(); /// xx = bb; - TeamVectorCopy::invoke(member, bb, xx); + TeamVectorCopy::invoke(member, bb, xx); member.team_barrier(); /// xx = Q^{T}xx; diff --git a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 648ae43566..e15a435e8b 100644 --- a/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/batched/dense/unit_test/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -80,7 +80,7 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { member.team_barrier(); /// xx = bb; - TeamVectorCopy::invoke(member, bb, xx); + TeamVectorCopy::invoke(member, bb, xx); 
member.team_barrier(); /// xx = Q^{T} xx; From 6bcfac5bd7f526de4080663fa97f8ed75815dec1 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 28 Feb 2023 15:11:05 -0700 Subject: [PATCH 168/442] Adds team- and thread-based lower-bound and upper-bound search and predicates. --- common/src/KokkosKernels_Iota.hpp | 2 + common/src/KokkosKernels_LowerBound.hpp | 470 ++++++++++++++++++++ common/src/KokkosKernels_Predicates.hpp | 167 +++++++ common/src/KokkosKernels_UpperBound.hpp | 101 +++++ common/unit_test/Test_Common.hpp | 2 + common/unit_test/Test_Common_Iota.hpp | 9 + common/unit_test/Test_Common_LowerBound.hpp | 256 +++++++++++ common/unit_test/Test_Common_UpperBound.hpp | 245 ++++++++++ 8 files changed, 1252 insertions(+) create mode 100644 common/src/KokkosKernels_LowerBound.hpp create mode 100644 common/src/KokkosKernels_Predicates.hpp create mode 100644 common/src/KokkosKernels_UpperBound.hpp create mode 100644 common/unit_test/Test_Common_LowerBound.hpp create mode 100644 common/unit_test/Test_Common_UpperBound.hpp diff --git a/common/src/KokkosKernels_Iota.hpp b/common/src/KokkosKernels_Iota.hpp index c5d6a8dfac..5b7e24ca24 100644 --- a/common/src/KokkosKernels_Iota.hpp +++ b/common/src/KokkosKernels_Iota.hpp @@ -135,6 +135,8 @@ template struct is_iota> : public std::true_type {}; template struct is_iota> : public std::true_type {}; +template +inline constexpr bool is_iota_v = is_iota::value; } // namespace Impl } // namespace KokkosKernels diff --git a/common/src/KokkosKernels_LowerBound.hpp b/common/src/KokkosKernels_LowerBound.hpp new file mode 100644 index 0000000000..22df9545ef --- /dev/null +++ b/common/src/KokkosKernels_LowerBound.hpp @@ -0,0 +1,470 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_LOWERBOUND_HPP +#define _KOKKOSKERNELS_LOWERBOUND_HPP + +/*! \file KokkosKernels_LowerBound.hpp + Define thread and team-collaborative lower-bound search + + Lower-bound search takes a Kokkos::View, a search value, and a binary + predicate. + It returns an index to the first element of the view that does not + satisfy pred(element, value), or the size of the view if no such + element exists. + + All elements for which pred(element, value) is true must precede those + for which it is false. + + The default predicate is less-than, i.e. pred(a,b) = a < b. + In this case, lower-bound search returns the first index where the value is + >= the view entry. + + The type of the predicate function must be equivalent to the following: + \verbatim + bool operator(const T &a, const T&b); + \endverbatim + KokkosKernels_Predicates.hpp defines a variety of common predicates, + available in KokkosKernels namespace. + + Examples: + \verbatim + value = 3 + view = {0,1,2,3,4} + = {t,t,t,f,f} + result = 3 + + value = -1 + view = {0,1,2,3,4} + = {f,f,f,f,f} + result = 0 + + value = 5 + view = {0,1,2,3,4} + = {t,t,t,t,t} + result = 5 + + value = 1 + view = {0,1,1,1,2} + = {t,f,f,f,f} + result = 1 + \endverbatim + + Contrast with upper-bound, which returns first index for which pred(value, + element) is true + */ + +#include + +#include "KokkosKernels_Predicates.hpp" +#include "KokkosKernels_SimpleUtils.hpp" + +namespace KokkosKernels { +namespace Impl { + +/*! 
\brief Single-thread sequential lower-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(element, value) is false, + or view.size if no such element exists + + At most view.size() predicate function calls +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type +lower_bound_sequential_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + using size_type = typename ViewLike::size_type; + static_assert(1 == ViewLike::rank, + "lower_bound_sequential_thread requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_sequential_thread requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + size_type i = 0; + while (i < view.size() && pred(view(i), value)) { + ++i; + } + return i; +} + +/*! 
\brief Single-thread binary lower-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(element, value) is false, + or view.size if no such element exists + + At most log2(view.size()) + 1 predicate function calls +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_binary_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + using size_type = typename ViewLike::size_type; + static_assert(1 == ViewLike::rank, + "lower_bound_binary_thread requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_binary_thread requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + size_type lo = 0; + size_type hi = view.size(); + while (lo < hi) { + size_type mid = (lo + hi) / 2; + const auto &ve = view(mid); + if (pred(ve, value)) { // mid satisfies predicate, look in higher half not + // including mid + lo = mid + 1; + } else { + hi = mid; + } + } + return lo; +} + +} // namespace Impl + +/*! 
\brief single-thread lower-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(element, value) is false, + or view.size if no such element exists + + This minimizes the calls to predicate: + for view.size() >= 8, this does a binary search, otherwise, a linear search +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, + "lower_bound_thread requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_thread requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + /* + sequential search makes on average 0.5 * view.size memory accesses + binary search makes log2(view.size)+1 accesses + + log2(x) <= 0.5x roughly when x >= 8 + */ + if (view.size() >= 8) { + return Impl::lower_bound_binary_thread(view, value, pred); + } else { + return Impl::lower_bound_sequential_thread(view, value, pred); + } +} + +namespace Impl { + +/*! \brief Team-collaborative sequential lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Iota + \tparam Pred The type of the predicate function to call + + \param handle The Kokkos team handle + \param view The view-like to search + \param value The value to compare in the predicate + \param lo The first index to search + \param hi One-past the last index to search + \param pred Apply pred(view(i), value) + + \returns To all team members, the smallest i for which pred(view(i), value) + is false for i in [lo, hi), or hi if no such value + + Uses a single thread to call \c lower_bound_thread, and broadcasts that + to all team members. 
+*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_single_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + typename ViewLike::size_type idx; + Kokkos::single( + Kokkos::PerTeam(handle), + [&](typename ViewLike::size_type &lidx) { + lidx = KokkosKernels::lower_bound_thread(view, value, pred); + }, + idx); + return idx; +} + +/*! \brief Team-collaborative sequential lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Iota + \tparam Pred The type of the predicate function to call + + \param handle The Kokkos team handle + \param view The view-like to search + \param value The value to compare in the predicate + \param lo The first index to search + \param hi One-past the last index to search + \param pred Apply pred(view(i), value) + + \returns To all team members, the smallest i for which pred(view(i), value) + is false for i in [lo, hi), or hi if no such value + + Apply pred(view(i), value) for i in [lo, hi) +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, + typename ViewLike::size_type lo, typename ViewLike::size_type hi, + Pred pred = Pred()) { + using size_type = typename ViewLike::size_type; + static_assert(1 == ViewLike::rank, + "lower_bound_sequential_team requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_sequential_team requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + if (lo == hi) { + return hi; + } + size_type teamI; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(handle, lo, hi), + [&](const size_type &i, size_type &li) { + li = KOKKOSKERNELS_MACRO_MIN(li, hi); + if (i < li) { // no need to search higher than the smallest so far + if (!pred(view(i), 
value)) { // look for the smallest index that does + // not satisfy + li = i; + } + } + }, + Kokkos::Min(teamI)); + return teamI; +} + +/*! \brief Team-collaborative sequential lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Iota + \tparam Pred The type of the predicate function to call + + \param handle The Kokkos team handle + \param view The view-like to search + \param value The value to compare in the predicate + \param pred Apply pred(view(i), value) + + \returns To all team members, the smallest i for which pred(view(i), value) + is false or view.size() if no such value +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_sequential_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + return lower_bound_sequential_team(handle, view, value, 0, view.size(), pred); +} + +/*! \brief A range for the k-ary lower bound search + + The RangeReducer will maximize the lower bound and + minimize the upper bound +*/ +template +struct Range { + T lb; /// lower-bound + T ub; /// upper-bound + + KOKKOS_INLINE_FUNCTION + Range() { init(); } + + KOKKOS_INLINE_FUNCTION + constexpr Range(const T &_lb, const T &_ub) : lb(_lb), ub(_ub) {} + + KOKKOS_INLINE_FUNCTION + void init() { + lb = Kokkos::Experimental::finite_min_v; // will be max'd + ub = Kokkos::Experimental::finite_max_v; // will be min'd + } +}; + +/// \brief maximizes the lower bound, and minimizes the upper bound of a Range +template +struct RangeReducer { + using reducer = RangeReducer; + using value_type = Range; + using result_view_type = + Kokkos::View *, Space, Kokkos::MemoryUnmanaged>; + + private: + value_type &value; + + public: + KOKKOS_INLINE_FUNCTION + RangeReducer(value_type &value_) : value(value_) {} + + KOKKOS_INLINE_FUNCTION + void join(value_type &dst, const value_type &src) const { + dst.lb = 
KOKKOSKERNELS_MACRO_MAX(dst.lb, src.lb); + dst.ub = KOKKOSKERNELS_MACRO_MIN(dst.ub, src.ub); + } + + KOKKOS_INLINE_FUNCTION + void init(value_type &val) const { val.init(); } + + KOKKOS_INLINE_FUNCTION + value_type &reference() const { return value; } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return result_view_type(&value, 1); } + + KOKKOS_INLINE_FUNCTION + bool references_scalar() const { return true; } +}; + +/*! \brief team-collaborative K-ary lower-bound search + + \tparam TeamMember the team policy member type + \tparam ViewLike A Kokkos::View or KokkosKernels::Iota + \tparam Pred the binary predicate function type + + Actually, K+1-ary, where K is the size of the team + Split the view into k+1 segments at K points + Evalute the predicate in parallel at each point and use a joint min-max + parallel reduction: + * The lower bound is after the max index where the predicate was true + * The upper bound is no greater than the min index where the predicate was + false Once there are fewer values left than threads in the team, switch to + team sequential search +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_kary_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, + "lower_bound_kary_team requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_kary_team requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + using size_type = typename ViewLike::size_type; + + size_type lo = 0; + size_type hi = view.size(); + while (lo < hi) { + // if fewer than team_size elements left, just hit them all sequentially + if (lo + handle.team_size() >= hi) { + return lower_bound_sequential_team(handle, view, value, lo, hi, pred); + } + + // otherwise, split the region up among threads + size_type mid = + lo + (hi - lo) * (handle.team_rank() + 1) / (handle.team_size() + 
1); + auto ve = view(mid); + + // reduce across threads to figure out where the new search bounds are + // if a thread satisfies the predicate, the first element that does not + // satisfy must be after that thread's search point. we want the max such + // point across all threads if a thread does not satisfy the predicate, the + // first element that does not satisfy must be before or equal. we want the + // min such point across all threads + Range teamRange; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(handle, 0, handle.team_size()), + [&](const int &, Range &lr) { + lr.lb = KOKKOSKERNELS_MACRO_MAX(lo, lr.lb); // no lower than lo + lr.ub = KOKKOSKERNELS_MACRO_MIN(hi, lr.ub); // no higher than hi + // if pred(view(mid), value), then the lower bound is above this + if (pred(ve, value)) { + lr.lb = mid + 1; + } else { // otherwise the lower bound is no larger than this + lr.ub = mid; + } + }, + RangeReducer(teamRange)); + + // next iteration, search in the newly-discovered window + hi = teamRange.ub; + lo = teamRange.lb; + } + return lo; +} + +} // namespace Impl + +/*! \brief Team-collaborative lower-bound search + + \tparam TeamMember the team policy member type the Kokkos team handle + \tparam View the type of view + \tparam Pred the type of the predicate + + \param handle a Kokkos team handle + \param view a Kokkos::View to search + \param value the value to search for + \param pred the predicate to test entries in the view + + \returns The smallest i in range [0, view.size()) for which pred(view(i), + value) is not true, or view.size() if no such `i` exists + + default pred is `element < value`, i.e. return the index to the first + element in the view that does not satisfy `element < value`. 
For well-ordered + types this is the first element where element >= value + + Pred should be a binary function comparing two `typename + View::non_const_value_type` +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type lower_bound_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + static_assert(1 == ViewLike::rank, "lower_bound_team requires rank-1 views"); + static_assert(is_iota_v || Kokkos::is_view::value, + "lower_bound_team requires a " + "KokkosKernels::Impl::Iota or a Kokkos::View"); + + /* kary search is A = (k-1) * (logk(view.size()) + 1) accesses + + sequential search is B = view.size() accesses + + A < B is true ruoughly when view.size() > 3 * k + */ + if (view.size() > 3 * size_t(handle.team_size())) { + return Impl::lower_bound_kary_team(handle, view, value, pred); + } else { + return Impl::lower_bound_sequential_team(handle, view, value, pred); + } +} + +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_LOWERBOUND_HPP \ No newline at end of file diff --git a/common/src/KokkosKernels_Predicates.hpp b/common/src/KokkosKernels_Predicates.hpp new file mode 100644 index 0000000000..a741d1353a --- /dev/null +++ b/common/src/KokkosKernels_Predicates.hpp @@ -0,0 +1,167 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_PREDICATES_HPP +#define _KOKKOSKERNELS_PREDICATES_HPP + +#include "Kokkos_ArithTraits.hpp" + +/*! 
\file KokkosKernels_Predicates.hpp + * Define predicates for KokkosKernels search functions + */ + +namespace KokkosKernels { + +/** + * @brief Struct template for a greater-than predicate + * @tparam T Type to be compared + */ +template +struct GT { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /** + * @brief Return true if a is greater than b + * @param a First value to be compared + * @param b Second value to be compared + */ + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a > b; + } +}; + +/*! \brief "Greater-than-or-equal" predicate, a >= b + \tparam T the type to compare +*/ +template +struct GTE { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /// \brief return a >= b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a >= b; + } +}; + +/*! \brief "Less-than" predicate, a < b + \tparam T the type to compare +*/ +template +struct LT { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /// \brief return a < b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a < b; + } +}; + +/*! \brief "Less-than-or-equal" predicate, a <= b + \tparam T the type to compare +*/ +template +struct LTE { + using value_type = T; + static_assert(!Kokkos::ArithTraits::is_complex, + "Please define custom predicates for ordering complex types"); + + /// \brief return a <= b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const + noexcept { + return a <= b; + } +}; + +/*! 
\brief "Equal" predicate, a == b + \tparam T the type to compare +*/ +template +struct Equal { + using value_type = T; + + /// \brief return a == b + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const { + return a == b; + } +}; + +/** + * @brief Struct template for inverting a predicate + * @tparam Pred Predicate type to be inverted + */ +template +struct Neg { + using value_type = typename Pred::value_type; + + /** + * @brief Constructor + * @param pred Predicate object to be inverted + */ + KOKKOS_INLINE_FUNCTION + constexpr Neg(const Pred &pred) : pred_(pred) {} + + /** + * @brief Return the boolean inverse of the underlying predicate + * @param a First value to be compared by the predicate + * @param b Second value to be compared by the predicate + * @return Boolean inverse of the result of the predicate applied to a and b + */ + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const { + return !pred_(a, b); + } + + private: + Pred pred_; //< Underlying predicate object +}; + +/*! 
\brief Reflect a predicate, pred(b, a) + \tparam Pred the type of the predicate to reflect +*/ +template +struct Refl { + using value_type = typename Pred::value_type; + + KOKKOS_INLINE_FUNCTION + constexpr Refl(const Pred &pred) : pred_(pred) {} + + /// \brief return the underlying binary predicate with reversed arguments + KOKKOS_INLINE_FUNCTION constexpr bool operator()(const value_type &a, + const value_type &b) const { + return pred_(b, a); + } + + private: + Pred pred_; +}; + +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_PREDICATES_HPP \ No newline at end of file diff --git a/common/src/KokkosKernels_UpperBound.hpp b/common/src/KokkosKernels_UpperBound.hpp new file mode 100644 index 0000000000..901c865743 --- /dev/null +++ b/common/src/KokkosKernels_UpperBound.hpp @@ -0,0 +1,101 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _KOKKOSKERNELS_UPPERBOUND_HPP +#define _KOKKOSKERNELS_UPPERBOUND_HPP + +/*! \file KokkosKernels_UpperBound.hpp + Define thread and team-collaborative upper-bound search + + Upper-bound search takes a Kokkos::View, a search value, and a binary + predicate. + It returns an index to the first element of the view such that pred(value, + element) is true + + This is implemented by calling lower_bound functions with inverted and + reflected predicates, i.e. 
upper_bound(view, val, pred) = lower_bound(value, + val, Inv(Refl(pred))); + + Examples: + \verbatim + value = 3 + view = {0,1,2,3,4} + = {f,f,f,f,t} + result = 4 + + value = -1 + view = {0,1,2,3,4} + = {t,t,t,t,t} + result = 0 + + value = 5 + view = {0,1,2,3,4} + = {f,f,f,f,f} + result = 5 + + value = 1 + view = {0,1,1,1,2} + = {f,f,f,f,t} + result = 4 + \endverbatim + + Contrast with lower-bound, which returns first index for which pred(element, + value) is false + */ + +#include "KokkosKernels_LowerBound.hpp" + +namespace KokkosKernels { + +/*! \brief single-thread upper-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(value,element) is true, + or view.size if no such element exists +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_thread( + const ViewLike &view, const typename ViewLike::non_const_value_type &value, + Pred pred = Pred()) { + return lower_bound_thread(view, value, Neg(Refl(pred))); +} + +/*! 
\brief team-collaborative upper-bound search + + \tparam ViewLike A Kokkos::View or KokkosKernels::Impl::Iota + \tparam Pred a binary predicate function + \param view the view to search + \param value the value to search for + \param pred a binary predicate function + \returns index of first element in view where pred(value,element) is true, + or view.size if no such element exists +*/ +template > +KOKKOS_INLINE_FUNCTION typename ViewLike::size_type upper_bound_team( + const TeamMember &handle, const ViewLike &view, + const typename ViewLike::non_const_value_type &value, Pred pred = Pred()) { + return lower_bound_team(handle, view, value, Neg(Refl(pred))); +} + +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_UPPERBOUND_HPP \ No newline at end of file diff --git a/common/unit_test/Test_Common.hpp b/common/unit_test/Test_Common.hpp index 9b26f9bf9e..2ccf9c2103 100644 --- a/common/unit_test/Test_Common.hpp +++ b/common/unit_test/Test_Common.hpp @@ -25,5 +25,7 @@ #include #include #include +#include +#include #endif // TEST_COMMON_HPP diff --git a/common/unit_test/Test_Common_Iota.hpp b/common/unit_test/Test_Common_Iota.hpp index 7207d6f4b1..cae207d56b 100644 --- a/common/unit_test/Test_Common_Iota.hpp +++ b/common/unit_test/Test_Common_Iota.hpp @@ -85,8 +85,17 @@ void test_iota_subview() { EXPECT_EQ(sub(1), 9); } +template +void test_is_iota() { + static_assert(KokkosKernels::Impl::is_iota_v>, + "Iota should be an Iota"); + static_assert(!KokkosKernels::Impl::is_iota_v, + "int should not be an Iota"); +} + template void test_iota() { + test_is_iota(); test_iota_constructor(); test_iota_rank(); test_iota_subview(); diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp new file mode 100644 index 0000000000..f2b54eed32 --- /dev/null +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -0,0 +1,256 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Common_LowerBound.hpp +/// \brief Tests lower bounds search routines + +#include +#include + +template +size_t std_lower_bound(const std::vector &haystack, + const Ordinal needle) { + const auto it = std::lower_bound(haystack.begin(), haystack.end(), needle); + return it - haystack.begin(); +} + +/*! \brief count the number of incorrect values */ +template +struct ThreadLowerBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + ThreadLowerBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i, int &lerrCount) const { + if (0 == i) { + hv_size_type idx = KokkosKernels::lower_bound_thread(haystack_, needle_); + if (idx != expected_) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(i), + int(expected_), int(idx)); + ++lerrCount; + } + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_lower_bound_thread(const std::vector &_haystack, + const T &_needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::RangePolicy; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using size_type = typename u_const_view_t::size_type; + + // get expected value + const size_type expected = 
std_lower_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test lower_bound search + int errCount; + // run a single thread + Kokkos::parallel_reduce(Policy(0, 1), + ThreadLowerBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +/*! \brief count the number of incorrect values */ +template +struct TeamLowerBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + TeamLowerBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, + int &lerrCount) const { + hv_size_type idx = + KokkosKernels::lower_bound_team(handle, haystack_, needle_); + if (idx != expected_) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); + ++lerrCount; + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_lower_bound_team(const std::vector &_haystack, const T _needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::TeamPolicy; + using Member = typename Policy::member_type; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using size_type = typename u_const_view_t::size_type; + + // get expected value + const size_type expected = std_lower_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test lower_bound search + const int 
leagueSize = 1; + const int teamSize = + KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + int errCount; + Kokkos::parallel_reduce( + Policy(leagueSize, teamSize), + TeamLowerBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +template +void test_lower_bound(const std::vector &haystack, const T needle) { + test_lower_bound_thread(haystack, needle); + test_lower_bound_team(haystack, needle); +} + +template +T randn(T n) { + if constexpr (std::is_floating_point_v) { + return T(rand()) / T(RAND_MAX) * n; + } else { + return T(rand()) % n; + } +} + +/* define specific and random lower-bound test cases + */ +template +void test_lower_bound() { + test_lower_bound({}, T(0)); + test_lower_bound({}, T(1)); + test_lower_bound({}, T(-1)); + + test_lower_bound({0}, T(0)); + test_lower_bound({0}, T(1)); + test_lower_bound({0}, T(-1)); + + test_lower_bound({1}, T(0)); + test_lower_bound({1}, T(1)); + test_lower_bound({1}, T(-1)); + + test_lower_bound({T(-1)}, T(0)); + test_lower_bound({T(-1)}, T(1)); + test_lower_bound({T(-1)}, T(-1)); + + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(-1)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(0)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(1)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2.4)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2.5)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(2.6)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(3)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(4)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(5)); + test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(6)); + + auto randn = [](T n) { + if constexpr (std::is_floating_point_v) { + return T(rand()) / T(RAND_MAX) * n; + } else { + return T(rand()) % n; + } + }; + + T maxEntry = 20; + const int numTests = 100; + for (int n = 0; n < numTests; ++n) { + for (size_t sz : {10, 100, 1000}) { + // generate a sorted random vector + std::vector haystack; + for 
(size_t i = 0; i < sz; ++i) { + haystack.push_back(randn(maxEntry)); + } + std::sort(haystack.begin(), haystack.end()); + + // generate a random value to search for + const T needle = randn(maxEntry); + + // do the test + test_lower_bound(haystack, needle); + } + } +} + +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##lower_bound##_##T##_##DEVICE) { \ + test_lower_bound(); \ + } + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int64_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(float, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, TestExecSpace) +#endif + +#undef EXECUTE_TEST \ No newline at end of file diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp new file mode 100644 index 0000000000..b99ffbb0a6 --- /dev/null +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -0,0 +1,245 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file Test_Common_UpperBound.hpp +/// \brief Tests upper bounds search routines + +#include +#include + +template +size_t std_upper_bound(const std::vector &haystack, + const Ordinal needle) { + const auto it = std::upper_bound(haystack.begin(), haystack.end(), needle); + return it - haystack.begin(); +} + +/*! \brief count the number of incorrect values */ +template +struct ThreadUpperBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + ThreadUpperBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_t i, int &lerrCount) const { + if (0 == i) { + hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); + if (idx != expected_) { + printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, + int(i), int(expected_), int(idx)); + ++lerrCount; + } + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_upper_bound_thread(const std::vector &_haystack, + const T &_needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::RangePolicy; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using hv_size_type = typename u_const_view_t::size_type; + + // get expected value + const hv_size_type expected = std_upper_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test upper_bound search + 
int errCount; + // run a single thread + Kokkos::parallel_reduce(Policy(0, 1), + ThreadUpperBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +/*! \brief count the number of incorrect values */ +template +struct TeamUpperBoundFunctor { + using hv_value_type = typename HaystackView::non_const_value_type; + using hv_size_type = typename HaystackView::size_type; + + TeamUpperBoundFunctor(const hv_size_type &expected, + const HaystackView &haystack, + const hv_value_type &needle) + : expected_(expected), haystack_(haystack), needle_(needle) {} + + KOKKOS_INLINE_FUNCTION void operator()(const Member &handle, + int &lerrCount) const { + hv_size_type idx = + KokkosKernels::upper_bound_team(handle, haystack_, needle_); + if (idx != expected_) { + printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, + int(handle.team_rank()), int(expected_), int(idx)); + ++lerrCount; + } + } + + hv_size_type expected_; + HaystackView haystack_; + hv_value_type needle_; +}; + +template +void test_upper_bound_team(const std::vector &_haystack, const T _needle) { + using execution_space = typename Device::execution_space; + using Policy = Kokkos::TeamPolicy; + using Member = typename Policy::member_type; + using view_t = Kokkos::View; + using u_const_view_t = Kokkos::View>; + using hv_size_type = typename u_const_view_t::size_type; + + // get expected value + const hv_size_type expected = std_upper_bound(_haystack, _needle); + + // create device views of input data + u_const_view_t uhaystack(_haystack.data(), _haystack.size()); + view_t haystack("haystack", uhaystack.size()); + Kokkos::deep_copy(haystack, uhaystack); + + // test upper_bound search + const int leagueSize = 1; + const int teamSize = + KokkosKernels::Impl::kk_is_gpu_exec_space() ? 
64 : 1; + int errCount; + Kokkos::parallel_reduce( + Policy(leagueSize, teamSize), + TeamUpperBoundFunctor(expected, haystack, _needle), + errCount); + + EXPECT_EQ(0, errCount); +} + +template +void test_upper_bound(const std::vector &haystack, const T needle) { + test_upper_bound_thread(haystack, needle); + test_upper_bound_team(haystack, needle); +} + +/* define specific and random lower-bound test cases + */ +template +void test_upper_bound() { + test_upper_bound({}, T(0)); + test_upper_bound({}, T(1)); + test_upper_bound({}, T(-1)); + + test_upper_bound({0}, T(0)); + test_upper_bound({0}, T(1)); + test_upper_bound({0}, T(-1)); + + test_upper_bound({1}, T(0)); + test_upper_bound({1}, T(1)); + test_upper_bound({1}, T(-1)); + + test_upper_bound({T(-1)}, T(0)); + test_upper_bound({T(-1)}, T(1)); + test_upper_bound({T(-1)}, T(-1)); + + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(-1)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(0)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(1)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2.4)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2.5)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(2.6)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(3)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(4)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(5)); + test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(6)); + + auto randn = [](T n) { + if constexpr (std::is_floating_point_v) { + return T(rand()) / T(RAND_MAX) * n; + } else { + return T(rand()) % n; + } + }; + + constexpr T maxEntry = 20; + const int numTests = 100; + for (int n = 0; n < numTests; ++n) { + for (size_t sz : {10, 100, 1000}) { + // generate a sorted random vector + std::vector haystack; + for (size_t i = 0; i < sz; ++i) { + haystack.push_back(randn(maxEntry)); + } + std::sort(haystack.begin(), haystack.end()); + + // generate a random value to search for + const T needle = randn(maxEntry); + + // do the test + 
test_upper_bound(haystack, needle); + } + } +} + +#define EXECUTE_TEST(T, DEVICE) \ + TEST_F(TestCategory, common##_##upper_bound##_##T##_##DEVICE) { \ + test_upper_bound(); \ + } + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(int64_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_ORDINAL_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(float, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_TEST(double, TestExecSpace) +#endif + +#undef EXECUTE_TEST \ No newline at end of file From a2c1610a883179ad8d2c5d76c45a9180f002b0c7 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Wed, 29 Mar 2023 15:08:56 -0600 Subject: [PATCH 169/442] accept r-value A matrix --- sparse/src/KokkosSparse_mdf.hpp | 4 ++-- sparse/src/KokkosSparse_mdf_handle.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 90fa3beeef..672da5b4de 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -34,7 +34,7 @@ namespace KokkosSparse { namespace Experimental { template -void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { +void mdf_symbolic(const crs_matrix_type& A, MDF_handle& handle) { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; @@ 
-63,7 +63,7 @@ void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { } // mdf_symbolic template -void mdf_numeric(crs_matrix_type& A, MDF_handle& handle) { +void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; using values_type = typename crs_matrix_type::values_type::non_const_type; diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index 189bccfb18..4e23280235 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -62,7 +62,7 @@ struct MDF_handle { crs_matrix_type L, U; - MDF_handle(const crs_matrix_type A) + MDF_handle(const crs_matrix_type & A) : numRows(A.numRows()), permutation(col_ind_type("row permutation", A.numRows())), permutation_inv(col_ind_type("inverse row permutation", A.numRows())), From a94163cbca6eaf083df750d3a3e603e8b0bc2f66 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 29 Mar 2023 20:51:52 -0600 Subject: [PATCH 170/442] Patch Trilinos #11663 (#1757) This was intended to be a temporary patch, but it will need to stay until 4.1. This means it has to be included in 4.0.1. --- sparse/src/KokkosSparse_spgemm_symbolic.hpp | 27 +++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/sparse/src/KokkosSparse_spgemm_symbolic.hpp b/sparse/src/KokkosSparse_spgemm_symbolic.hpp index ff37199699..486d999e41 100644 --- a/sparse/src/KokkosSparse_spgemm_symbolic.hpp +++ b/sparse/src/KokkosSparse_spgemm_symbolic.hpp @@ -140,15 +140,26 @@ void spgemm_symbolic(KernelHandle *handle, // Verify that graphs A and B are sorted. // This test is designed to be as efficient as possible, but still skip // it in a release build. + // + // Temporary fix for Trilinos issue #11655: Only perform this check if a TPL + // is to be called. The KokkosKernels (non-TPL) implementation does not + // actually require sorted indices yet. 
And Tpetra uses size_type = size_t, so + // it will (currently) not be calling a TPL path. #ifndef NDEBUG - if (!KokkosSparse::Impl::isCrsGraphSorted(const_a_r, const_a_l)) - throw std::runtime_error( - "KokkosSparse::spgemm_symbolic: entries of A are not sorted within " - "rows. May use KokkosSparse::sort_crs_matrix to sort it."); - if (!KokkosSparse::Impl::isCrsGraphSorted(const_b_r, const_b_l)) - throw std::runtime_error( - "KokkosSparse::spgemm_symbolic: entries of B are not sorted within " - "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + if constexpr (KokkosSparse::Impl::spgemm_symbolic_tpl_spec_avail< + const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_, Internal_blno_row_view_t_, + Internal_blno_nnz_view_t_, + Internal_clno_row_view_t_>::value) { + if (!KokkosSparse::Impl::isCrsGraphSorted(const_a_r, const_a_l)) + throw std::runtime_error( + "KokkosSparse::spgemm_symbolic: entries of A are not sorted within " + "rows. May use KokkosSparse::sort_crs_matrix to sort it."); + if (!KokkosSparse::Impl::isCrsGraphSorted(const_b_r, const_b_l)) + throw std::runtime_error( + "KokkosSparse::spgemm_symbolic: entries of B are not sorted within " + "rows. 
May use KokkosSparse::sort_crs_matrix to sort it."); + } #endif auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type(); From 27ec2cdb851a2102e94e8ddee937538cefb12dae Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 29 Mar 2023 21:57:19 -0600 Subject: [PATCH 171/442] Spgemm perf test enhancements (#1664) * SpGEMM perf test enhancements - SYCL, OpenMPTarget support - Document the --checkoutput option in the help text - When checking output, just give the max absolute error in values so the user can decide what's acceptable (instead of erroring out based on a fixed epsilon) * spgemm perftest: fix device id for sycl/omptarget * Fix sometimes-uninitialized warning * SpGEMM perf test: allow Serial to be used Pass device_id = 0 to Kokkos::initialize if no backend is requested. That way, if the build has a GPU backend is enabled but no backend is requested, Kokkos::initialize won't fail and Serial can be used. * One more spgemm perftest fix - Initialize use_openmptarget in TestParameters constructor (this needs to be zeroed out, otherwise it gets used as device id) - Print out which backend is actually being run * spgemm perftest: error out if backend not available * Fix unused variable warning * Finish spgemm perf test refactor - Add check_arg_double(...) 
to perf test utils - Use perf test utils for arg parsing, selecting backend - Get rid of the ability to have distinct fast/slow memory spaces - Merge the three files that made up the perf test into just KokkosSparse_spgemm.cpp - Change some types in KokkosKernels::Experiment::Parameters to be safer - on/off values should be bool, not int - strings should be std::string, not char* --- .../KokkosKernels_perf_test_utilities.hpp | 36 ++ perf_test/graph/KokkosGraph_color.cpp | 6 +- perf_test/graph/KokkosGraph_run_triangle.hpp | 2 +- .../sparse/KokkosSparse_multimem_spgemm.hpp | 216 ------- perf_test/sparse/KokkosSparse_run_spgemm.hpp | 301 --------- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 6 +- perf_test/sparse/KokkosSparse_spgemm.cpp | 588 ++++++++++++------ .../sparse/KokkosSparse_spgemm_jacobi.cpp | 4 +- test_common/KokkosKernels_TestParameters.hpp | 34 +- test_common/KokkosKernels_TestUtils.hpp | 4 + 10 files changed, 461 insertions(+), 736 deletions(-) delete mode 100644 perf_test/sparse/KokkosSparse_multimem_spgemm.hpp delete mode 100644 perf_test/sparse/KokkosSparse_run_spgemm.hpp diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index cc7f70ccec..fdbee134eb 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -67,6 +67,26 @@ void process_arg_int(char const* str_val, int& val) { } } +void process_arg_double(char const* str_val, double& val) { + errno = 0; + char* ptr_end; + val = std::strtod(str_val, &ptr_end); + + if (str_val == ptr_end) { + std::stringstream ss; + ss << "Error: cannot convert command line argument '" << str_val + << "' to a double.\n"; + throw std::invalid_argument(ss.str()); + } + + if (errno == ERANGE) { + std::stringstream ss; + ss << "Error: converted value for command line argument '" << str_val + << "' falls out of range.\n"; + throw std::invalid_argument(ss.str()); + } +} + bool check_arg_int(int const i, 
int const argc, char** argv, char const* name, int& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { @@ -83,6 +103,22 @@ bool check_arg_int(int const i, int const argc, char** argv, char const* name, return true; } +bool check_arg_double(int const i, int const argc, char** argv, + char const* name, double& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + + if (i < argc - 1) { + process_arg_double(argv[i + 1], val); + } else { + std::stringstream msg; + msg << name << " input argument needs to be followed by a real number"; + throw std::invalid_argument(msg.str()); + } + return true; +} + bool check_arg_bool(int const i, int const /*argc*/, char** argv, char const* name, bool& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cc969e52a1..57f241d7b1 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -379,7 +379,7 @@ void run_experiment(crsGraph_t crsGraph, int num_cols, Parameters params) { } } - if (params.coloring_output_file != NULL) { + if (params.coloring_output_file != "") { std::ofstream os(params.coloring_output_file, std::ofstream::out); KokkosKernels::Impl::print_1Dview(os, colors, true, "\n"); } @@ -420,7 +420,7 @@ void run_multi_mem_experiment(Parameters params) { // typedef typename slow_graph_t::entries_type::const_type // const_slow_cols_view_t; - char *a_mat_file = params.a_mtx_bin_file; + const char *a_mat_file = params.a_mtx_bin_file.c_str(); // char *b_mat_file = params.b_mtx_bin_file; // char *c_mat_file = params.c_mtx_bin_file; @@ -581,7 +581,7 @@ int main(int argc, char **argv) { if (parse_inputs(params, argc, argv)) { return 1; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a matrix file" << std::endl; return 0; } diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp 
b/perf_test/graph/KokkosGraph_run_triangle.hpp index 30d1ec77f6..2bdea59bea 100644 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ b/perf_test/graph/KokkosGraph_run_triangle.hpp @@ -64,7 +64,7 @@ bool is_same_graph(crsGraph_t output_mat1, crsGraph_t output_mat2) { if (!is_identical) return false; if (!is_identical) { - std::cout << "Incorret values" << std::endl; + std::cout << "Incorrect values" << std::endl; } return true; } diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp deleted file mode 100644 index 269baf3fdc..0000000000 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ /dev/null @@ -1,216 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosSparse_run_spgemm.hpp" -#include "KokkosSparse_IOUtils.hpp" - -namespace KokkosKernels { - -namespace Experiment { - -template -void run_multi_mem_spgemm(Parameters params) { - typedef exec_space myExecSpace; - typedef Kokkos::Device myFastDevice; - typedef Kokkos::Device mySlowExecSpace; - - typedef typename KokkosSparse::CrsMatrix - fast_crstmat_t; - typedef typename KokkosSparse::CrsMatrix - slow_crstmat_t; - - char *a_mat_file = params.a_mtx_bin_file; - char *b_mat_file = params.b_mtx_bin_file; - char *c_mat_file = params.c_mtx_bin_file; - - slow_crstmat_t a_slow_crsmat, b_slow_crsmat, c_slow_crsmat; - fast_crstmat_t a_fast_crsmat, b_fast_crsmat, c_fast_crsmat; - - // read a and b matrices and store them on slow or fast memory. - - if (params.a_mem_space == 1) { - a_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); - } else { - a_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); - } - - if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) && - params.b_mem_space == params.a_mem_space) { - std::cout << "Using A matrix for B as well" << std::endl; - b_fast_crsmat = a_fast_crsmat; - b_slow_crsmat = a_slow_crsmat; - } else if (params.b_mem_space == 1) { - if (b_mat_file == NULL) b_mat_file = a_mat_file; - b_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); - } else { - if (b_mat_file == NULL) b_mat_file = a_mat_file; - b_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); - } - - if (params.a_mem_space == 1) { - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } else { - c_fast_crsmat = 
KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } - } - } - } else { - // A is in slow memory - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } - } - } - } - - if (c_mat_file != NULL) { - if (params.c_mem_space == 1) { - KokkosSparse::sort_crs_matrix(c_fast_crsmat); - - KokkosSparse::Impl::write_graph_bin( - (lno_t)(c_fast_crsmat.numRows()), - (size_type)(c_fast_crsmat.graph.entries.extent(0)), - c_fast_crsmat.graph.row_map.data(), - c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(), - c_mat_file); - } else { - KokkosSparse::sort_crs_matrix(c_slow_crsmat); - - KokkosSparse::Impl::write_graph_bin( - (lno_t)c_slow_crsmat.numRows(), - (size_type)c_slow_crsmat.graph.entries.extent(0), - c_slow_crsmat.graph.row_map.data(), - c_slow_crsmat.graph.entries.data(), c_slow_crsmat.values.data(), - c_mat_file); - } - } -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/sparse/KokkosSparse_run_spgemm.hpp b/perf_test/sparse/KokkosSparse_run_spgemm.hpp deleted file mode 100644 index 67d61d1f75..0000000000 --- a/perf_test/sparse/KokkosSparse_run_spgemm.hpp +++ /dev/null @@ -1,301 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosSparse_spgemm.hpp" -#include "KokkosKernels_TestParameters.hpp" -#include "KokkosSparse_SortCrs.hpp" - -#define TRANPOSEFIRST false -#define TRANPOSESECOND false - -namespace KokkosKernels { - -namespace Experiment { -template -bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - - size_t nrows1 = output_mat1.graph.row_map.extent(0); - size_t nentries1 = output_mat1.graph.entries.extent(0); - size_t nvals1 = output_mat1.values.extent(0); - - size_t nrows2 = output_mat2.graph.row_map.extent(0); - size_t nentries2 = output_mat2.graph.entries.extent(0); - size_t nvals2 = output_mat2.values.extent(0); - - KokkosSparse::sort_crs_matrix(output_mat1); - - if (nrows1 != nrows2) { - std::cerr << "row count is different" << std::endl; - return false; - } - if (nentries1 != nentries2) { - std::cerr << "nentries2 is different" << std::endl; - return false; - } - if (nvals1 != nvals2) { - std::cerr << "nvals1 is different" << std::endl; - return false; - } - - KokkosSparse::sort_crs_matrix(output_mat2); - - bool is_identical = true; - is_identical = KokkosKernels::Impl::kk_is_identical_view< - typename graph_t::row_map_type, typename graph_t::row_map_type, - typename lno_view_t::value_type, typename device::execution_space>( - output_mat1.graph.row_map, output_mat2.graph.row_map, 0); - if (!is_identical) { - std::cerr << "rowmaps differ" << std::endl; - return false; - } - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, - typename device::execution_space>(output_mat1.graph.entries, - output_mat2.graph.entries, 0); - if 
(!is_identical) { - for (size_t i = 0; i < nrows1; ++i) { - size_t rb = output_mat1.graph.row_map(i); - size_t re = output_mat1.graph.row_map(i + 1); - bool incorrect = false; - for (size_t j = rb; j < re; ++j) { - if (output_mat1.graph.entries(j) != output_mat2.graph.entries(j)) { - incorrect = true; - break; - } - } - if (incorrect) { - for (size_t j = rb; j < re; ++j) { - std::cerr << "row:" << i << " j:" << j - << " h_ent1(j):" << output_mat1.graph.entries(j) - << " h_ent2(j):" << output_mat2.graph.entries(j) - << " rb:" << rb << " re:" << re << std::endl; - } - } - } - std::cerr << "entries differ" << std::endl; - return false; - } - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - scalar_view_t, scalar_view_t, typename scalar_view_t::value_type, - typename device::execution_space>(output_mat1.values, output_mat2.values, - 0.000001); - if (!is_identical) { - std::cerr << "Incorret values" << std::endl; - } - return true; -} - -template -crsMat_t3 run_experiment(crsMat_t crsMat, crsMat_t2 crsMat2, - Parameters params) { - using namespace KokkosSparse; - using namespace KokkosSparse::Experimental; - using device_t = Kokkos::Device; - int algorithm = params.algorithm; - int repeat = params.repeat; - int chunk_size = params.chunk_size; - - int shmemsize = params.shmemsize; - int team_size = params.team_size; - int use_dynamic_scheduling = params.use_dynamic_scheduling; - int verbose = params.verbose; - int calculate_read_write_cost = params.calculate_read_write_cost; - // char spgemm_step = params.spgemm_step; - int vector_size = params.vector_size; - int check_output = params.check_output; - int mkl_keep_output = params.mkl_keep_output; - // spgemm_step++; - typedef typename crsMat_t3::values_type::non_const_type scalar_view_t; - typedef typename crsMat_t3::row_map_type::non_const_type lno_view_t; - typedef typename crsMat_t3::index_type::non_const_type lno_nnz_view_t; - typedef typename lno_nnz_view_t::value_type lno_t; - typedef typename 
lno_view_t::value_type size_type; - typedef typename scalar_view_t::value_type scalar_t; - - lno_view_t row_mapC; - lno_nnz_view_t entriesC; - scalar_view_t valuesC; - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, ExecSpace, TempMemSpace, PersistentMemSpace> - KernelHandle; - - typedef typename lno_nnz_view_t::value_type idx; - typedef typename lno_view_t::value_type size_type; - - KernelHandle kh; - kh.set_team_work_size(chunk_size); - kh.set_shmem_size(shmemsize); - kh.set_suggested_team_size(team_size); - kh.set_suggested_vector_size(vector_size); - - if (use_dynamic_scheduling) { - kh.set_dynamic_scheduling(true); - } - if (verbose) { - kh.set_verbose(true); - } - - const idx m = crsMat.numRows(); - const idx n = crsMat2.numRows(); - const idx k = crsMat2.numCols(); - - if (verbose) std::cout << "m:" << m << " n:" << n << " k:" << k << std::endl; - if (n < crsMat.numCols()) { - std::cerr << "left.numCols():" << crsMat.numCols() - << " right.numRows():" << crsMat2.numRows() << std::endl; - exit(1); - } - - // The reference product (for verifying correctness) - // Don't allocate them if they won't be used, but they must be declared here. 
- lno_view_t row_mapC_ref; - lno_nnz_view_t entriesC_ref; - scalar_view_t valuesC_ref; - // Reference output has same type as actual output - crsMat_t3 Ccrsmat_ref; - - if (check_output) { - if (verbose) std::cout << "Running a reference algorithm" << std::endl; - row_mapC_ref = lno_view_t("non_const_lnow_row", m + 1); - KernelHandle sequential_kh; - sequential_kh.set_team_work_size(chunk_size); - sequential_kh.set_shmem_size(shmemsize); - sequential_kh.set_suggested_team_size(team_size); - sequential_kh.create_spgemm_handle(KokkosSparse::SPGEMM_SERIAL); - - if (use_dynamic_scheduling) { - sequential_kh.set_dynamic_scheduling(true); - } - - spgemm_symbolic(&sequential_kh, m, n, k, crsMat.graph.row_map, - crsMat.graph.entries, TRANPOSEFIRST, crsMat2.graph.row_map, - crsMat2.graph.entries, TRANPOSESECOND, row_mapC_ref); - - ExecSpace().fence(); - - size_type c_nnz_size = sequential_kh.get_spgemm_handle()->get_c_nnz(); - entriesC_ref = lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); - valuesC_ref = scalar_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); - - spgemm_numeric(&sequential_kh, m, n, k, crsMat.graph.row_map, - crsMat.graph.entries, crsMat.values, TRANPOSEFIRST, - - crsMat2.graph.row_map, crsMat2.graph.entries, crsMat2.values, - TRANPOSESECOND, row_mapC_ref, entriesC_ref, valuesC_ref); - ExecSpace().fence(); - - Ccrsmat_ref = crsMat_t3("CorrectC", m, k, valuesC_ref.extent(0), - valuesC_ref, row_mapC_ref, entriesC_ref); - } - - for (int i = 0; i < repeat; ++i) { - kh.create_spgemm_handle(KokkosSparse::SPGEMMAlgorithm(algorithm)); - - kh.get_spgemm_handle()->mkl_keep_output = mkl_keep_output; - kh.get_spgemm_handle()->set_mkl_sort_option(params.mkl_sort_option); - - // if mkl2 input needs to be converted to 1base. - kh.get_spgemm_handle()->mkl_convert_to_1base = true; - - // 250000 default. if cache-mode is used on KNL can increase to 1M. 
- kh.get_spgemm_handle()->MaxColDenseAcc = params.MaxColDenseAcc; - - if (i == 0) { - kh.get_spgemm_handle()->set_read_write_cost_calc( - calculate_read_write_cost); - } - // do the compression whether in 2 step, or 1 step. - kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); - // whether to scale the hash more. default is 1, so no scale. - kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); - // max occupancy in 1-level LP hashes. LL hashes can be 100% - kh.get_spgemm_handle()->set_first_level_hash_cut_off( - params.first_level_hash_cut_off); - // min reduction on FLOPs to run compression - kh.get_spgemm_handle()->set_compression_cut_off(params.compression_cut_off); - - row_mapC = lno_view_t("non_const_lnow_row", m + 1); - entriesC = lno_nnz_view_t("entriesC (empty)", 0); - valuesC = scalar_view_t("valuesC (empty)", 0); - - Kokkos::Timer timer1; - spgemm_symbolic(&kh, m, n, k, crsMat.graph.row_map, crsMat.graph.entries, - TRANPOSEFIRST, crsMat2.graph.row_map, crsMat2.graph.entries, - TRANPOSESECOND, row_mapC); - - ExecSpace().fence(); - double symbolic_time = timer1.seconds(); - - Kokkos::Timer timer3; - size_type c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); - if (verbose) std::cout << "C SIZE:" << c_nnz_size << std::endl; - if (c_nnz_size) { - entriesC = lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); - valuesC = scalar_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), - c_nnz_size); - } - spgemm_numeric(&kh, m, n, k, crsMat.graph.row_map, crsMat.graph.entries, - crsMat.values, TRANPOSEFIRST, - - crsMat2.graph.row_map, crsMat2.graph.entries, crsMat2.values, - TRANPOSESECOND, row_mapC, entriesC, valuesC); - ExecSpace().fence(); - double numeric_time = timer3.seconds(); - - std::cout << "mm_time:" << symbolic_time + numeric_time - << " symbolic_time:" << symbolic_time - << " numeric_time:" << numeric_time << std::endl; - } - if (verbose) { - std::cout << 
"row_mapC:" << row_mapC.extent(0) << std::endl; - std::cout << "entriesC:" << entriesC.extent(0) << std::endl; - std::cout << "valuesC:" << valuesC.extent(0) << std::endl; - KokkosKernels::Impl::print_1Dview(valuesC); - KokkosKernels::Impl::print_1Dview(entriesC); - KokkosKernels::Impl::print_1Dview(row_mapC); - } - crsMat_t3 Ccrsmat_result("CrsMatrixC", m, k, valuesC.extent(0), valuesC, - row_mapC, entriesC); - if (check_output) { - bool is_identical = - is_same_matrix(Ccrsmat_result, Ccrsmat_ref); - if (!is_identical) { - std::cerr << "Result differs. If values are differing, might be floating " - "point order error." - << std::endl; - exit(1); - } - } - return Ccrsmat_result; -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp index a2004e007b..db4141368a 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp @@ -299,9 +299,9 @@ void run_spgemm_jacobi(Parameters params) { void, size_type> slow_crstmat_t; - char *a_mat_file = params.a_mtx_bin_file; - char *b_mat_file = params.b_mtx_bin_file; - char *c_mat_file = params.c_mtx_bin_file; + const char *a_mat_file = params.a_mtx_bin_file.c_str(); + const char *b_mat_file = params.b_mtx_bin_file.c_str(); + const char *c_mat_file = params.c_mtx_bin_file.c_str(); slow_crstmat_t a_slow_crsmat, b_slow_crsmat, c_slow_crsmat; fast_crstmat_t a_fast_crsmat, b_fast_crsmat, c_fast_crsmat; diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index d46e9f6f11..cee68ef11a 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -16,22 +16,119 @@ #include #include "KokkosKernels_config.h" #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_multimem_spgemm.hpp" +#include "KokkosSparse_IOUtils.hpp" 
+#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spgemm.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "KokkosBlas1_nrminf.hpp" +#include "KokkosBlas1_axpby.hpp" +#include "KokkosKernels_TestParameters.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#define TRANSPOSEFIRST false +#define TRANSPOSESECOND false + +template <typename crsMat_t, typename device> +bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) {  // false on dimension/rowmap/entries mismatch; value differences are only reported + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + + size_t nrows1 = output_mat_actual.numRows();  // not row_map.extent(0): that is numRows() + 1 and overruns the row loop below + size_t ncols1 = output_mat_actual.numCols();  // row_map carries no column count + size_t nentries1 = output_mat_actual.graph.entries.extent(0); + size_t nvals1 = output_mat_actual.values.extent(0); + + size_t nrows2 = output_mat_reference.numRows(); + size_t ncols2 = output_mat_reference.numCols(); + size_t nentries2 = output_mat_reference.graph.entries.extent(0); + size_t nvals2 = output_mat_reference.values.extent(0); + + if (nrows1 != nrows2 || ncols1 != ncols2) { + std::cerr << "Wrong dimensions: is " << nrows1 << 'x' << ncols1 + << " but should be " << nrows2 << 'x' << ncols2 << '\n'; + return false; + } + if (nentries1 != nentries2) { + std::cerr << "Wrong number of entries: " << nentries1 + << ", but should have " << nentries2 << '\n'; + return false; + } + if (nvals1 != nvals2) { + std::cerr << "Wrong number of values: " << nvals1 << ", but should have " + << nvals2 << '\n'; + return false; + } + + bool is_identical = true; + is_identical = KokkosKernels::Impl::kk_is_identical_view< + typename graph_t::row_map_type, typename graph_t::row_map_type, + typename lno_view_t::value_type, typename device::execution_space>( + output_mat_actual.graph.row_map, 
output_mat_reference.graph.row_map, 0); + if (!is_identical) { + std::cerr << "Wrong rowmap:\n"; + KokkosKernels::Impl::print_1Dview(std::cerr, + output_mat_actual.graph.row_map); + std::cerr << "but should be:\n"; + KokkosKernels::Impl::print_1Dview(std::cerr, + output_mat_reference.graph.row_map); + return false; + } + + is_identical = KokkosKernels::Impl::kk_is_identical_view< + lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, + typename device::execution_space>(output_mat_actual.graph.entries, + output_mat_reference.graph.entries, 0); + if (!is_identical) { + for (size_t i = 0; i < nrows1; ++i) { + size_t rb = output_mat_actual.graph.row_map(i); + size_t re = output_mat_actual.graph.row_map(i + 1); + bool incorrect = false; + for (size_t j = rb; j < re; ++j) { + if (output_mat_actual.graph.entries(j) != + output_mat_reference.graph.entries(j)) { + incorrect = true; + break; + } + } + if (incorrect) { + for (size_t j = rb; j < re; ++j) { + std::cerr << "row:" << i << " j:" << j + << " h_ent1(j):" << output_mat_actual.graph.entries(j) + << " h_ent2(j):" << output_mat_reference.graph.entries(j) + << " rb:" << rb << " re:" << re << std::endl; + } + } + } + std::cerr << "Wrong entries, see above." 
<< std::endl; + return false; + } + + scalar_view_t valueDiff( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "spgemm values diff"), + output_mat_actual.values.extent(0)); + Kokkos::deep_copy(valueDiff, output_mat_actual.values); + KokkosBlas::axpy(-1.0, output_mat_reference.values, valueDiff); + auto maxDiff = KokkosBlas::nrminf(valueDiff); + + std::cout + << "Absolute maximum difference between actual and reference C values: " + << maxDiff << '\n'; + + return true; +} void print_options() { std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp " - "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip " - "[hipDeviceIndex]' --> if none are specified, Serial is used " - "(if enabled)" - << std::endl; std::cerr << "\t[Optional] '--algorithm " "[DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE]' --> to choose algorithm. " @@ -47,158 +144,113 @@ void print_options() { "250k, which is max k value to choose dense accumulators. This " "can be increased with more memory bandwidth." << std::endl; - std::cerr - << "\tThe memory space used for each matrix: '--memspaces [0|1|....15]' " - "--> Bits representing the use of HBM for Work, C, B, and A " - "respectively. For example 12 = 1100, will store work arrays and C on " - "HBM. A and B will be stored DDR. To use this enable multilevel " - "memory in Kokkos, check generate_makefile.sh" - << std::endl; - std::cerr << "\tLoop scheduling: '--dynamic': Use this for dynamic " - "scheduling of the loops. (Better performance most of the time)" + std::cerr << "\t[Optional] '--dynamic': Use this for dynamic " + "loop scheduling. 
(Better performance most of the time)" + << std::endl; + std::cerr << "\t[Optional] '--verbose': detailed output about SpGEMM and the " + "output matrix" + << std::endl; + std::cerr << "\t[Optional] '--checkoutput': verify result against serial " + "reference implementation" << std::endl; - std::cerr << "\tVerbose Output: '--verbose'" << std::endl; -} - -static char* getNextArg(int& i, int argc, char** argv) { - i++; - if (i >= argc) { - std::cerr << "Error: expected additional command-line argument!\n"; - exit(1); - } - return argv[i]; } int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, char** argv) { + std::string algoStr; + bool printHelp; for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { - params.repeat = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--hashscale")) { - params.minhashscale = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--chunksize")) { - params.chunk_size = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--teamsize")) { - params.team_size = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--vectorsize")) { - params.vector_size = atoi(getNextArg(i, argc, argv)); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--compression2step")) { - params.compression2step = true; - } else if (0 == 
Test::string_compare_no_case(argv[i], "--shmem")) { - params.shmemsize = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--memspaces")) { - int memspaces = atoi(getNextArg(i, argc, argv)); - int memspaceinfo = memspaces; - std::cout << "memspaceinfo:" << memspaceinfo << std::endl; - if (memspaceinfo & 1) { - params.a_mem_space = 1; - std::cout << "Using HBM for A" << std::endl; - } else { - params.a_mem_space = 0; - std::cout << "Using DDR4 for A" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.b_mem_space = 1; - std::cout << "Using HBM for B" << std::endl; - } else { - params.b_mem_space = 0; - std::cout << "Using DDR4 for B" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.c_mem_space = 1; - std::cout << "Using HBM for C" << std::endl; - } else { - params.c_mem_space = 0; - std::cout << "Using DDR4 for C" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.work_mem_space = 1; - std::cout << "Using HBM for work memory space" << std::endl; - } else { - params.work_mem_space = 0; - std::cout << "Using DDR4 for work memory space" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--CRWC")) { - params.calculate_read_write_cost = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--CIF")) { - params.coloring_input_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--COF")) { - params.coloring_output_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--CCO")) { + if (perf_test::check_arg_int(i, argc, argv, "--repeat", params.repeat)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--hashscale", + params.minhashscale)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--chunksize", + params.chunk_size)) { + ++i; + } else if 
(perf_test::check_arg_int(i, argc, argv, "--teamsize", + params.team_size)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--vectorsize", + params.vector_size)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--compression2step", + params.compression2step)) { + } else if (perf_test::check_arg_int(i, argc, argv, "--shmem", + params.shmemsize)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--CRWC", + params.calculate_read_write_cost)) { + } else if (perf_test::check_arg_str(i, argc, argv, "--CIF", + params.coloring_input_file)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--COF", + params.coloring_output_file)) { + ++i; + } else if (perf_test::check_arg_double(i, argc, argv, "--CCO", + params.compression_cut_off)) { // if 0.85 set, if compression does not reduce flops by at least 15% // symbolic will run on original matrix. otherwise, it will compress the // graph and run symbolic on compressed one. - params.compression_cut_off = atof(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--FLHCO")) { + ++i; + } else if (perf_test::check_arg_double(i, argc, argv, "--FLHCO", + params.first_level_hash_cut_off)) { // if linear probing is used as hash, what is the max occupancy percantage // we allow in the hash. - params.first_level_hash_cut_off = atof(getNextArg(i, argc, argv)); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--flop")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--flop", + params.calculate_read_write_cost)) { // print flop statistics. only for the first repeat. - params.calculate_read_write_cost = 1; - } - - else if (0 == Test::string_compare_no_case(argv[i], "--mklsort")) { + // note: if either --CRWC or --flop is passed, this parameter is set to + // true + } else if (perf_test::check_arg_int(i, argc, argv, "--mklsort", + params.mkl_sort_option)) { // when mkl2 is run, the sort option to use. 
// 7:not to sort the output // 8:to sort the output - params.mkl_sort_option = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--mklkeepout")) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--mklkeepout", + params.mkl_keep_output)) { // mkl output is not kept. - params.mkl_keep_output = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--checkoutput")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--checkoutput", + params.check_output)) { // check correctness - params.check_output = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + } else if (perf_test::check_arg_str(i, argc, argv, "--amtx", + params.a_mtx_bin_file)) { // A at C=AxB - params.a_mtx_bin_file = getNextArg(i, argc, argv); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--bmtx")) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--bmtx", + params.b_mtx_bin_file)) { // B at C=AxB. // if not provided, C = AxA will be performed. - params.b_mtx_bin_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--cmtx")) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--cmtx", + params.c_mtx_bin_file)) { // if provided, C will be written to given file. // has to have ".bin", or ".crs" extension. - params.c_mtx_bin_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--dynamic")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--dynamic", + params.use_dynamic_scheduling)) { // dynamic scheduling will be used for loops. // currently it is default already. // so has to use the dynamic schedulin. 
- params.use_dynamic_scheduling = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--DENSEACCMAX")) { + } else if (perf_test::check_arg_int(i, argc, argv, "--DENSEACCMAX", + params.MaxColDenseAcc)) { // on CPUs and KNLs if DEFAULT algorithm or KKSPGEMM is chosen, // it uses dense accumulators for smaller matrices based on the size of // column (k) in B. Max column size is 250,000 for k to use dense // accumulators. this parameter overwrites this. with cache mode, or CPUs // with smaller thread count, where memory bandwidth is not an issue, this // cut-off can be increased to be more than 250,000 - params.MaxColDenseAcc = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { // print the timing and information about the inner steps. // if you are timing TPL libraries, for correct timing use verbose option, // because there are pre- post processing in these TPL kernel wraps. 
- params.verbose = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--algorithm")) { - char* algoStr = getNextArg(i, argc, argv); - + } else if (perf_test::check_arg_str(i, argc, argv, "--algorithm", + algoStr)) { if (0 == Test::string_compare_no_case(algoStr, "DEFAULT")) { params.algorithm = KokkosSparse::SPGEMM_KK; } else if (0 == Test::string_compare_no_case(algoStr, "KKDEFAULT")) { @@ -218,11 +270,14 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "Unrecognized value for --algorithm (argument #" << i + << "): " << argv[i] << std::endl; print_options(); return 1; } + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { } else { std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; @@ -230,96 +285,239 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, return 1; } } + if (printHelp) { + print_options(); + return 1; + } return 0; } -int main(int argc, char** argv) { +template +void run_spgemm(int argc, char** argv, perf_test::CommonInputParams) { + using namespace KokkosSparse; + using namespace KokkosSparse::Experimental; + + using MemSpace = typename ExecSpace::memory_space; using size_type = default_size_type; using lno_t = default_lno_t; using scalar_t = default_scalar; + using device_t = Kokkos::Device; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, ExecSpace, MemSpace, MemSpace>; KokkosKernels::Experiment::Parameters params; if (parse_inputs(params, argc, argv)) { - return 1; + return; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a and b matrix files" << std::endl; 
print_options(); - return 0; + return; + } + + crsMat_t A, B, C; + + // read a and b matrices + + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.a_mtx_bin_file.c_str()); + + if ((params.b_mtx_bin_file == "" || + params.a_mtx_bin_file == params.b_mtx_bin_file)) { + std::cout << "B is not provided or is the same as A. Multiplying AxA." + << std::endl; + B = A; + } else { + B = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.b_mtx_bin_file.c_str()); + } + + int algorithm = params.algorithm; + int repeat = params.repeat; + int chunk_size = params.chunk_size; + + int shmemsize = params.shmemsize; + int team_size = params.team_size; + int use_dynamic_scheduling = params.use_dynamic_scheduling; + int verbose = params.verbose; + int calculate_read_write_cost = params.calculate_read_write_cost; + // char spgemm_step = params.spgemm_step; + int vector_size = params.vector_size; + int check_output = params.check_output; + int mkl_keep_output = params.mkl_keep_output; + // spgemm_step++; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename crsMat_t::row_map_type::non_const_type lno_view_t; + typedef typename crsMat_t::index_type::non_const_type lno_nnz_view_t; + + lno_view_t row_mapC; + lno_nnz_view_t entriesC; + scalar_view_t valuesC; + + KernelHandle kh; + kh.set_team_work_size(chunk_size); + kh.set_shmem_size(shmemsize); + kh.set_suggested_team_size(team_size); + kh.set_suggested_vector_size(vector_size); + + if (use_dynamic_scheduling) { + kh.set_dynamic_scheduling(true); } - if (params.b_mtx_bin_file == NULL) { - std::cout << "B is not provided. Multiplying AxA." << std::endl; + if (verbose) { + kh.set_verbose(true); } - const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = - params.use_cuda ? 
params.use_cuda - 1 : params.use_hip - 1; - - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - Kokkos::print_configuration(std::cout); - -#if defined(KOKKOS_ENABLE_OPENMP) - - if (params.use_openmp) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_HBWSPACE - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::Experimental::HBWSpace, Kokkos::HostSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::OpenMP::memory_space, Kokkos::OpenMP::memory_space>(params); -#endif + const lno_t m = A.numRows(); + const lno_t n = B.numRows(); + const lno_t k = B.numCols(); + + if (verbose) std::cout << "m:" << m << " n:" << n << " k:" << k << std::endl; + if (n < A.numCols()) { + std::cerr << "left.numCols():" << A.numCols() + << " right.numRows():" << B.numRows() << std::endl; + exit(1); } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if (params.use_cuda) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_CUDAHOSTPINNEDSPACE - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::CudaHostPinnedSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::Cuda::memory_space>(params); - -#endif + + // The reference product (for verifying correctness) + // Don't allocate them if they won't be used, but they must be declared here. 
+ lno_view_t row_mapC_ref; + lno_nnz_view_t entriesC_ref; + scalar_view_t valuesC_ref; + // Reference output has same type as actual output + crsMat_t C_ref; + + if (check_output) { + if (verbose) std::cout << "Running a reference algorithm" << std::endl; + row_mapC_ref = lno_view_t("non_const_lnow_row", m + 1); + KernelHandle sequential_kh; + sequential_kh.set_team_work_size(chunk_size); + sequential_kh.set_shmem_size(shmemsize); + sequential_kh.set_suggested_team_size(team_size); + sequential_kh.create_spgemm_handle(KokkosSparse::SPGEMM_SERIAL); + + if (use_dynamic_scheduling) { + sequential_kh.set_dynamic_scheduling(true); + } + + spgemm_symbolic(&sequential_kh, m, n, k, A.graph.row_map, A.graph.entries, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, + TRANSPOSESECOND, row_mapC_ref); + + ExecSpace().fence(); + + size_type c_nnz_size = sequential_kh.get_spgemm_handle()->get_c_nnz(); + entriesC_ref = lno_nnz_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC_ref = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); + + spgemm_numeric(&sequential_kh, m, n, k, A.graph.row_map, A.graph.entries, + A.values, TRANSPOSEFIRST, + + B.graph.row_map, B.graph.entries, B.values, TRANSPOSESECOND, + row_mapC_ref, entriesC_ref, valuesC_ref); + ExecSpace().fence(); + + C_ref = crsMat_t("CorrectC", m, k, valuesC_ref.extent(0), valuesC_ref, + row_mapC_ref, entriesC_ref); } -#endif -#if defined(KOKKOS_ENABLE_HIP) - if (params.use_hip) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + for (int i = 0; i < repeat; ++i) { + kh.create_spgemm_handle(KokkosSparse::SPGEMMAlgorithm(algorithm)); + + kh.get_spgemm_handle()->mkl_keep_output = mkl_keep_output; + kh.get_spgemm_handle()->set_mkl_sort_option(params.mkl_sort_option); + + // if mkl2 input needs to be converted to 
1base. + kh.get_spgemm_handle()->mkl_convert_to_1base = true; + + // 250000 default. if cache-mode is used on KNL can increase to 1M. + kh.get_spgemm_handle()->MaxColDenseAcc = params.MaxColDenseAcc; + + if (i == 0) { + kh.get_spgemm_handle()->set_read_write_cost_calc( + calculate_read_write_cost); + } + // do the compression whether in 2 step, or 1 step. + kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); + // whether to scale the hash more. default is 1, so no scale. + kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); + // max occupancy in 1-level LP hashes. LL hashes can be 100% + kh.get_spgemm_handle()->set_first_level_hash_cut_off( + params.first_level_hash_cut_off); + // min reduction on FLOPs to run compression + kh.get_spgemm_handle()->set_compression_cut_off(params.compression_cut_off); + + row_mapC = lno_view_t("non_const_lnow_row", m + 1); + entriesC = lno_nnz_view_t("entriesC (empty)", 0); + valuesC = scalar_view_t("valuesC (empty)", 0); + + Kokkos::Timer timer1; + spgemm_symbolic(&kh, m, n, k, A.graph.row_map, A.graph.entries, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, + TRANSPOSESECOND, row_mapC); + + ExecSpace().fence(); + double symbolic_time = timer1.seconds(); + + Kokkos::Timer timer3; + size_type c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); + if (verbose) std::cout << "C SIZE:" << c_nnz_size << std::endl; + if (c_nnz_size) { + entriesC = lno_nnz_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), + c_nnz_size); + } + spgemm_numeric(&kh, m, n, k, A.graph.row_map, A.graph.entries, A.values, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, B.values, + TRANSPOSESECOND, row_mapC, entriesC, valuesC); + + ExecSpace().fence(); + double numeric_time = timer3.seconds(); + + std::cout << "mm_time:" << symbolic_time + numeric_time + << " symbolic_time:" << symbolic_time + << " 
numeric_time:" << numeric_time << std::endl; } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - // If only serial is enabled (or no other device was specified), run with - // serial - if (params.use_threads) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Threads, Kokkos::HostSpace, - Kokkos::HostSpace>(params); + if (verbose) { + std::cout << "row_mapC:" << row_mapC.extent(0) << std::endl; + std::cout << "entriesC:" << entriesC.extent(0) << std::endl; + std::cout << "valuesC:" << valuesC.extent(0) << std::endl; + KokkosKernels::Impl::print_1Dview(valuesC); + KokkosKernels::Impl::print_1Dview(entriesC); + KokkosKernels::Impl::print_1Dview(row_mapC); } -#endif - -#if defined(KOKKOS_ENABLE_SERIAL) - // If only serial is enabled (or no other device was specified), run with - // serial - if (!params.use_openmp && !params.use_cuda && !params.use_threads) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Serial, Kokkos::HostSpace, - Kokkos::HostSpace>(params); + crsMat_t C_result("CrsMatrixC", m, k, valuesC.extent(0), valuesC, row_mapC, + entriesC); + if (check_output) { + bool is_identical = is_same_matrix(C_result, C_ref); + if (!is_identical) { + std::cerr << "SpGEMM result differs with reference implementation.\n"; + exit(1); + } else { + std::cerr << "SpGEMM result matches reference implementation.\n"; + } } -#endif - Kokkos::finalize(); + if (params.c_mtx_bin_file != "") { + KokkosSparse::sort_crs_matrix(C_result); - return 0; + KokkosSparse::Impl::write_graph_bin( + (lno_t)(C_result.numRows()), (size_type)(C_result.nnz()), + C_result.graph.row_map.data(), C_result.graph.entries.data(), + C_result.values.data(), params.c_mtx_bin_file.c_str()); + } } + +#define KOKKOSKERNELS_PERF_TEST_NAME run_spgemm +#include "KokkosKernels_perf_test_instantiation.hpp" +int main(int argc, char** argv) { + return main_instantiation(argc, argv); +} // main diff --git 
a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index bcb71e951a..ff30fdf565 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -219,12 +219,12 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a and b matrix files" << std::endl; print_options(); return 0; } - if (params.b_mtx_bin_file == NULL) { + if (params.b_mtx_bin_file == "") { std::cout << "B is not provided. Multiplying AxA." << std::endl; } diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 713c201a8f..e3312c0a41 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -29,26 +29,28 @@ struct Parameters { int multi_color_scale; int shmemsize; int team_size; - int use_dynamic_scheduling; - int verbose; + bool use_dynamic_scheduling; + bool verbose; int spgemm_step; int vector_size; - int check_output; + bool check_output; int mkl_sort_option; int mkl_keep_output; - int calculate_read_write_cost; - char *coloring_input_file; - char *coloring_output_file; + bool calculate_read_write_cost; + std::string coloring_input_file; + std::string coloring_output_file; int minhashscale; int use_threads; int use_openmp; int use_cuda; int use_hip; + int use_sycl; + int use_openmptarget; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; - char *a_mtx_bin_file, *b_mtx_bin_file, *c_mtx_bin_file; + std::string a_mtx_bin_file, b_mtx_bin_file, c_mtx_bin_file; bool compression2step; int left_lower_triangle, right_lower_triangle; int left_sort, right_sort; @@ -62,7 +64,7 @@ struct Parameters { int cache_flush; double first_level_hash_cut_off; double compression_cut_off; - size_t MaxColDenseAcc; + int MaxColDenseAcc; // 0 - no flush // 1 - 
soft flush // 2 - hard flush with rand. @@ -74,24 +76,26 @@ struct Parameters { multi_color_scale = 1; shmemsize = 16128; team_size = -1; - use_dynamic_scheduling = 0; - verbose = 0; + use_dynamic_scheduling = false; + verbose = false; spgemm_step = '0'; vector_size = -1; - check_output = 0; + check_output = false; mkl_sort_option = 7; mkl_keep_output = 1; - calculate_read_write_cost = 0; - coloring_input_file = NULL; - coloring_output_file = NULL; + calculate_read_write_cost = false; + coloring_input_file = ""; + coloring_output_file = ""; minhashscale = 1; use_threads = 0; use_openmp = 0; use_cuda = 0; use_hip = 0; + use_sycl = 0; + use_openmptarget = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; - a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; + a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = ""; compression2step = true; left_lower_triangle = 0; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index fe68d68d07..130187ef35 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -520,6 +520,10 @@ int string_compare_no_case(const char* str1, const char* str2) { return strcmp(str1_s.c_str(), str2_s.c_str()); } +int string_compare_no_case(const std::string& str1, const std::string& str2) { + return string_compare_no_case(str1.c_str(), str2.c_str()); +} + /// /brief Cs (Compressed Sparse) matrix class for testing purposes. /// This class is for testing purposes only and will generate a random /// Crs / Ccs matrix when instantiated. 
The class is intentionally written From 8efb0356c7ef48b1ccdbcf2174a18c103b6a0aa7 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Mon, 9 Jan 2023 19:44:57 +0100 Subject: [PATCH 172/442] #7: (v1): old way for rocsparse and rocblas --- cmake/Modules/FindTPLROCBLAS.cmake | 59 +++++++++++++++++++++++++++- cmake/Modules/FindTPLROCSPARSE.cmake | 46 +++++++++++++++++++++- 2 files changed, 103 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index 0217e8cf2c..91e89e2a00 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,3 +1,60 @@ +MESSAGE( "Enter in FindTPLROCBLAS.cmake") + +## MPL: v3 +## MPL: 12/29/2022: CMake regular way to find a package +#FIND_PACKAGE(ROCBLAS) +#if(TARGET roc::rocsparse) +### MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention +### of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) + #SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) + #SET(TPL_IMPORTED_NAME roc::rocblas) +### MPL: 12/29/2022: A target comming from a TPL must follows the requested convention +### of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) + #ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) +#ELSE() +# MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") +#ENDIF() + +## MPL: v2 +## MPL: 12/26/2022: This bloc is not necessary anymore since ROCBLAS installation provide a cmake config file. +## Should we verify for different version of ROCBLAS ? 
+## GOAL: The code is commented for now and we aim to remove it +#IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +#ELSEIF (ROCBLAS_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) +#ELSEIF (ROCBLAS_LIBRARY_DIRS) +#ELSE() +# # MPL: 12/26/2022: USE FIND_PACKAGE and check if the requested target is the more modern way to do it +# # MPL: 12/28/2022 : This logical bloc is based on the logical bloc coming from FindTPLCUBLAS. But instead of +# # expecting a ROCBLAS_FOUND variable to be set. We expect the TARGET roc::rocblas to be defined (more modern) +# FIND_PACKAGE(ROCBLAS) +# if(NOT TARGET roc::rocblas) +# MESSAGE( "TARGET roc::rocblas NOT FOUND") +# #Important note here: this find Module is named TPLROCBLAS +# #The eventual target is named roc::rocblas. To avoid naming conflicts +# #the find module is called TPLROCBLAS. This call will cause +# #the find_package call to fail in a "standard" CMake way +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) +# ELSE() +# # MPL: 12/26/2022: USING FIND_PACKAGE_HANDLE_STANDARD_ARGS can be ok in modern CMAKE but with a Find module +# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS +# MESSAGE( "TARGET roc::rocblas FOUND") +# #The libraries might be empty - OR they might explicitly be not found +# IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") +# MESSAGE( "ROCBLAS_LIBRARIES is not found") + +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) +# ELSE() +# MESSAGE( "ROCBLAS_LIBRARIES is not found") +# # 12/28/2022: ROCBLAS_LIBRARIES is found using find_packge which defines it as a target and not a lib +# message("TPLROCBLAS LIBRARIES VARIABLE IS ${ROCBLAS_LIBRARIES}") +# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) 
+# ENDIF() +# endif() +#ENDIF() + +## MPL: v1 IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) ELSEIF (ROCBLAS_LIBRARIES) @@ -34,4 +91,4 @@ ELSE() # FIND_PACKAGE(ROCBLAS REQUIRED) # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -ENDIF() \ No newline at end of file +ENDIF() diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index 52a0261b48..866f825c66 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,3 +1,47 @@ +# MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake + +# MPL: v3 +#FIND_PACKAGE(ROCSPARSE) +#if(TARGET roc::rocsparse) +# SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) +# SET(TPL_IMPORTED_NAME roc::rocsparse) +# ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) +#ELSE() +# MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") +#ENDIF() + +## MPL: v2 +#IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +#ELSEIF (ROCSPARSE_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) +#ELSEIF (ROCSPARSE_LIBRARY_DIRS) +#ELSE() +# FIND_PACKAGE(ROCSPARSE) +# if(NOT TARGET roc::rocsparse) +# MESSAGE( "TARGET roc::ROCSPARSE NOT FOUND") +# #Important note here: this find Module is named TPLROCSPARSE +# #The eventual target is named roc::ROCSPARSE. To avoid naming conflicts +# #the find module is called TPLROCSPARSE. 
This call will cause +# #the find_package call to fail in a "standard" CMake way +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) +# ELSE() +# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS +# MESSAGE( "TARGET roc::ROCSPARSE FOUND") +# #The libraries might be empty - OR they might explicitly be not found +# IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") +# MESSAGE( "ROCSPARSE_LIBRARIES is not found") + +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) +# ELSE() +# MESSAGE( "ROCSPARSE_LIBRARIES is not found") +# message("TPLROCSPARSE LIBRARIES VARIABLE IS ${ROCSPARSE_LIBRARIES}") +# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) +# ENDIF() +# endif() +#ENDIF() + +# MPL: v1 IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) ELSEIF (ROCSPARSE_LIBRARIES) @@ -34,4 +78,4 @@ ELSE() # FIND_PACKAGE(ROCSPARSE REQUIRED) # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -ENDIF() \ No newline at end of file +ENDIF() From 5c8d760a330437e6a21e2a06901f048a29ac54eb Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 10 Jan 2023 10:59:02 +0100 Subject: [PATCH 173/442] #7: V2 Added hybrid version for rocblas and rocsparse --- cmake/Modules/FindTPLROCBLAS.cmake | 142 +++++++++++++-------------- cmake/Modules/FindTPLROCSPARSE.cmake | 124 +++++++++++------------ 2 files changed, 132 insertions(+), 134 deletions(-) diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index 91e89e2a00..2425b39e96 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,5 +1,3 @@ 
-MESSAGE( "Enter in FindTPLROCBLAS.cmake") - ## MPL: v3 ## MPL: 12/29/2022: CMake regular way to find a package #FIND_PACKAGE(ROCBLAS) @@ -15,80 +13,80 @@ MESSAGE( "Enter in FindTPLROCBLAS.cmake") # MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") #ENDIF() -## MPL: v2 -## MPL: 12/26/2022: This bloc is not necessary anymore since ROCBLAS installation provide a cmake config file. -## Should we verify for different version of ROCBLAS ? -## GOAL: The code is commented for now and we aim to remove it -#IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -#ELSEIF (ROCBLAS_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) -#ELSEIF (ROCBLAS_LIBRARY_DIRS) -#ELSE() -# # MPL: 12/26/2022: USE FIND_PACKAGE and check if the requested target is the more modern way to do it -# # MPL: 12/28/2022 : This logical bloc is based on the logical bloc coming from FindTPLCUBLAS. But instead of -# # expecting a ROCBLAS_FOUND variable to be set. We expect the TARGET roc::rocblas to be defined (more modern) -# FIND_PACKAGE(ROCBLAS) -# if(NOT TARGET roc::rocblas) -# MESSAGE( "TARGET roc::rocblas NOT FOUND") -# #Important note here: this find Module is named TPLROCBLAS -# #The eventual target is named roc::rocblas. To avoid naming conflicts -# #the find module is called TPLROCBLAS. 
This call will cause -# #the find_package call to fail in a "standard" CMake way -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) -# ELSE() -# # MPL: 12/26/2022: USING FIND_PACKAGE_HANDLE_STANDARD_ARGS can be ok in modern CMAKE but with a Find module -# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS -# MESSAGE( "TARGET roc::rocblas FOUND") -# #The libraries might be empty - OR they might explicitly be not found -# IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") -# MESSAGE( "ROCBLAS_LIBRARIES is not found") - -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) -# ELSE() -# MESSAGE( "ROCBLAS_LIBRARIES is not found") -# # 12/28/2022: ROCBLAS_LIBRARIES is found using find_packge which defines it as a target and not a lib -# message("TPLROCBLAS LIBRARIES VARIABLE IS ${ROCBLAS_LIBRARIES}") -# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) -# ENDIF() -# endif() -#ENDIF() - -## MPL: v1 +# MPL: v2 +# MPL: 12/26/2022: This bloc is not necessary anymore since ROCBLAS installation provide a cmake config file. +# Should we verify for different version of ROCBLAS ? 
+# GOAL: The code is commented for now and we aim to remove it IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) ELSEIF (ROCBLAS_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) ELSEIF (ROCBLAS_LIBRARY_DIRS) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES rocblas LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -ELSEIF (KokkosKernels_ROCBLAS_ROOT) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE - LIBRARIES - rocblas - LIBRARY_PATHS - ${KokkosKernels_ROCBLAS_ROOT}/lib - HEADERS - rocblas.h - HEADER_PATHS - ${KokkosKernels_ROCBLAS_ROOT}/include - ) -ELSEIF (DEFINED ENV{ROCM_PATH}) - MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") - SET(ROCBLAS_ROOT "$ENV{ROCM_PATH}/rocblas") - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE - LIBRARIES - rocblas - LIBRARY_PATHS - ${ROCBLAS_ROOT}/lib - HEADERS - rocblas.h - HEADER_PATHS - ${ROCBLAS_ROOT}/include - ) ELSE() - MESSAGE(ERROR "rocBLAS was not detected properly, please disable it or provide sufficient information at configure time.") - # Todo: figure out how to use the target defined during rocblas installation - # FIND_PACKAGE(ROCBLAS REQUIRED) - # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) - # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) + # MPL: 12/26/2022: USE FIND_PACKAGE and check if the requested target is the more modern way to do it + # MPL: 12/28/2022 : This logical bloc is based on the logical bloc coming from FindTPLCUBLAS. But instead of + # expecting a ROCBLAS_FOUND variable to be set. We expect the TARGET roc::rocblas to be defined (more modern) + FIND_PACKAGE(ROCBLAS) + if(NOT TARGET roc::rocblas) + MESSAGE( "TARGET roc::rocblas NOT FOUND") + #Important note here: this find Module is named TPLROCBLAS + #The eventual target is named roc::rocblas. 
To avoid naming conflicts + #the find module is called TPLROCBLAS. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) + ELSE() + # MPL: 12/26/2022: USING FIND_PACKAGE_HANDLE_STANDARD_ARGS can be ok in modern CMAKE but with a Find module + # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS + MESSAGE( "TARGET roc::rocblas FOUND") + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") + MESSAGE( "ROCBLAS_LIBRARIES is not found") + + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) + ELSE() + MESSAGE( "ROCBLAS_LIBRARIES is not found") + # 12/28/2022: ROCBLAS_LIBRARIES is found using find_packge which defines it as a target and not a lib + message("TPLROCBLAS LIBRARIES VARIABLE IS ${ROCBLAS_LIBRARIES}") + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) + ENDIF() + endif() ENDIF() + +## MPL: v1 +#IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +#ELSEIF (ROCBLAS_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) +#ELSEIF (ROCBLAS_LIBRARY_DIRS) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES rocblas LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +#ELSEIF (KokkosKernels_ROCBLAS_ROOT) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE +# LIBRARIES +# rocblas +# LIBRARY_PATHS +# ${KokkosKernels_ROCBLAS_ROOT}/lib +# HEADERS +# rocblas.h +# HEADER_PATHS +# ${KokkosKernels_ROCBLAS_ROOT}/include +# ) +#ELSEIF (DEFINED ENV{ROCM_PATH}) +# MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") +# SET(ROCBLAS_ROOT "$ENV{ROCM_PATH}/rocblas") +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE +# LIBRARIES +# rocblas +# LIBRARY_PATHS +# 
${ROCBLAS_ROOT}/lib +# HEADERS +# rocblas.h +# HEADER_PATHS +# ${ROCBLAS_ROOT}/include +# ) +#ELSE() +# MESSAGE(ERROR "rocBLAS was not detected properly, please disable it or provide sufficient information at configure time.") +# # Todo: figure out how to use the target defined during rocblas installation +# # FIND_PACKAGE(ROCBLAS REQUIRED) +# # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) +# # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) +#ENDIF() diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index 866f825c66..de8a3ca5df 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -10,72 +10,72 @@ # MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") #ENDIF() -## MPL: v2 -#IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -#ELSEIF (ROCSPARSE_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) -#ELSEIF (ROCSPARSE_LIBRARY_DIRS) -#ELSE() -# FIND_PACKAGE(ROCSPARSE) -# if(NOT TARGET roc::rocsparse) -# MESSAGE( "TARGET roc::ROCSPARSE NOT FOUND") -# #Important note here: this find Module is named TPLROCSPARSE -# #The eventual target is named roc::ROCSPARSE. To avoid naming conflicts -# #the find module is called TPLROCSPARSE. 
This call will cause -# #the find_package call to fail in a "standard" CMake way -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) -# ELSE() -# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS -# MESSAGE( "TARGET roc::ROCSPARSE FOUND") -# #The libraries might be empty - OR they might explicitly be not found -# IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") -# MESSAGE( "ROCSPARSE_LIBRARIES is not found") - -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) -# ELSE() -# MESSAGE( "ROCSPARSE_LIBRARIES is not found") -# message("TPLROCSPARSE LIBRARIES VARIABLE IS ${ROCSPARSE_LIBRARIES}") -# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) -# ENDIF() -# endif() -#ENDIF() - -# MPL: v1 +# MPL: v2 IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) ELSEIF (ROCSPARSE_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) ELSEIF (ROCSPARSE_LIBRARY_DIRS) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES rocsparse LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -ELSEIF (KokkosKernels_ROCSPARSE_ROOT) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE - LIBRARIES - rocsparse - LIBRARY_PATHS - ${KokkosKernels_ROCSPARSE_ROOT}/lib - HEADERS - rocsparse.h - HEADER_PATHS - ${KokkosKernels_ROCSPARSE_ROOT}/include - ) -ELSEIF (DEFINED ENV{ROCM_PATH}) - MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") - SET(ROCSPARSE_ROOT "$ENV{ROCM_PATH}/rocsparse") - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE - LIBRARIES - rocsparse - LIBRARY_PATHS - ${ROCSPARSE_ROOT}/lib - HEADERS - rocsparse.h - HEADER_PATHS - ${ROCSPARSE_ROOT}/include - ) ELSE() - MESSAGE(ERROR "rocSPARSE was not detected properly, please disable it or provide sufficient information at 
configure time.") - # Todo: figure out how to use the target defined during rocsparse installation - # FIND_PACKAGE(ROCSPARSE REQUIRED) - # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) - # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) + FIND_PACKAGE(ROCSPARSE) + if(NOT TARGET roc::rocsparse) + MESSAGE( "TARGET roc::ROCSPARSE NOT FOUND") + #Important note here: this find Module is named TPLROCSPARSE + #The eventual target is named roc::ROCSPARSE. To avoid naming conflicts + #the find module is called TPLROCSPARSE. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) + ELSE() + # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS + MESSAGE( "TARGET roc::ROCSPARSE FOUND") + #The libraries might be empty - OR they might explicitly be not found + IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") + MESSAGE( "ROCSPARSE_LIBRARIES is not found") + + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) + ELSE() + MESSAGE( "ROCSPARSE_LIBRARIES is not found") + message("TPLROCSPARSE LIBRARIES VARIABLE IS ${ROCSPARSE_LIBRARIES}") + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) + ENDIF() + endif() ENDIF() + +## MPL: v1 +#IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +#ELSEIF (ROCSPARSE_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) +#ELSEIF (ROCSPARSE_LIBRARY_DIRS) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES rocsparse LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +#ELSEIF (KokkosKernels_ROCSPARSE_ROOT) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE 
INTERFACE +# LIBRARIES +# rocsparse +# LIBRARY_PATHS +# ${KokkosKernels_ROCSPARSE_ROOT}/lib +# HEADERS +# rocsparse.h +# HEADER_PATHS +# ${KokkosKernels_ROCSPARSE_ROOT}/include +# ) +#ELSEIF (DEFINED ENV{ROCM_PATH}) +# MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") +# SET(ROCSPARSE_ROOT "$ENV{ROCM_PATH}/rocsparse") +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE +# LIBRARIES +# rocsparse +# LIBRARY_PATHS +# ${ROCSPARSE_ROOT}/lib +# HEADERS +# rocsparse.h +# HEADER_PATHS +# ${ROCSPARSE_ROOT}/include +# ) +#ELSE() +# MESSAGE(ERROR "rocSPARSE was not detected properly, please disable it or provide sufficient information at configure time.") +# # Todo: figure out how to use the target defined during rocsparse installation +# # FIND_PACKAGE(ROCSPARSE REQUIRED) +# # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) +# # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) +#ENDIF() From f49d41eadc08c66ebfed900c74a2957c9f844668 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 10 Jan 2023 11:12:29 +0100 Subject: [PATCH 174/442] #7: V3: simplest way to get rocsparse and rocblas --- cmake/Modules/FindTPLROCBLAS.cmake | 102 +++++++++++++-------------- cmake/Modules/FindTPLROCSPARSE.cmake | 74 +++++++++---------- 2 files changed, 88 insertions(+), 88 deletions(-) diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index 2425b39e96..9c95ae5e77 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,57 +1,57 @@ -## MPL: v3 -## MPL: 12/29/2022: CMake regular way to find a package -#FIND_PACKAGE(ROCBLAS) -#if(TARGET roc::rocsparse) -### MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention -### of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) - #SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) - #SET(TPL_IMPORTED_NAME roc::rocblas) -### MPL: 
12/29/2022: A target comming from a TPL must follows the requested convention -### of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) - #ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) -#ELSE() -# MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") -#ENDIF() - -# MPL: v2 -# MPL: 12/26/2022: This bloc is not necessary anymore since ROCBLAS installation provide a cmake config file. -# Should we verify for different version of ROCBLAS ? -# GOAL: The code is commented for now and we aim to remove it -IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -ELSEIF (ROCBLAS_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) -ELSEIF (ROCBLAS_LIBRARY_DIRS) +# MPL: v3 +# MPL: 12/29/2022: CMake regular way to find a package +FIND_PACKAGE(ROCBLAS) +if(TARGET roc::rocblas) +## MPL: 12/29/2022: Variable TPL_ROCBLAS_IMPORTED_NAME follows the requested convention +## of KokkosKernel (method kokkoskernels_import_tpl of kokkoskernels_tpls.cmake) + SET(TPL_ROCBLAS_IMPORTED_NAME roc::rocblas) + SET(TPL_IMPORTED_NAME roc::rocblas) +## MPL: 12/29/2022: A target comming from a TPL must follows the requested convention +## of KokkosKernel (method kokkoskernels_link_tpl of kokkoskernels_tpls.cmake) + ADD_LIBRARY(KokkosKernels::ROCBLAS ALIAS roc::rocblas) ELSE() - # MPL: 12/26/2022: USE FIND_PACKAGE and check if the requested target is the more modern way to do it - # MPL: 12/28/2022 : This logical bloc is based on the logical bloc coming from FindTPLCUBLAS. But instead of - # expecting a ROCBLAS_FOUND variable to be set. We expect the TARGET roc::rocblas to be defined (more modern) - FIND_PACKAGE(ROCBLAS) - if(NOT TARGET roc::rocblas) - MESSAGE( "TARGET roc::rocblas NOT FOUND") - #Important note here: this find Module is named TPLROCBLAS - #The eventual target is named roc::rocblas. 
To avoid naming conflicts - #the find module is called TPLROCBLAS. This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) - ELSE() - # MPL: 12/26/2022: USING FIND_PACKAGE_HANDLE_STANDARD_ARGS can be ok in modern CMAKE but with a Find module - # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS - MESSAGE( "TARGET roc::rocblas FOUND") - #The libraries might be empty - OR they might explicitly be not found - IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") - MESSAGE( "ROCBLAS_LIBRARIES is not found") - - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) - ELSE() - MESSAGE( "ROCBLAS_LIBRARIES is not found") - # 12/28/2022: ROCBLAS_LIBRARIES is found using find_packge which defines it as a target and not a lib - message("TPLROCBLAS LIBRARIES VARIABLE IS ${ROCBLAS_LIBRARIES}") - KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) - ENDIF() - endif() + MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") ENDIF() +## MPL: v2 +## MPL: 12/26/2022: This bloc is not necessary anymore since ROCBLAS installation provide a cmake config file. +## Should we verify for different version of ROCBLAS ? +## GOAL: The code is commented for now and we aim to remove it +#IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +#ELSEIF (ROCBLAS_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) +#ELSEIF (ROCBLAS_LIBRARY_DIRS) +#ELSE() +# # MPL: 12/26/2022: USE FIND_PACKAGE and check if the requested target is the more modern way to do it +# # MPL: 12/28/2022 : This logical bloc is based on the logical bloc coming from FindTPLCUBLAS. But instead of +# # expecting a ROCBLAS_FOUND variable to be set. 
We expect the TARGET roc::rocblas to be defined (more modern) +# FIND_PACKAGE(ROCBLAS) +# if(NOT TARGET roc::rocblas) +# MESSAGE( "TARGET roc::rocblas NOT FOUND") +# #Important note here: this find Module is named TPLROCBLAS +# #The eventual target is named roc::rocblas. To avoid naming conflicts +# #the find module is called TPLROCBLAS. This call will cause +# #the find_package call to fail in a "standard" CMake way +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) +# ELSE() +# # MPL: 12/26/2022: USING FIND_PACKAGE_HANDLE_STANDARD_ARGS can be ok in modern CMAKE but with a Find module +# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS +# MESSAGE( "TARGET roc::rocblas FOUND") +# #The libraries might be empty - OR they might explicitly be not found +# IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") +# MESSAGE( "ROCBLAS_LIBRARIES is not found") + +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) +# ELSE() +# MESSAGE( "ROCBLAS_LIBRARIES is not found") +# # 12/28/2022: ROCBLAS_LIBRARIES is found using find_packge which defines it as a target and not a lib +# message("TPLROCBLAS LIBRARIES VARIABLE IS ${ROCBLAS_LIBRARIES}") +# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) +# ENDIF() +# endif() +#ENDIF() + ## MPL: v1 #IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) # KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index de8a3ca5df..b64b80677c 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,46 +1,46 @@ # MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake # MPL: v3 -#FIND_PACKAGE(ROCSPARSE) -#if(TARGET roc::rocsparse) -# SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) -# 
SET(TPL_IMPORTED_NAME roc::rocsparse) -# ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) -#ELSE() -# MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") -#ENDIF() - -# MPL: v2 -IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -ELSEIF (ROCSPARSE_LIBRARIES) - KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) -ELSEIF (ROCSPARSE_LIBRARY_DIRS) +FIND_PACKAGE(ROCSPARSE) +if(TARGET roc::rocsparse) + SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) + SET(TPL_IMPORTED_NAME roc::rocsparse) + ADD_LIBRARY(KokkosKernels::ROCSPARSE ALIAS roc::rocsparse) ELSE() - FIND_PACKAGE(ROCSPARSE) - if(NOT TARGET roc::rocsparse) - MESSAGE( "TARGET roc::ROCSPARSE NOT FOUND") - #Important note here: this find Module is named TPLROCSPARSE - #The eventual target is named roc::ROCSPARSE. To avoid naming conflicts - #the find module is called TPLROCSPARSE. 
This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) - ELSE() - # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS - MESSAGE( "TARGET roc::ROCSPARSE FOUND") - #The libraries might be empty - OR they might explicitly be not found - IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") - MESSAGE( "ROCSPARSE_LIBRARIES is not found") - - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) - ELSE() - MESSAGE( "ROCSPARSE_LIBRARIES is not found") - message("TPLROCSPARSE LIBRARIES VARIABLE IS ${ROCSPARSE_LIBRARIES}") - KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) - ENDIF() - endif() + MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") ENDIF() +## MPL: v2 +#IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +#ELSEIF (ROCSPARSE_LIBRARIES) +# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) +#ELSEIF (ROCSPARSE_LIBRARY_DIRS) +#ELSE() +# FIND_PACKAGE(ROCSPARSE) +# if(NOT TARGET roc::rocsparse) +# MESSAGE( "TARGET roc::ROCSPARSE NOT FOUND") +# #Important note here: this find Module is named TPLROCSPARSE +# #The eventual target is named roc::ROCSPARSE. To avoid naming conflicts +# #the find module is called TPLROCSPARSE. 
This call will cause +# #the find_package call to fail in a "standard" CMake way +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) +# ELSE() +# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS +# MESSAGE( "TARGET roc::ROCSPARSE FOUND") +# #The libraries might be empty - OR they might explicitly be not found +# IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") +# MESSAGE( "ROCSPARSE_LIBRARIES is not found") + +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) +# ELSE() +# MESSAGE( "ROCSPARSE_LIBRARIES is not found") +# message("TPLROCSPARSE LIBRARIES VARIABLE IS ${ROCSPARSE_LIBRARIES}") +# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) +# ENDIF() +# endif() +#ENDIF() + ## MPL: v1 #IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) # KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) From 25dbdcb9b75a6dc909e10ff251486aee100c0319 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Tue, 24 Jan 2023 10:58:11 +0100 Subject: [PATCH 175/442] #7 Removed V2 and V1. --- cmake/Modules/FindTPLROCBLAS.cmake | 79 ---------------------------- cmake/Modules/FindTPLROCSPARSE.cmake | 72 ------------------------- 2 files changed, 151 deletions(-) diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake index 9c95ae5e77..c0a9de3b50 100644 --- a/cmake/Modules/FindTPLROCBLAS.cmake +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -1,4 +1,3 @@ -# MPL: v3 # MPL: 12/29/2022: CMake regular way to find a package FIND_PACKAGE(ROCBLAS) if(TARGET roc::rocblas) @@ -12,81 +11,3 @@ if(TARGET roc::rocblas) ELSE() MESSAGE(FATAL_ERROR "Package ROCBLAS requested but not found") ENDIF() - -## MPL: v2 -## MPL: 12/26/2022: This bloc is not necessary anymore since ROCBLAS installation provide a cmake config file. 
-## Should we verify for different version of ROCBLAS ? -## GOAL: The code is commented for now and we aim to remove it -#IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -#ELSEIF (ROCBLAS_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) -#ELSEIF (ROCBLAS_LIBRARY_DIRS) -#ELSE() -# # MPL: 12/26/2022: USE FIND_PACKAGE and check if the requested target is the more modern way to do it -# # MPL: 12/28/2022 : This logical bloc is based on the logical bloc coming from FindTPLCUBLAS. But instead of -# # expecting a ROCBLAS_FOUND variable to be set. We expect the TARGET roc::rocblas to be defined (more modern) -# FIND_PACKAGE(ROCBLAS) -# if(NOT TARGET roc::rocblas) -# MESSAGE( "TARGET roc::rocblas NOT FOUND") -# #Important note here: this find Module is named TPLROCBLAS -# #The eventual target is named roc::rocblas. To avoid naming conflicts -# #the find module is called TPLROCBLAS. 
This call will cause -# #the find_package call to fail in a "standard" CMake way -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_FOUND) -# ELSE() -# # MPL: 12/26/2022: USING FIND_PACKAGE_HANDLE_STANDARD_ARGS can be ok in modern CMAKE but with a Find module -# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS -# MESSAGE( "TARGET roc::rocblas FOUND") -# #The libraries might be empty - OR they might explicitly be not found -# IF("${ROCBLAS_LIBRARIES}" MATCHES "NOTFOUND") -# MESSAGE( "ROCBLAS_LIBRARIES is not found") - -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCBLAS REQUIRED_VARS ROCBLAS_LIBRARIES) -# ELSE() -# MESSAGE( "ROCBLAS_LIBRARIES is not found") -# # 12/28/2022: ROCBLAS_LIBRARIES is found using find_packge which defines it as a target and not a lib -# message("TPLROCBLAS LIBRARIES VARIABLE IS ${ROCBLAS_LIBRARIES}") -# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) -# ENDIF() -# endif() -#ENDIF() - -## MPL: v1 -#IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -#ELSEIF (ROCBLAS_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) -#ELSEIF (ROCBLAS_LIBRARY_DIRS) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES rocblas LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) -#ELSEIF (KokkosKernels_ROCBLAS_ROOT) -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE -# LIBRARIES -# rocblas -# LIBRARY_PATHS -# ${KokkosKernels_ROCBLAS_ROOT}/lib -# HEADERS -# rocblas.h -# HEADER_PATHS -# ${KokkosKernels_ROCBLAS_ROOT}/include -# ) -#ELSEIF (DEFINED ENV{ROCM_PATH}) -# MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") -# SET(ROCBLAS_ROOT "$ENV{ROCM_PATH}/rocblas") -# KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE -# LIBRARIES -# rocblas -# LIBRARY_PATHS -# ${ROCBLAS_ROOT}/lib -# HEADERS -# rocblas.h -# 
HEADER_PATHS -# ${ROCBLAS_ROOT}/include -# ) -#ELSE() -# MESSAGE(ERROR "rocBLAS was not detected properly, please disable it or provide sufficient information at configure time.") -# # Todo: figure out how to use the target defined during rocblas installation -# # FIND_PACKAGE(ROCBLAS REQUIRED) -# # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) -# # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -#ENDIF() diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake index b64b80677c..5f985ff3a8 100644 --- a/cmake/Modules/FindTPLROCSPARSE.cmake +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -1,6 +1,4 @@ # MPL: 05/01/2023: This file follows the partern of FindTPLROCBLAS.cmake - -# MPL: v3 FIND_PACKAGE(ROCSPARSE) if(TARGET roc::rocsparse) SET(TPL_ROCSPARSE_IMPORTED_NAME roc::rocsparse) @@ -9,73 +7,3 @@ if(TARGET roc::rocsparse) ELSE() MESSAGE(FATAL_ERROR "Package ROCSPARSE requested but not found") ENDIF() - -## MPL: v2 -#IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -#ELSEIF (ROCSPARSE_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) -#ELSEIF (ROCSPARSE_LIBRARY_DIRS) -#ELSE() -# FIND_PACKAGE(ROCSPARSE) -# if(NOT TARGET roc::rocsparse) -# MESSAGE( "TARGET roc::ROCSPARSE NOT FOUND") -# #Important note here: this find Module is named TPLROCSPARSE -# #The eventual target is named roc::ROCSPARSE. To avoid naming conflicts -# #the find module is called TPLROCSPARSE. 
This call will cause -# #the find_package call to fail in a "standard" CMake way -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_FOUND) -# ELSE() -# # if the package is found, we can verify that some variables are defined using FIND_PACKAGE_HANDLE_STANDARD_ARGS -# MESSAGE( "TARGET roc::ROCSPARSE FOUND") -# #The libraries might be empty - OR they might explicitly be not found -# IF("${ROCSPARSE_LIBRARIES}" MATCHES "NOTFOUND") -# MESSAGE( "ROCSPARSE_LIBRARIES is not found") - -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLROCSPARSE REQUIRED_VARS ROCSPARSE_LIBRARIES) -# ELSE() -# MESSAGE( "ROCSPARSE_LIBRARIES is not found") -# message("TPLROCSPARSE LIBRARIES VARIABLE IS ${ROCSPARSE_LIBRARIES}") -# KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) -# ENDIF() -# endif() -#ENDIF() - -## MPL: v1 -#IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -#ELSEIF (ROCSPARSE_LIBRARIES) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) -#ELSEIF (ROCSPARSE_LIBRARY_DIRS) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES rocsparse LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) -#ELSEIF (KokkosKernels_ROCSPARSE_ROOT) -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE -# LIBRARIES -# rocsparse -# LIBRARY_PATHS -# ${KokkosKernels_ROCSPARSE_ROOT}/lib -# HEADERS -# rocsparse.h -# HEADER_PATHS -# ${KokkosKernels_ROCSPARSE_ROOT}/include -# ) -#ELSEIF (DEFINED ENV{ROCM_PATH}) -# MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") -# SET(ROCSPARSE_ROOT "$ENV{ROCM_PATH}/rocsparse") -# KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE -# LIBRARIES -# rocsparse -# LIBRARY_PATHS -# ${ROCSPARSE_ROOT}/lib -# HEADERS -# rocsparse.h -# HEADER_PATHS -# ${ROCSPARSE_ROOT}/include -# ) -#ELSE() -# MESSAGE(ERROR "rocSPARSE was not detected properly, please disable it or provide 
sufficient information at configure time.") -# # Todo: figure out how to use the target defined during rocsparse installation -# # FIND_PACKAGE(ROCSPARSE REQUIRED) -# # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) -# # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) -#ENDIF() From 45a8d3baf77c749b37899d4c53d2d47f28e17162 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Thu, 30 Mar 2023 10:25:33 -0600 Subject: [PATCH 176/442] address reviewer comments and run clang-format --- cm_generate_makefile.bash | 1 - sparse/src/KokkosSparse_mdf_handle.hpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 939be4aab3..d6c125899f 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -367,7 +367,6 @@ display_help_text() { echo "--disable-perftests: Do not build Kokkos Kernels performance tests" echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" - echo "--enable-perfsuite: build Kokkos Kernels performance tests with RAJAPerf Suite" echo "--export-compile-commands: export cmake compile_commands.json file" diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index 4e23280235..03fd660b95 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -62,7 +62,7 @@ struct MDF_handle { crs_matrix_type L, U; - MDF_handle(const crs_matrix_type & A) + MDF_handle(const crs_matrix_type& A) : numRows(A.numRows()), permutation(col_ind_type("row permutation", A.numRows())), permutation_inv(col_ind_type("inverse row permutation", A.numRows())), From 8e77c01cc80096f3efb3d96c721f66a1b628dca0 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 30 Mar 2023 14:37:41 -0600 Subject: [PATCH 177/442] TPLs: replicating changes 
made in Trilinos for ROCBLAS/ROCSPARSE See Trilinos PR #11746 https://github.com/trilinos/Trilinos/pull/11746 --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 4ce5a98dc0..d3b393ddde 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in From 1491bd433323ec9a8e1e9626adcc9e0fed6a301a Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Thu, 30 Mar 2023 23:21:22 -0600 Subject: [PATCH 178/442] Add exec instance support to sort/sort_and_merge utils (#1744) For each existing overload of the sort/sort_and_merge utilities (for crs matrix, crs graph, bsr matrix), add a version that accepts an execution space instance. The functions that don't take an instance are now implemented in terms of the ones that do. Test the new overloads. --- common/src/KokkosKernels_SimpleUtils.hpp | 44 ++++- sparse/src/KokkosSparse_SortCrs.hpp | 209 +++++++++++++++++------ sparse/unit_test/Test_Sparse_SortCrs.hpp | 74 ++++++-- 3 files changed, 252 insertions(+), 75 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 27a9d4ebe8..f24ffb49f1 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -79,22 +79,38 @@ struct InclusiveParallelPrefixSum { /*** * \brief Function performs the exclusive parallel prefix sum. That is each - * entry holds the sum until itself. \param num_elements: size of the array + * entry holds the sum until itself. 
+ * \param exec: the execution space instance on which to run + * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. */ template inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr) { + const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr) { typedef Kokkos::RangePolicy my_exec_space; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr)); } +/*** + * \brief Function performs the exclusive parallel prefix sum. That is each + * entry holds the sum until itself. + * \param num_elements: size of the array + * \param arr: the array for which the prefix sum will be performed. + */ +template +inline void kk_exclusive_parallel_prefix_sum( + typename view_t::value_type num_elements, view_t arr) { + kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr); +} + /*** * \brief Function performs the exclusive parallel prefix sum. That is each * entry holds the sum until itself. This version also returns the final sum * equivalent to the sum-reduction of arr before doing the scan. + * \param exec: the execution space instance on which to run * \param num_elements: size of the array * \param arr: the array for which the prefix sum will be performed. 
* \param finalSum: will be set to arr[num_elements - 1] after computing the @@ -102,14 +118,30 @@ inline void kk_exclusive_parallel_prefix_sum( */ template inline void kk_exclusive_parallel_prefix_sum( - typename view_t::value_type num_elements, view_t arr, - typename view_t::non_const_value_type &finalSum) { + const MyExecSpace &exec, typename view_t::value_type num_elements, + view_t arr, typename view_t::non_const_value_type &finalSum) { typedef Kokkos::RangePolicy my_exec_space; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + my_exec_space(exec, 0, num_elements), ExclusiveParallelPrefixSum(arr), finalSum); } +/*** + * \brief Function performs the exclusive parallel prefix sum. That is each + * entry holds the sum until itself. This version also returns the final sum + * equivalent to the sum-reduction of arr before doing the scan. + * \param num_elements: size of the array + * \param arr: the array for which the prefix sum will be performed. + * \param finalSum: will be set to arr[num_elements - 1] after computing the + * prefix sum. + */ +template +inline void kk_exclusive_parallel_prefix_sum( + typename view_t::value_type num_elements, view_t arr, + typename view_t::non_const_value_type &finalSum) { + kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); +} + /*** * \brief Function performs the inclusive parallel prefix sum. That is each * entry holds the sum until itself including itself. 
\param num_elements: size diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index d04ddb5a30..6cdfd9c1c9 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -33,44 +33,95 @@ template +void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, + const rowmap_t& rowmap, const entries_t& entries, + const values_t& values); + +// Sort a BRS matrix: within each row, sort entries ascending by column and +// permute the values accordingly. template void sort_bsr_matrix(const bsrMat_t& A); +// Sort a BRS matrix on the given execution space instance: within each row, +// sort entries ascending by column and permute the values accordingly. +template +void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, + const bsrMat_t& A); + // ---------------------------------- // CRS matrix/graph sorting utilities // ---------------------------------- // The sort_crs* functions sort the adjacent column list for each row into -// ascending order. +// ascending order. Each version either takes an execution space instance as a +// parameter, or uses the default instance. 
template void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values); +template +void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values); + template void sort_crs_matrix(const crsMat_t& A); +template +void sort_crs_matrix(const typename crsMat_t::execution_space& exec, + const crsMat_t& A); + template void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); +template +void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, + const entries_t& entries); + template void sort_crs_graph(const crsGraph_t& G); +template +void sort_crs_graph(const typename crsGraph_t::execution_space& exec, + const crsGraph_t& G); + // sort_and_merge_matrix produces a new matrix which is equivalent to A but is // sorted and has no duplicated entries: each (i, j) is unique. Values for -// duplicated entries are summed. +// duplicated entries are summed. Each version either takes an execution space +// instance as a parameter, or uses the default instance. 
+ template crsMat_t sort_and_merge_matrix(const crsMat_t& A); +template +crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, + const crsMat_t& A); + template crsGraph_t sort_and_merge_graph(const crsGraph_t& G); +template +crsGraph_t sort_and_merge_graph( + const typename crsGraph_t::execution_space& exec, const crsGraph_t& G); + template void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out); +template +void sort_and_merge_graph(const exec_space& exec, + const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out); + namespace Impl { template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values) { +void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); @@ -371,7 +422,7 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, funct(useRadix, rowmap, entries, values); if (useRadix) { Kokkos::parallel_for("sort_crs_matrix", - Kokkos::RangePolicy(0, numRows), + Kokkos::RangePolicy(exec, 0, numRows), funct); } else { // Try to get teamsize to be largest power of 2 not greater than avg entries @@ -383,33 +434,40 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, while (idealTeamSize < avgDeg / 2) { idealTeamSize *= 2; } - team_pol temp(numRows, 1); + team_pol temp(exec, numRows, 1); lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); + Kokkos::parallel_for("sort_crs_matrix", team_pol(exec, numRows, teamSize), + funct); } } 
+template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { + sort_crs_matrix(execution_space(), rowmap, entries, values); +} + +template +void sort_crs_matrix(const typename crsMat_t::execution_space& exec, + const crsMat_t& A) { + sort_crs_matrix(exec, A.graph.row_map, A.graph.entries, A.values); +} + template void sort_crs_matrix(const crsMat_t& A) { - // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using exec_space = typename crsMat_t::execution_space; - // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the - // entries and CrsMatrix values are non-const (so sorting them directly - // is allowed) - sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); + sort_crs_matrix(typename crsMat_t::execution_space(), A.graph.row_map, + A.graph.entries, A.values); } // Sort a BRS matrix: within each row, sort entries ascending by column and // permute the values accordingly. template -void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, - const entries_t& entries, const values_t& values) { +void sort_bsr_matrix(const execution_space& exec, const lno_t blockdim, + const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { // TODO: this is O(N^2) mock for debugging - do regular implementation based // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ? 
@@ -421,26 +479,40 @@ void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, Impl::sort_bsr_functor bsr_sorter( rowmap, entries, values, blocksize); Kokkos::parallel_for("sort_bsr_matrix", - Kokkos::RangePolicy(0, numRows), + Kokkos::RangePolicy(exec, 0, numRows), bsr_sorter); } +template +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { + sort_bsr_matrix(execution_space(), blockdim, rowmap, entries, values); +} + // Sort a BSR matrix (like CRS but single values are replaced with contignous // blocks) template -void sort_bsr_matrix(const bsrMat_t& A) { +void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, + const bsrMat_t& A) { // NOTE: unlike rowmap, entries and values are non-const, so we can sort them // directly sort_bsr_matrix( - A.blockDim(), A.graph.row_map, A.graph.entries, A.values); + exec, A.blockDim(), A.graph.row_map, A.graph.entries, A.values); +} + +template +void sort_bsr_matrix(const bsrMat_t& A) { + sort_bsr_matrix(typename bsrMat_t::execution_space(), A); } // Sort a CRS graph: within each row, sort entries ascending by column. 
template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { +void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, + const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); @@ -450,7 +522,7 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { useRadix, rowmap, entries); if (useRadix) { Kokkos::parallel_for("sort_crs_graph", - Kokkos::RangePolicy(0, numRows), + Kokkos::RangePolicy(exec, 0, numRows), funct); } else { // Try to get teamsize to be largest power of 2 less than or equal to @@ -463,26 +535,37 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { while (idealTeamSize < avgDeg / 2) { idealTeamSize *= 2; } - team_pol temp(numRows, 1); + team_pol temp(exec, numRows, 1); lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); + Kokkos::parallel_for("sort_crs_graph", team_pol(exec, numRows, teamSize), + funct); } } +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { + sort_crs_graph(execution_space(), rowmap, entries); +} + template -void sort_crs_graph(const crsGraph_t& G) { +void sort_crs_graph(const typename crsGraph_t::execution_space& exec, + const crsGraph_t& G) { static_assert( !std::is_const::value, "sort_crs_graph requires StaticCrsGraph entries to be non-const."); - sort_crs_graph(G.row_map, G.entries); + sort_crs_graph(exec, G.row_map, G.entries); +} + +template +void sort_crs_graph(const crsGraph_t& G) { + sort_crs_graph(typename crsGraph_t::execution_space(), G); } // Sort the rows of matrix, and merge duplicate entries. 
template -crsMat_t sort_and_merge_matrix(const crsMat_t& A) { +crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, + const crsMat_t& A) { using c_rowmap_t = typename crsMat_t::row_map_type; using rowmap_t = typename crsMat_t::row_map_type::non_const_type; using entries_t = typename crsMat_t::index_type::non_const_type; @@ -490,25 +573,27 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) { using size_type = typename rowmap_t::non_const_value_type; using exec_space = typename crsMat_t::execution_space; using range_t = Kokkos::RangePolicy; - sort_crs_matrix(A); + sort_crs_matrix(exec, A); // Count entries per row into a new rowmap, in terms of merges that can be // done - rowmap_t mergedRowmap( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), - A.numRows() + 1); + rowmap_t mergedRowmap(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged rowmap"), + A.numRows() + 1); size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, A.numRows()), + Kokkos::parallel_reduce(range_t(exec, 0, A.numRows()), Impl::MergedRowmapFunctor( mergedRowmap, A.graph.row_map, A.graph.entries), numCompressedEntries); // Prefix sum to get rowmap KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - A.numRows() + 1, mergedRowmap); - entries_t mergedEntries("SortedMerged entries", numCompressedEntries); - values_t mergedValues("SortedMerged values", numCompressedEntries); + exec, A.numRows() + 1, mergedRowmap); + entries_t mergedEntries(Kokkos::view_alloc(exec, "SortedMerged entries"), + numCompressedEntries); + values_t mergedValues(Kokkos::view_alloc(exec, "SortedMerged values"), + numCompressedEntries); // Compute merged entries and values Kokkos::parallel_for( - range_t(0, A.numRows()), + range_t(exec, 0, A.numRows()), Impl::MatrixMergedEntriesFunctor( A.graph.row_map, A.graph.entries, A.values, mergedRowmap, mergedEntries, mergedValues)); @@ -518,8 +603,14 @@ crsMat_t sort_and_merge_matrix(const 
crsMat_t& A) { mergedEntries); } +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + return sort_and_merge_matrix(typename crsMat_t::execution_space(), A); +} + template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, +void sort_and_merge_graph(const exec_space& exec, + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out) { using size_type = typename rowmap_t::non_const_value_type; @@ -535,30 +626,41 @@ void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, } numRows--; // Sort in place - sort_crs_graph(rowmap_in, entries_in); + sort_crs_graph(exec, rowmap_in, + entries_in); // Count entries per row into a new rowmap, in terms of merges that can be // done - rowmap_out = rowmap_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), - numRows + 1); + rowmap_out = rowmap_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged rowmap"), + numRows + 1); size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, numRows), + Kokkos::parallel_reduce(range_t(exec, 0, numRows), Impl::MergedRowmapFunctor( rowmap_out, rowmap_in, entries_in), numCompressedEntries); // Prefix sum to get rowmap KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - numRows + 1, rowmap_out); - entries_out = entries_t("SortedMerged entries", numCompressedEntries); + exec, numRows + 1, rowmap_out); + entries_out = entries_t(Kokkos::view_alloc(exec, "SortedMerged entries"), + numCompressedEntries); // Compute merged entries and values Kokkos::parallel_for( - range_t(0, numRows), + range_t(exec, 0, numRows), Impl::GraphMergedEntriesFunctor( rowmap_in, entries_in, rowmap_out, entries_out)); } +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out) { + return sort_and_merge_graph(exec_space(), rowmap_in, entries_in, 
rowmap_out, + entries_out); +} + template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { +crsGraph_t sort_and_merge_graph( + const typename crsGraph_t::execution_space& exec, const crsGraph_t& G) { using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; using entries_t = typename crsGraph_t::entries_type; static_assert( @@ -567,11 +669,16 @@ crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { rowmap_t mergedRowmap; entries_t mergedEntries; sort_and_merge_graph(G.row_map, G.entries, mergedRowmap, + entries_t>(exec, G.row_map, G.entries, mergedRowmap, mergedEntries); return crsGraph_t(mergedEntries, mergedRowmap); } +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { + return sort_and_merge_graph(typename crsGraph_t::execution_space(), G); +} + } // namespace KokkosSparse namespace KokkosKernels { diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 935b994045..089fdd73c7 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -34,7 +34,8 @@ template void testSortCRS(default_lno_t numRows, default_lno_t numCols, - default_size_type nnz, bool doValues, bool doStructInterface) { + default_size_type nnz, bool doValues, bool doStructInterface, + bool useExecInstance) { using scalar_t = default_scalar; using lno_t = default_lno_t; using size_type = default_size_type; @@ -89,17 +90,36 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, // call the actual sort routine being tested if (doValues) { if (doStructInterface) { - KokkosSparse::sort_crs_matrix(A); + if (useExecInstance) { + KokkosSparse::sort_crs_matrix(exec_space(), A); + } else { + KokkosSparse::sort_crs_matrix(A); + } } else { - KokkosSparse::sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); + if (useExecInstance) { + KokkosSparse::sort_crs_matrix(exec_space(), A.graph.row_map, + A.graph.entries, A.values); + } else { + 
KokkosSparse::sort_crs_matrix(A.graph.row_map, + A.graph.entries, A.values); + } } } else { if (doStructInterface) { - KokkosSparse::sort_crs_graph(A.graph); + if (useExecInstance) { + KokkosSparse::sort_crs_graph(exec_space(), A.graph); + } else { + KokkosSparse::sort_crs_graph(A.graph); + } } else { - KokkosSparse::sort_crs_graph( - A.graph.row_map, A.graph.entries); + if (useExecInstance) { + KokkosSparse::sort_crs_graph(exec_space(), A.graph.row_map, + A.graph.entries); + } else { + KokkosSparse::sort_crs_graph( + A.graph.row_map, A.graph.entries); + } } } // Copy to host and compare @@ -166,7 +186,7 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { } template -void testSortAndMerge() { +void testSortAndMerge(bool useExecInstance) { using size_type = default_size_type; using lno_t = default_lno_t; using scalar_t = default_scalar; @@ -214,7 +234,12 @@ void testSortAndMerge() { Kokkos::deep_copy(devInValues, hostInValues); crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, devInEntries); - crsMat_t output = KokkosSparse::sort_and_merge_matrix(input); + crsMat_t output; + if (useExecInstance) { + output = KokkosSparse::sort_and_merge_matrix(exec_space(), input); + } else { + output = KokkosSparse::sort_and_merge_matrix(input); + } exec_space().fence(); EXPECT_EQ(output.numRows(), nrows); EXPECT_EQ(output.numCols(), ncols); @@ -254,29 +279,42 @@ void testSortAndMerge() { TEST_F(TestCategory, common_sort_crsgraph) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, false, doStructInterface); - testSortCRS(100, 100, 2000, false, doStructInterface); - testSortCRS(1000, 1000, 30000, false, doStructInterface); + for (int useExecInstance = 0; useExecInstance < 2; useExecInstance++) { + testSortCRS(10, 10, 20, false, doStructInterface, + useExecInstance); + testSortCRS(100, 100, 2000, false, doStructInterface, + useExecInstance); + testSortCRS(1000, 1000, 30000, false, 
doStructInterface, + useExecInstance); + } testSortCRSUnmanaged(false, doStructInterface); } } TEST_F(TestCategory, common_sort_crsmatrix) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, true, doStructInterface); - testSortCRS(100, 100, 2000, true, doStructInterface); - testSortCRS(1000, 1000, 30000, true, doStructInterface); + for (int useExecInstance = 0; useExecInstance < 2; useExecInstance++) { + testSortCRS(10, 10, 20, true, doStructInterface, + useExecInstance); + testSortCRS(100, 100, 2000, true, doStructInterface, + useExecInstance); + testSortCRS(1000, 1000, 30000, true, doStructInterface, + useExecInstance); + } testSortCRSUnmanaged(true, doStructInterface); } } TEST_F(TestCategory, common_sort_crs_longrows) { - testSortCRS(1, 50000, 10000, false, false); - testSortCRS(1, 50000, 10000, true, false); + testSortCRS(1, 50000, 10000, false, false, false); + testSortCRS(1, 50000, 10000, true, false, false); + testSortCRS(1, 50000, 10000, false, false, true); + testSortCRS(1, 50000, 10000, true, false, true); } TEST_F(TestCategory, common_sort_merge_crsmatrix) { - testSortAndMerge(); + testSortAndMerge(false); + testSortAndMerge(true); } #endif // KOKKOSSPARSE_SORTCRSTEST_HPP From aaadaa0dd321ad72075b79c9532e36cba3ee5aff Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 31 Mar 2023 07:50:48 -0600 Subject: [PATCH 179/442] docs: Include BatchedGemm --- docs/developer/apidocs.rst | 1 + docs/developer/apidocs/batched_dense_host.rst | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 docs/developer/apidocs/batched_dense_host.rst diff --git a/docs/developer/apidocs.rst b/docs/developer/apidocs.rst index 82797c5801..a3df431c6a 100644 --- a/docs/developer/apidocs.rst +++ b/docs/developer/apidocs.rst @@ -11,4 +11,5 @@ The source documentation is extracted from the C++ files using Doxygen. 
apidocs/blas3 apidocs/sparse apidocs/batched_dense + apidocs/batched_dense_host apidocs/batched_sparse \ No newline at end of file diff --git a/docs/developer/apidocs/batched_dense_host.rst b/docs/developer/apidocs/batched_dense_host.rst new file mode 100644 index 0000000000..d6392067b4 --- /dev/null +++ b/docs/developer/apidocs/batched_dense_host.rst @@ -0,0 +1,8 @@ +BATCHED -- KokkosKernels batched host-level interfaces +========================================================= + +BatchedGemm +----------- +.. doxygenfunction:: KokkosBatched::BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) +.. doxygenclass:: KokkosBatched::BatchedGemmHandle + :members: \ No newline at end of file From f2c217d57805ab5e5222c16f1750535460567630 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 31 Mar 2023 09:52:29 -0600 Subject: [PATCH 180/442] .github: Update to actions/checkout@v3 --- .github/workflows/docs.yml | 4 ++-- .github/workflows/format.yml | 2 +- .github/workflows/osx.yml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 5152d9f83a..b429ec415f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -21,12 +21,12 @@ jobs: pip install sphinx-rtd-theme - name: checkout_kokkos_kernels - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: kokkos/kokkos ref: develop diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index a7b90c54b4..220461fe62 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Install Dependencies run: sudo apt install clang-format-8 diff 
--git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 6f4e362d89..769957b953 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -50,12 +50,12 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: kokkos/kokkos ref: develop From 01c49a8d26d6a9e3fdf73f49d3ac82565f0c145e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 31 Mar 2023 10:30:23 -0600 Subject: [PATCH 181/442] docs: Add stubs for some sparse APIs --- docs/developer/apidocs/sparse.rst | 34 +++- docs/developer/contrib.rst | 8 + sparse/src/KokkosSparse_gauss_seidel.hpp | 220 +++++++++++++++++++++++ sparse/src/KokkosSparse_gmres.hpp | 10 ++ sparse/src/KokkosSparse_par_ilut.hpp | 33 ++++ sparse/src/KokkosSparse_spgemm.hpp | 69 ++++++- 6 files changed, 370 insertions(+), 4 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index ed877ac567..cb9aef9af2 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -37,8 +37,36 @@ trsv spgemm ------ -.. doxygenfunction:: KokkosSparse::spgemm +.. doxygenfunction:: spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) +.. doxygenfunction:: spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) +.. doxygenfunction:: spgemm(const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode) -gauss +block_spgemm +------------ +.. doxygenfunction:: block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, const bool transposeA, const BMatrixType& B,const bool transposeB, CMatrixType& C) +.. 
doxygenfunction:: block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) + +gauss_seidel +------------ +.. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) + +block_gauss_seidel +------------------ +..
doxygenfunction:: block_gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: block_gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols,typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_block_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_block_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. 
doxygenfunction:: backward_sweep_block_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) + +par_ilut +-------- +.. doxygenfunction:: par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, URowMapType& U_rowmap) +.. doxygenfunction:: par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, UEntriesType& U_entries, UValuesType& U_values, bool deterministic) + +gmres ----- -.. doxygenfunction:: KokkosSparse::gauss +.. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) \ No newline at end of file diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst index 0b02ebf190..c0117126bd 100644 --- a/docs/developer/contrib.rst +++ b/docs/developer/contrib.rst @@ -24,6 +24,14 @@ In general, we prefer that the prototype has the doxygen style comment rather th KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &); +**NOTE:** To have vscode generate the "\\\\\\" style stubs: + +1. install the C/C++ IntelliSense, debugging, and code browsing extension. + +2. go to Settings, Extensions, C/C++, Doxygen Documentation Generator Settings, and ensure the setting for Doxdocgen is "\\\\\\". + +3. place your cursor on the line above `template ...` and type "\\\\\\".
+ Library policies ---------------- diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 89654243cc..a2c6e89e82 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -25,6 +25,19 @@ namespace KokkosSparse { namespace Experimental { +/// +/// @brief +/// +/// @tparam KernelHandle +/// @tparam lno_row_view_t_ +/// @tparam lno_nnz_view_t_ +/// @param handle +/// @param num_rows +/// @param num_cols +/// @param row_map +/// @param entries +/// @param is_graph_symmetric +/// template void gauss_seidel_symbolic(KernelHandle *handle, @@ -85,6 +98,20 @@ void gauss_seidel_symbolic(KernelHandle *handle, is_graph_symmetric); } +/// +/// @brief +/// +/// @tparam KernelHandle +/// @tparam lno_row_view_t_ +/// @tparam lno_nnz_view_t_ +/// @param handle +/// @param num_rows +/// @param num_cols +/// @param block_size +/// @param row_map +/// @param entries +/// @param is_graph_symmetric +/// template void block_gauss_seidel_symbolic( @@ -104,6 +131,22 @@ void block_gauss_seidel_symbolic( is_graph_symmetric); } +/// +/// @brief +/// +/// @tparam format +/// @tparam KernelHandle +/// @tparam lno_row_view_t_ +/// @tparam lno_nnz_view_t_ +/// @tparam scalar_nnz_view_t_ +/// @param handle +/// @param num_rows +/// @param num_cols +/// @param row_map +/// @param entries +/// @param values +/// @param is_graph_symmetric +/// template ::type, \ typename std::remove_const::type>::value +/// @brief +/// @tparam KernelHandle +/// @tparam AMatrix +/// @tparam BType +/// @tparam XType +/// @param handle +/// @param A +/// @param B +/// @param X +/// @param precond template void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 21371792c0..9b9b8f9923 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -44,6 +44,17 @@ namespace Experimental { 
std::is_same::type, \ typename std::remove_const::type>::value +/// @brief +/// @tparam KernelHandle +/// @tparam ARowMapType +/// @tparam AEntriesType +/// @tparam LRowMapType +/// @tparam URowMapType +/// @param handle +/// @param A_rowmap +/// @param A_entries +/// @param L_rowmap +/// @param U_rowmap template void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, @@ -165,6 +176,28 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, } // par_ilut_symbolic +/// @brief +/// @tparam KernelHandle +/// @tparam ARowMapType +/// @tparam AEntriesType +/// @tparam AValuesType +/// @tparam LRowMapType +/// @tparam LEntriesType +/// @tparam LValuesType +/// @tparam URowMapType +/// @tparam UEntriesType +/// @tparam UValuesType +/// @param handle +/// @param A_rowmap +/// @param A_entries +/// @param A_values +/// @param L_rowmap +/// @param L_entries +/// @param L_values +/// @param U_rowmap +/// @param U_entries +/// @param U_values +/// @param deterministic template void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -54,7 +68,20 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, entriesC); } -// Symbolic phase for block SpGEMM (BSR matrices) +/// +/// @brief Symbolic phase for block SpGEMM (BSR matrices) +/// +/// @tparam KernelHandle +/// @tparam AMatrixType +/// @tparam BMatrixType +/// @tparam CMatrixType +/// @param kh +/// @param A +/// @param transposeA +/// @param B +/// @param transposeB +/// @param C +/// template void block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, @@ -95,6 +122,20 @@ void block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, row_mapC, entriesC, blockDim); } +/// +/// @brief +/// +/// @tparam KernelHandle +/// @tparam AMatrix +/// @tparam BMatrix +/// @tparam CMatrix +/// @param kh +/// @param A +/// @param Amode +/// @param B +/// @param Bmode +/// @param C +/// template void 
spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -108,6 +149,20 @@ void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, B.values, Bmode, C.graph.row_map, C.graph.entries, C.values); } +/// +/// @brief +/// +/// @tparam KernelHandle +/// @tparam AMatrix +/// @tparam BMatrix +/// @tparam CMatrix +/// @param kh +/// @param A +/// @param Amode +/// @param B +/// @param Bmode +/// @param C +/// template void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -123,6 +178,18 @@ void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, B.values, Bmode, C.graph.row_map, C.graph.entries, C.values, blockDim); } +/// +/// @brief +/// +/// @tparam CMatrix +/// @tparam AMatrix +/// @tparam BMatrix +/// @param A +/// @param Amode +/// @param B +/// @param Bmode +/// @return CMatrix +/// template CMatrix spgemm(const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode) { From 3065eb31cc834a051c2d573a5ad3f2c08837db13 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 31 Mar 2023 12:34:05 -0600 Subject: [PATCH 182/442] ROCSPARSE: fix unused variable in unit-test --- sparse/unit_test/Test_Sparse_rocsparse.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sparse/unit_test/Test_Sparse_rocsparse.hpp b/sparse/unit_test/Test_Sparse_rocsparse.hpp index f4c9e4e741..87ec7877b8 100644 --- a/sparse/unit_test/Test_Sparse_rocsparse.hpp +++ b/sparse/unit_test/Test_Sparse_rocsparse.hpp @@ -66,6 +66,7 @@ void test_rocsparse_safe_call() { void test_rocsparse_singleton() { KokkosKernels::Impl::RocsparseSingleton& s = KokkosKernels::Impl::RocsparseSingleton::singleton(); + (void) s; } TEST_F(TestCategory, sparse_rocsparse_version) { test_rocsparse_version(); } From a9189f56a9909c69d39ead635f7edea4b8d95a23 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 31 Mar 2023 12:39:51 
-0600 Subject: [PATCH 183/442] clang-format... --- sparse/unit_test/Test_Sparse_rocsparse.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_rocsparse.hpp b/sparse/unit_test/Test_Sparse_rocsparse.hpp index 87ec7877b8..804c777daa 100644 --- a/sparse/unit_test/Test_Sparse_rocsparse.hpp +++ b/sparse/unit_test/Test_Sparse_rocsparse.hpp @@ -66,7 +66,7 @@ void test_rocsparse_safe_call() { void test_rocsparse_singleton() { KokkosKernels::Impl::RocsparseSingleton& s = KokkosKernels::Impl::RocsparseSingleton::singleton(); - (void) s; + (void)s; } TEST_F(TestCategory, sparse_rocsparse_version) { test_rocsparse_version(); } From 75c14cd0bb36d201e24d48d7d57a732d462326d7 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 31 Mar 2023 13:56:30 -0600 Subject: [PATCH 184/442] Add par_ilut symbolic docs --- sparse/src/KokkosSparse_par_ilut.hpp | 32 ++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 9b9b8f9923..84d2f7c89e 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -44,17 +44,27 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value -/// @brief -/// @tparam KernelHandle -/// @tparam ARowMapType -/// @tparam AEntriesType -/// @tparam LRowMapType -/// @tparam URowMapType -/// @param handle -/// @param A_rowmap -/// @param A_entries -/// @param L_rowmap -/// @param U_rowmap +/// @brief Performs the symbolic phase of par_ilut (non-blocking). +/// +/// The sparsity pattern of A will be analyzed and L_rowmap and U_rowmap will be +/// populated with the L (lower triangular) and U (upper triangular) non-zero +/// counts respectively. Having a separate symbolic phase allows for reuse when +/// dealing with multiple matrices with the same sparsity pattern.
This routine +/// will set some values on handle for symbolic info (row count, nnz counts). +/// +/// @tparam KernelHandle Template for the KernelHandle type +/// @tparam ARowMapType Template for A_rowmap type +/// @tparam AEntriesType Template for A_entries type +/// @tparam LRowMapType Template for L_rowmap type +/// @tparam URowMapType Template for U_rowmap type +/// @param handle The kernel handle. It is expected that create_par_ilut_handle +/// has been called on it +/// @param A_rowmap The row map (row nnz offsets) for the A CSR (Input) +/// @param A_entries The entries (column ids) for the A CSR (Input) +/// @param L_rowmap The row map for the L CSR, should already be sized correctly +/// (numRows+1) (Output) +/// @param U_rowmap The row map for the U CSR, should already be sized correctly +/// (numRows+1) (Output) template void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, From c9d22ca1bcb39ddfbbe2163647c13c74e570efbb Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 19 Jan 2023 10:55:36 +0100 Subject: [PATCH 185/442] #8: made functionnal current version (v1) for MKL --- cmake/Modules/FindTPLMKL.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 3d5f297f52..97e7fb6b6c 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -1,3 +1,5 @@ +message("CMAKE_CXX_COMPILER_ID : ${CMAKE_CXX_COMPILER_ID}") + IF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") TRY_COMPILE(KOKKOSKERNELS_HAS_MKL_ARG ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests @@ -21,6 +23,10 @@ ELSEIF(WIN32) ) ENDIF() ELSE() + +#find_package(mkl) + #V1: old version plus small modif on header : +# ${MKL_ROOT}/include => ${MKL_ROOT}/include/mkl IF (NOT DEFINED ENV{MKLROOT}) SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") @@ -44,7 +50,7 @@ ELSE() HEADER mkl.h HEADER_PATHS 
- ${MKL_ROOT}/include + ${MKL_ROOT}/include/mkl ) ENDIF() ENDIF() From 5edb51a459ed860f9bc600cf2e7132373c4ebdbe Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 2 Mar 2023 14:07:30 +0100 Subject: [PATCH 186/442] #8 update FindTPLMKL.cmake to use find_package(MKL) --- cmake/Modules/FindTPLMKL.cmake | 126 +++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 53 deletions(-) diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 97e7fb6b6c..ee769bc4fa 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -1,56 +1,76 @@ -message("CMAKE_CXX_COMPILER_ID : ${CMAKE_CXX_COMPILER_ID}") - -IF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - TRY_COMPILE(KOKKOSKERNELS_HAS_MKL_ARG - ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests - ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/mkl.cpp - LINK_LIBRARIES -mkl - COMPILE_DEFINITIONS -mkl) - KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE COMPILE_OPTIONS -mkl LINK_OPTIONS -mkl) - INCLUDE(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLMKL DEFAULT_MSG KOKKOSKERNELS_HAS_MKL_ARG) -ELSEIF(WIN32) - SET(BLA_VENDOR Intel10_64lp) - FIND_PACKAGE(BLAS REQUIRED) - IF (NOT DEFINED ENV{MKLROOT}) - SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") - MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) - ELSE() - KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE - LINK_OPTIONS ${BLAS_LINKER_FLAGS} - LINK_LIBRARIES ${BLAS_LIBRARIES} - ) +find_package(MKL) +if(TARGET MKL::MKL) + MESSAGE("TARGET MKL::MKL FOUND") + SET(TPL_MKL_IMPORTED_NAME MKL::MKL) + SET(TPL_IMPORTED_NAME MKL::MKL) + ADD_LIBRARY(MKL INTERFACE) + TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL) + ADD_LIBRARY(KokkosKernels::MKL ALIAS MKL ) + GET_TARGET_PROPERTY(LIB_TYPE ${TPL_IMPORTED_NAME} TYPE) + MESSAGE("LIB_TYPE: ${LIB_TYPE}") +# kokkoskernels_export_imported_tpl install MKL with target name 
MKL instead of +# MKL::MKL or KokkosKernels::MKL, so we need to install a specific ALIAS one + if(TARGET MKL) + MESSAGE("TARGET MKL CREATED") ENDIF() -ELSE() -#find_package(mkl) - #V1: old version plus small modif on header : -# ${MKL_ROOT}/include => ${MKL_ROOT}/include/mkl - IF (NOT DEFINED ENV{MKLROOT}) - SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") - MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) - ELSE() - SET(MKL_ROOT $ENV{MKLROOT}) - #go ahead and use LD_LIBRARY_PATH to find certain libs - LIST(APPEND ENV_LIBDIRS ENV LD_LIBRARY_PATH) - #override what CMake looks for - #gnu_thread does not work on some platforms - #just always use intel_thread - KOKKOSKERNELS_FIND_IMPORTED(MKL INTERFACE - LIBRARIES - mkl_intel_lp64 - mkl_intel_thread - mkl_core - iomp5 - LIBRARY_PATHS - ${MKL_ROOT}/lib/intel64 - ${ENV_LIBDIRS} - HEADER - mkl.h - HEADER_PATHS - ${MKL_ROOT}/include/mkl - ) - ENDIF() +ELSE() + MESSAGE(FATAL_ERROR "Package MKL requested but not found") ENDIF() + + + +#IF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") +# TRY_COMPILE(KOKKOSKERNELS_HAS_MKL_ARG +# ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests +# ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/mkl.cpp +# LINK_LIBRARIES -mkl +# COMPILE_DEFINITIONS -mkl) +# KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE COMPILE_OPTIONS -mkl LINK_OPTIONS -mkl) +# INCLUDE(FindPackageHandleStandardArgs) +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLMKL DEFAULT_MSG KOKKOSKERNELS_HAS_MKL_ARG) +#ELSEIF(WIN32) +# SET(BLA_VENDOR Intel10_64lp) +# FIND_PACKAGE(BLAS REQUIRED) +# IF (NOT DEFINED ENV{MKLROOT}) +# SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") +# MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) +# ELSE() +# KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE +# LINK_OPTIONS ${BLAS_LINKER_FLAGS} +# LINK_LIBRARIES 
${BLAS_LIBRARIES} +# ) +# ENDIF() +#ELSE() + +##find_package(mkl) +# #V1: old version plus small modif on header : +## ${MKL_ROOT}/include => ${MKL_ROOT}/include/mkl +# IF (NOT DEFINED ENV{MKLROOT}) +# SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") +# MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") +# FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) +# ELSE() +# SET(MKL_ROOT $ENV{MKLROOT}) +# #go ahead and use LD_LIBRARY_PATH to find certain libs +# LIST(APPEND ENV_LIBDIRS ENV LD_LIBRARY_PATH) +# #override what CMake looks for +# #gnu_thread does not work on some platforms +# #just always use intel_thread +# KOKKOSKERNELS_FIND_IMPORTED(MKL INTERFACE +# LIBRARIES +# mkl_intel_lp64 +# mkl_intel_thread +# mkl_core +# iomp5 +# LIBRARY_PATHS +# ${MKL_ROOT}/lib/intel64 +# ${ENV_LIBDIRS} +# HEADER +# mkl.h +# HEADER_PATHS +# ${MKL_ROOT}/include +# ) +# ENDIF() +#ENDIF() From 24cb9017b5330b242e2940c64c58782b4fd76a3e Mon Sep 17 00:00:00 2001 From: kliegeois Date: Thu, 30 Mar 2023 09:14:42 -0600 Subject: [PATCH 187/442] Add calls to KokkosBlas Gemv and Spmv for team batched kernels when m==1 --- .../KokkosBatched_Gemv_TeamVector_Impl.hpp | 12 ++ .../impl/KokkosBatched_Gemv_Team_Impl.hpp | 20 +++ .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 15 ++ .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 15 ++ blas/impl/KokkosBlas2_team_spmv_impl.hpp | 132 ++++++++++++++++ blas/impl/KokkosBlas2_team_spmv_spec.hpp | 68 +++++++++ blas/src/KokkosBlas2_team_spmv.hpp | 141 ++++++++++++++++++ 7 files changed, 403 insertions(+) create mode 100644 blas/impl/KokkosBlas2_team_spmv_impl.hpp create mode 100644 blas/impl/KokkosBlas2_team_spmv_spec.hpp create mode 100644 blas/src/KokkosBlas2_team_spmv.hpp diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index 26e22fb00c..a0b948bb13 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ 
b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -20,6 +20,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Gemv_TeamVector_Internal.hpp" +#include "KokkosBlas2_team_gemv.hpp" namespace KokkosBatched { @@ -48,6 +49,17 @@ struct TeamVectorGemv { static_assert(AViewType::rank == 3, "Batched TeamVectorGemv requires rank-3 A matrix (use " "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)"); + if (A.extent(0) == 1) { + KokkosBlas::TeamVectorGemv< + MemberType, Trans::NoTranspose, + Algo::Gemv::Unblocked>::invoke(member, alpha, + Kokkos::subview(A, 0, Kokkos::ALL, + Kokkos::ALL), + Kokkos::subview(x, 0, Kokkos::ALL), + beta, + Kokkos::subview(y, 0, Kokkos::ALL)); + return 0; + } return TeamVectorGemvInternal::template invoke< MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index b86796f4ff..1407bf43b4 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -20,6 +20,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Gemv_Team_Internal.hpp" +#include "KokkosBlas2_team_gemv.hpp" namespace KokkosBatched { @@ -48,6 +49,17 @@ struct TeamGemv { static_assert(AViewType::rank == 3, "Batched TeamGemv requires rank-3 A matrix (use " "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + if (A.extent(0) == 1) { + KokkosBlas::TeamGemv< + MemberType, Trans::NoTranspose, + Algo::Gemv::Unblocked>::invoke(member, alpha, + Kokkos::subview(A, 0, Kokkos::ALL, + Kokkos::ALL), + Kokkos::subview(x, 0, Kokkos::ALL), + beta, + Kokkos::subview(y, 0, Kokkos::ALL)); + return 0; + } return TeamGemvInternal::template invoke< MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( @@ -90,6 +102,14 @@ struct TeamGemv { static_assert(AViewType::rank == 3, "Batched TeamGemv 
requires rank-3 A matrix (use " "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + if (A.extent(0) == 1) { + KokkosBlas:: + TeamGemv::invoke( + member, alpha, Kokkos::subview(A, 0, Kokkos::ALL, Kokkos::ALL), + Kokkos::subview(x, 0, Kokkos::ALL), beta, + Kokkos::subview(y, 0, Kokkos::ALL)); + return 0; + } return TeamGemvInternal::template invoke< MemberType, ScalarType, typename AViewType::array_layout, typename AViewType::non_const_value_type>( diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 516aded68e..9dadce2e78 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -19,6 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosBlas2_team_spmv.hpp" namespace KokkosBatched { @@ -386,6 +387,13 @@ struct TeamVectorSpmv { return 1; } #endif + if (values.extent(0) == 1) { + KokkosBlas::Experimental::team_vector_spmv( + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), + row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), + beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + return 0; + } return TeamVectorSpmvInternal::template invoke< MemberType, typename alphaViewType::non_const_value_type, @@ -463,6 +471,13 @@ struct TeamVectorSpmv { return 1; } #endif + if (values.extent(0) == 1) { + KokkosBlas::Experimental::team_vector_spmv( + member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, + colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, + Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + return 0; + } return TeamVectorSpmvInternal::template invoke< MemberType, diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index d8282d0aeb..5c35feccdd 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ 
b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -19,6 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosBlas2_team_spmv.hpp" namespace KokkosBatched { @@ -237,6 +238,13 @@ struct TeamSpmv { return 1; } #endif + if (values.extent(0) == 1) { + KokkosBlas::Experimental::team_spmv( + member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), + row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), + beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + return 0; + } return TeamSpmvInternal::template invoke< MemberType, typename alphaViewType::non_const_value_type, @@ -314,6 +322,13 @@ struct TeamSpmv { return 1; } #endif + if (values.extent(0) == 1) { + KokkosBlas::Experimental::team_spmv( + member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, + colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, + Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); + return 0; + } return TeamSpmvInternal::template invoke< MemberType, diff --git a/blas/impl/KokkosBlas2_team_spmv_impl.hpp b/blas/impl/KokkosBlas2_team_spmv_impl.hpp new file mode 100644 index 0000000000..36313d51b3 --- /dev/null +++ b/blas/impl/KokkosBlas2_team_spmv_impl.hpp @@ -0,0 +1,132 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_TEAM_SPMV_IMPL_HPP_ +#define KOKKOSBLAS2_TEAM_SPMV_IMPL_HPP_ + +#include +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +struct TeamSpmvInternal { + template + KOKKOS_INLINE_FUNCTION static void invoke( + const MemberType& member, const OrdinalType numRows, + const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0); +}; + +struct TeamVectorSpmvInternal { + template + KOKKOS_INLINE_FUNCTION static void invoke( + const MemberType& member, const OrdinalType numRows, + const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, + const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, + const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0); +}; + +template +KOKKOS_INLINE_FUNCTION void TeamSpmvInternal::invoke( + const MemberType& member, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numRows), + [&](const OrdinalType& iRow) { + const OrdinalType rowLength = + 
row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + ValueType sum = 0; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (OrdinalType iEntry = 0; iEntry < rowLength; ++iEntry) { + sum += values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess0] * + x[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs0]; + } + + sum *= alpha; + + if (dobeta == 0) { + y[iRow * ys0] = sum; + } else { + y[iRow * ys0] = beta * y[iRow * ys0] + sum; + } + }); +} + +template +KOKKOS_INLINE_FUNCTION void TeamVectorSpmvInternal::invoke( + const MemberType& member, const OrdinalType numRows, const ScalarType alpha, + const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, + const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, + const OrdinalType* KOKKOS_RESTRICT colIndices, + const OrdinalType colIndicess0, const ValueType* KOKKOS_RESTRICT x, + const OrdinalType xs0, const ScalarType beta, + /**/ ValueType* KOKKOS_RESTRICT y, const OrdinalType ys0) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numRows), + [&](const OrdinalType& iRow) { + const OrdinalType rowLength = + row_ptr[(iRow + 1) * row_ptrs0] - row_ptr[iRow * row_ptrs0]; + + ValueType sum = 0; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, rowLength), + [&](const OrdinalType& iEntry, ValueType& val) { + val += values[(row_ptr[iRow * row_ptrs0] + iEntry) * valuess0] * + x[colIndices[(row_ptr[iRow * row_ptrs0] + iEntry) * + colIndicess0] * + xs0]; + }, + sum); + + sum *= alpha; + + if (dobeta == 0) { + y[iRow * ys0] = sum; + } else { + y[iRow * ys0] = beta * y[iRow * ys0] + sum; + } + }); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/impl/KokkosBlas2_team_spmv_spec.hpp b/blas/impl/KokkosBlas2_team_spmv_spec.hpp new file mode 100644 index 0000000000..6967d91571 --- /dev/null +++ b/blas/impl/KokkosBlas2_team_spmv_spec.hpp @@ -0,0 +1,68 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_TEAM_SPMV_SPEC_HPP_ +#define KOKKOSBLAS2_TEAM_SPMV_SPEC_HPP_ + +#include +#include +#include +#include +#include + +namespace KokkosBlas { + +template +struct TeamSpmv { + template + KOKKOS_INLINE_FUNCTION static void invoke( + const MemberType& member, const ScalarType alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& x, const ScalarType beta, + const yViewType& y) { + Impl::TeamSpmvInternal::invoke< + MemberType, ScalarType, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, dobeta>( + member, x.extent(0), alpha, values.data(), values.stride_0(), + row_ptr.data(), row_ptr.stride_0(), colIndices.data(), + colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); + } +}; + +template +struct TeamVectorSpmv { + template + KOKKOS_INLINE_FUNCTION static void invoke( + const MemberType& member, const ScalarType alpha, + const ValuesViewType& values, const IntView& row_ptr, + const IntView& colIndices, const xViewType& x, const ScalarType beta, + const yViewType& y) { + Impl::TeamVectorSpmvInternal::invoke< + MemberType, ScalarType, typename ValuesViewType::non_const_value_type, + typename IntView::non_const_value_type, dobeta>( + member, x.extent(0), alpha, values.data(), values.stride_0(), + row_ptr.data(), row_ptr.stride_0(), colIndices.data(), + colIndices.stride_0(), x.data(), x.stride_0(), beta, y.data(), + 
y.stride_0()); + } +}; + +} // namespace KokkosBlas + +#endif diff --git a/blas/src/KokkosBlas2_team_spmv.hpp b/blas/src/KokkosBlas2_team_spmv.hpp new file mode 100644 index 0000000000..366570b954 --- /dev/null +++ b/blas/src/KokkosBlas2_team_spmv.hpp @@ -0,0 +1,141 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBLAS2_TEAM_SPMV_HPP_ +#define KOKKOSBLAS2_TEAM_SPMV_HPP_ + +/// \file KokkosBlas2_spmv.hpp + +#include +#include +#include +#include // requires C++11, but so does Kokkos +#include + +namespace KokkosBlas { +namespace Experimental { + +/// \brief Sparse matrix-vector multiply: y = beta*y + alpha*A*x. 
+/// +template +void KOKKOS_INLINE_FUNCTION team_spmv( + const TeamType &team, const ScalarType &alpha, const ValuesViewType &values, + const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const ScalarType &beta, const yViewType &y, const int dobeta) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "ValuesViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "IntView must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "xViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "yViewType must be a Kokkos::View."); + static_assert(static_cast(ValuesViewType::rank) == 1, + "ValuesViewType must have rank 1."); + static_assert(static_cast(IntView::rank) == 1, + "IntView must have rank 1."); + static_assert(static_cast(xViewType::rank) == 1, + "xViewType must have rank 1."); + static_assert(static_cast(yViewType::rank) == 1, + "yViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (values.extent(0) != colIndices.extent(0)) { + std::ostringstream os; + os << "KokkosBlas::spmv: Dimensions of values and colIndices do not match: " + << "values: " << values.extent(0) + << ", colIndices: " << colIndices.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + std::ostringstream os; + os << "KokkosBlas::spmv: Dimensions of x, y, and row_ptr do not match: " + << "x: " << x.extent(0) << ", y: " << y.extent(0) + << ", row_ptr: " << row_ptr.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + if (dobeta == 1) + KokkosBlas::TeamSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 1>( + team, alpha, values, row_ptr, colIndices, x, beta, y); + else + KokkosBlas::TeamSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 0>( + team, alpha, values, row_ptr, colIndices, x, beta, y); +} + +/// \brief Sparse matrix-vector multiply: y = beta*y + alpha*A*x. 
+/// +template +void KOKKOS_INLINE_FUNCTION team_vector_spmv( + const TeamType &team, const ScalarType &alpha, const ValuesViewType &values, + const IntView &row_ptr, const IntView &colIndices, const xViewType &x, + const ScalarType &beta, const yViewType &y, const int dobeta) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "ValuesViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "IntView must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "xViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "yViewType must be a Kokkos::View."); + static_assert(static_cast(ValuesViewType::rank) == 1, + "ValuesViewType must have rank 1."); + static_assert(static_cast(IntView::rank) == 1, + "IntView must have rank 1."); + static_assert(static_cast(xViewType::rank) == 1, + "xViewType must have rank 1."); + static_assert(static_cast(yViewType::rank) == 1, + "yViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (values.extent(0) != colIndices.extent(0)) { + std::ostringstream os; + os << "KokkosBlas::spmv: Dimensions of values and colIndices do not match: " + << "values: " << values.extent(0) + << ", colIndices: " << colIndices.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { + std::ostringstream os; + os << "KokkosBlas::spmv: Dimensions of x, y, and row_ptr do not match: " + << "x: " << x.extent(0) << ", y: " << y.extent(0) + << ", row_ptr: " << row_ptr.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + if (dobeta == 1) + KokkosBlas::TeamVectorSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 1>( + team, alpha, values, row_ptr, colIndices, x, beta, y); + else + KokkosBlas::TeamVectorSpmv::template invoke< + ScalarType, ValuesViewType, IntView, xViewType, yViewType, 0>( + team, alpha, values, row_ptr, colIndices, x, beta, y); +} + +} // namespace Experimental +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_TEAM_SPMV_HPP_ From bebcf360d8efb20af32f9254ef79ac71b3521e15 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 3 Apr 2023 15:04:20 -0600 Subject: [PATCH 188/442] Using Kokkos::ArithTraits instead of Kokkos::Details::ArithTraits --- sparse/unit_test/Test_Sparse_spiluk.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 44e9c37258..d6ffa5f46c 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -276,7 +276,7 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using crsMat_t = CrsMatrix; - using AT = Kokkos::Details::ArithTraits; + using AT = Kokkos::ArithTraits;
const size_type nrows = 9; const size_type nnz = 21; From 6c003deb34defeeb5ccfa5f8190c79f880e74184 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Mon, 3 Apr 2023 15:44:56 -0600 Subject: [PATCH 189/442] Fix the doc of KokkosBlas2_team_spmv.hpp --- blas/src/KokkosBlas2_team_spmv.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/src/KokkosBlas2_team_spmv.hpp b/blas/src/KokkosBlas2_team_spmv.hpp index 366570b954..8e56553b0c 100644 --- a/blas/src/KokkosBlas2_team_spmv.hpp +++ b/blas/src/KokkosBlas2_team_spmv.hpp @@ -16,7 +16,7 @@ #ifndef KOKKOSBLAS2_TEAM_SPMV_HPP_ #define KOKKOSBLAS2_TEAM_SPMV_HPP_ -/// \file KokkosBlas2_spmv.hpp +/// \file KokkosBlas2_team_spmv.hpp #include #include From 53599f47d849166c68a8cf3db7bcfb4d9d1c1498 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 3 Apr 2023 17:41:27 -0600 Subject: [PATCH 190/442] Fix #1758 (#1762) For BLAS routines producing a complex scalar result (like zdotc), prefer to get the result via a pointer argument, rather than as a direct return value. Directly returning a std::complex from an "extern C" function is technically not allowed and Clang warns about it. 
--- CheckHostBlasReturnComplex.cmake | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index 30063b1cc3..b9528ce45a 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -20,8 +20,8 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) #define F77_BLAS_MANGLE${F77_BLAS_MANGLE} extern \"C\" { - std::complex F77_BLAS_MANGLE(zdotc,ZDOTC)( - const int* n, + void F77_BLAS_MANGLE(zdotc,ZDOTC)( + std::complex* result, const int* n, const std::complex x[], const int* incx, const std::complex y[], const int* incy); } @@ -35,13 +35,23 @@ int main() { TWO = std::complex(0.0,2.0); f[0] = ONE; f[1] = TWO; - std::complex ret - = F77_BLAS_MANGLE(zdotc,ZDOTC)(&NUM, f, &INC, f, &INC); + std::complex ret; + F77_BLAS_MANGLE(zdotc,ZDOTC)(&ret, &NUM, f, &INC, f, &INC); return (ret.real() == double(5.0) ? 0 : 1); } " ) - CHECK_CXX_SOURCE_RUNS("${SOURCE}" ${VARNAME}) +# Test whether the above program, which assumes BLAS can give back complex results +# via pointer arguments, compiles and runs correctly. +# If it does, assume that we don't need to get complex results as direct return values, +# which causes -Wreturn-type-c-linkage warnings. 
+CHECK_CXX_SOURCE_RUNS("${SOURCE}" KK_BLAS_RESULT_AS_POINTER_ARG) + +IF(${KK_BLAS_RESULT_AS_POINTER_ARG}) + SET(VARNAME OFF) +ELSE() + SET(VARNAME ON) +ENDIF() ENDFUNCTION() From 4dd7e613cda32b7f929d31ca66abe34c796e4325 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 4 Apr 2023 07:15:21 -0600 Subject: [PATCH 191/442] Add par_ilu numeric docs --- sparse/src/KokkosSparse_par_ilut.hpp | 53 ++++++++++++++++------------ 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 84d2f7c89e..42cfc1e7d2 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -44,7 +44,8 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value -/// @brief Performs the symbolic phase of par_ilut (non-blocking). +/// @brief Performs the symbolic phase of par_ilut. +/// This is a non-blocking function. /// /// The sparsity pattern of A will be analyzed and L_rowmap and U_rowmap will be /// populated with the L (lower triangular) and U (upper triagular) non-zero @@ -186,28 +187,34 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, } // par_ilut_symbolic -/// @brief -/// @tparam KernelHandle -/// @tparam ARowMapType -/// @tparam AEntriesType -/// @tparam AValuesType -/// @tparam LRowMapType -/// @tparam LEntriesType -/// @tparam LValuesType -/// @tparam URowMapType -/// @tparam UEntriesType -/// @tparam UValuesType -/// @param handle -/// @param A_rowmap -/// @param A_entries -/// @param A_values -/// @param L_rowmap -/// @param L_entries -/// @param L_values -/// @param U_rowmap -/// @param U_entries -/// @param U_values -/// @param deterministic +/// @brief Performs the numeric phase (for specific CSRs, not reusable) of the +/// par_ilut +/// algorithm (described in the header). This is a non-blocking +/// function.
It is expected that par_ilut_symbolic has already been +/// called for the +/// provided KernelHandle. +/// +/// @tparam KernelHandle Template for the handle type +/// @tparam ARowMapType Template for the A_rowmap type +/// @tparam AEntriesType Template for the A_entries type +/// @tparam AValuesType Template for the A_values type +/// @tparam LRowMapType Template for the L_rowmap type +/// @tparam LEntriesType Template for the L_entries type +/// @tparam LValuesType Template for the L_values type +/// @tparam URowMapType Template for the U_rowmap type +/// @tparam UEntriesType Template for the U_entries type +/// @tparam UValuesType Template for the U_values type +/// @param handle The kernel handle. It is expected that create_par_ilut_handle +/// has been called on it +/// @param A_rowmap The row map (row nnz offsets) for the A CSR (Input) +/// @param A_entries The entries (column ids) for the A CSR (Input) +/// @param A_values The values (non-zero matrix values) for the A CSR (Input) +/// @param L_rowmap The row map (row nnz offsets) for the L CSR (Input/Output) +/// @param L_entries The entries (column ids) for the L CSR (Output) +/// @param L_values The values (non-zero matrix values) for the L CSR (Output) +/// @param U_rowmap The row map (row nnz offsets) for the U CSR (Input/Output) +/// @param U_entries The entries (column ids) for the U CSR (Output) +/// @param U_values The values (non-zero matrix values) for the U CSR (Output) template Date: Tue, 4 Apr 2023 11:06:55 -0600 Subject: [PATCH 192/442] Update docs/developer/apidocs/sparse.rst Co-authored-by: James Foucar --- docs/developer/apidocs/sparse.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index cb9aef9af2..aeab315ae4 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -66,7 +66,8 @@ par_ilut -------- ..
doxygenfunction:: par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, URowMapType& U_rowmap) .. doxygenfunction:: par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, UEntriesType& U_entries, UValuesType& U_values, bool deterministic) - +.. doxygenclass:: KokkosSparse::PAR_ILUTHandle + :members: gmres ----- .. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) \ No newline at end of file From 43bf36595f125ffa4e71530c4522210c8ec99259 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 4 Apr 2023 11:14:39 -0600 Subject: [PATCH 193/442] Make Werror build happy --- sparse/src/KokkosSparse_par_ilut.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 42cfc1e7d2..d42b38e5e1 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -215,6 +215,7 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, /// @param U_rowmap The row map (row nnz offsets) for the U CSR (Input/Output) /// @param U_entries The entries (column ids) for the U CSR (Output) /// @param U_values The values (non-zero matrix values) for the U CSR (Output) +/// @param deterministic Please ignore. This parameter will be removed soon. template Date: Tue, 4 Apr 2023 11:15:16 -0600 Subject: [PATCH 194/442] Formatting --- docs/developer/apidocs/sparse.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index aeab315ae4..bee4b2d89c 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -68,6 +68,7 @@ par_ilut .. 
doxygenfunction:: par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, UEntriesType& U_entries, UValuesType& U_values, bool deterministic) .. doxygenclass:: KokkosSparse::PAR_ILUTHandle :members: + gmres ----- .. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) \ No newline at end of file From aa96a83ad44d66c615cc18aec650f1b2a5f307c3 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 4 Apr 2023 14:07:21 -0600 Subject: [PATCH 195/442] Jgfouca/remove par ilut limitations (#1755) * Remove deterministic from par_ilut precond test Now that spgemm memory errors have been fixed, it appears to work * Add verbose mode to par_ilut * Fixes for GPU * Fix end_rel_res type to work when scalar is complex * Turn off asynch fixed point on GPU * Reorganize par_ilut handle, group conceptually similar members * Refactor par_ilut deterministic setting Change to async_update and move it to the handle. compute_l_u_factors does not need to run in a serial exespace, that is way overkill. Simply turning async_updates off should allow for deterministic results. Now that we are iterating more than once, it looks like even the hardcoded fixture test works fine with async_updates on since the multiple iterations corrects any "bad" results. 
* Add comment for par_ilut_precond test settings * Fix ordering warning * par_ilut: add test for nrows=0 --- sparse/impl/KokkosSparse_gmres_impl.hpp | 1 + .../KokkosSparse_par_ilut_numeric_impl.hpp | 218 ++++++++---------- .../KokkosSparse_par_ilut_numeric_spec.hpp | 8 +- .../KokkosSparse_par_ilut_symbolic_impl.hpp | 6 + .../KokkosSparse_par_ilut_symbolic_spec.hpp | 1 - sparse/src/KokkosKernels_Handle.hpp | 13 +- sparse/src/KokkosSparse_par_ilut.hpp | 35 ++- sparse/src/KokkosSparse_par_ilut_handle.hpp | 110 ++++++--- sparse/unit_test/Test_Sparse_par_ilut.hpp | 131 +++++++---- 9 files changed, 300 insertions(+), 223 deletions(-) diff --git a/sparse/impl/KokkosSparse_gmres_impl.hpp b/sparse/impl/KokkosSparse_gmres_impl.hpp index aa90a70757..8c7231f90c 100644 --- a/sparse/impl/KokkosSparse_gmres_impl.hpp +++ b/sparse/impl/KokkosSparse_gmres_impl.hpp @@ -92,6 +92,7 @@ struct GmresWrap { std::cout << " ortho: " << ((ortho == GmresHandle::Ortho::CGS2) ? "CGS2" : "MGS") << std::endl; + std::cout << " precond: " << (precond ? 
"ON" : "OFF") << std::endl; } // Make tmp work views diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index cedc2dbd43..89dcd12c5b 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -433,64 +433,12 @@ struct IlutWrap { return Kokkos::make_pair(a_val - sum, ut_nnz); } - template - KOKKOS_FUNCTION static void compute_l_u_factors_impl( - const ARowMapType& A_row_map, const AEntriesType& A_entries, - const AValuesType& A_values, LRowMapType& L_row_map, - LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, - UEntriesType& U_entries, UValuesType& U_values, UtRowMapType& Ut_row_map, - UtEntriesType& Ut_entries, UtValuesType& Ut_values, MemberType& team) { - const auto row_idx = team.league_rank(); - - const auto l_row_nnz_begin = L_row_map(row_idx); - const auto l_row_nnz_end = L_row_map(row_idx + 1); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, l_row_nnz_begin, l_row_nnz_end - 1), - [&](const size_type l_nnz) { - const auto col_idx = L_entries(l_nnz); - const auto u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); - if (u_diag != 0.0) { - const auto new_val = - compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, - L_row_map, L_entries, L_values, Ut_row_map, - Ut_entries, Ut_values) - .first / - u_diag; - L_values(l_nnz) = new_val; - } - }); - - team.team_barrier(); - - const auto u_row_nnz_begin = U_row_map(row_idx); - const auto u_row_nnz_end = U_row_map(row_idx + 1); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, u_row_nnz_begin, u_row_nnz_end), - [&](const size_type u_nnz) { - const auto col_idx = U_entries(u_nnz); - const auto sum = compute_sum(row_idx, col_idx, A_row_map, A_entries, - A_values, L_row_map, L_entries, L_values, - Ut_row_map, Ut_entries, Ut_values); - const auto new_val = sum.first; - const auto ut_nnz = sum.second; - U_values(u_nnz) = new_val; - 
Ut_values(ut_nnz) = new_val; // ut_nnz is not guarateed to fail into - // range used exclusively by this team - }); - } - /** * Implements a single iteration/sweep of the fixed-point ILU algorithm. * The results of this function are non-deterministic due to concurrent - * reading and writing of Ut values. deterministic can be set to true to - * make this function determistic, but it will be run in Serial exe space - * if so. + * reading and writing of Ut values. async_update can be set to false to + * make this function deterministic, but that could cause par_ilut + * to take longer (more iterations) to converge. */ template ; - using smember_type = typename spolicy_type::member_type; - - const size_type nrows = ih.get_nrows(); - spolicy_type policy(nrows, 1); - - auto A_row_map_h = Kokkos::create_mirror_view(A_row_map); - auto A_entries_h = Kokkos::create_mirror_view(A_entries); - auto A_values_h = Kokkos::create_mirror_view(A_values); - auto L_row_map_h = Kokkos::create_mirror_view(L_row_map); - auto L_entries_h = Kokkos::create_mirror_view(L_entries); - auto L_values_h = Kokkos::create_mirror_view(L_values); - auto U_row_map_h = Kokkos::create_mirror_view(U_row_map); - auto U_entries_h = Kokkos::create_mirror_view(U_entries); - auto U_values_h = Kokkos::create_mirror_view(U_values); - auto Ut_row_map_h = Kokkos::create_mirror_view(Ut_row_map); - auto Ut_entries_h = Kokkos::create_mirror_view(Ut_entries); - auto Ut_values_h = Kokkos::create_mirror_view(Ut_values); - - Kokkos::deep_copy(A_row_map_h, A_row_map); - Kokkos::deep_copy(A_entries_h, A_entries); - Kokkos::deep_copy(A_values_h, A_values); - Kokkos::deep_copy(L_row_map_h, L_row_map); - Kokkos::deep_copy(L_entries_h, L_entries); - Kokkos::deep_copy(L_values_h, L_values); - Kokkos::deep_copy(U_row_map_h, U_row_map); - Kokkos::deep_copy(U_entries_h, U_entries); - Kokkos::deep_copy(U_values_h, U_values); - Kokkos::deep_copy(Ut_row_map_h, Ut_row_map); - Kokkos::deep_copy(Ut_entries_h, Ut_entries); -
Kokkos::deep_copy(Ut_values_h, Ut_values); - - Kokkos::parallel_for( - "compute_l_u_factors", policy, - KOKKOS_LAMBDA(const smember_type& team) { - compute_l_u_factors_impl( - A_row_map_h, A_entries_h, A_values_h, L_row_map_h, L_entries_h, - L_values_h, U_row_map_h, U_entries_h, U_values_h, Ut_row_map_h, - Ut_entries_h, Ut_values_h, team); - }); + UtValuesType& Ut_values, const bool async_update) { + const size_type nrows = ih.get_nrows(); + Kokkos::parallel_for( + "compute_l_u_factors", range_policy(0, nrows), + KOKKOS_LAMBDA(const size_type row_idx) { + const auto l_row_nnz_begin = L_row_map(row_idx); + const auto l_row_nnz_end = + L_row_map(row_idx + 1) - 1; // skip diagonal for L + + for (auto l_nnz = l_row_nnz_begin; l_nnz < l_row_nnz_end; ++l_nnz) { + const auto col_idx = L_entries(l_nnz); + const auto u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); + if (u_diag != 0.0) { + const auto new_val = + compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, + L_row_map, L_entries, L_values, Ut_row_map, + Ut_entries, Ut_values) + .first / + u_diag; + L_values(l_nnz) = new_val; + } + } - Kokkos::deep_copy(L_values, L_values_h); - Kokkos::deep_copy(U_values, U_values_h); - Kokkos::deep_copy(Ut_values, Ut_values_h); -#else - throw std::runtime_error( - "compute_l_u factors cannot be deterministic without Kokkos::Serial " - "available"); -#endif - } else { - const auto policy = ih.get_default_team_policy(); - - Kokkos::parallel_for( - "compute_l_u_factors", policy, - KOKKOS_LAMBDA(const member_type& team) { - compute_l_u_factors_impl(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, - U_values, Ut_row_map, Ut_entries, - Ut_values, team); - }); - } + const auto u_row_nnz_begin = U_row_map(row_idx); + const auto u_row_nnz_end = U_row_map(row_idx + 1); + + for (auto u_nnz = u_row_nnz_begin; u_nnz < u_row_nnz_end; ++u_nnz) { + const auto col_idx = U_entries(u_nnz); + const auto sum = compute_sum( + row_idx, col_idx, A_row_map, 
A_entries, A_values, L_row_map, + L_entries, L_values, Ut_row_map, Ut_entries, Ut_values); + const auto new_val = sum.first; + const auto ut_nnz = sum.second; + U_values(u_nnz) = new_val; + + // ut_nnz is not guaranteed to fall into range used exclusively + // by this thread. Updating it here opens up potential race + // conditions that cause problems on GPU but usually causes + // faster convergence. + if (async_update) { + Ut_values(ut_nnz) = new_val; + } + } + }); } /** @@ -853,7 +781,8 @@ struct IlutWrap { const AValuesType& A_values, LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, UEntriesType& U_entries, - UValuesType& U_values, bool deterministic) { + UValuesType& U_values) { + // Get config settings from handle const size_type nrows = thandle.get_nrows(); const auto fill_in_limit = thandle.get_fill_in_limit(); const auto l_nnz_limit = @@ -865,6 +794,21 @@ struct IlutWrap { thandle.get_residual_norm_delta_stop(); const size_type max_iter = thandle.get_max_iter(); + const auto verbose = thandle.get_verbose(); + constexpr bool on_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + const auto async_update = !on_gpu && thandle.get_async_update(); + + if (verbose) { + std::cout << "Starting PARILUT with..."
<< std::endl; + std::cout << " num_rows: " << nrows << std::endl; + std::cout << " fill_in_limit: " << fill_in_limit << std::endl; + std::cout << " max_iter: " << max_iter << std::endl; + std::cout << " res_norm_delta_stop: " << residual_norm_delta_stop + << std::endl; + std::cout << " async_update: " << async_update << std::endl; + } + kh.create_spadd_handle(true /*we expect inputs to be sorted*/); // @@ -890,8 +834,8 @@ struct IlutWrap { auto V_copy = Kokkos::create_mirror_view(V_copy_d); size_type itr = 0; + scalar_t curr_residual = std::numeric_limits::max(); scalar_t prev_residual = std::numeric_limits::max(); - bool converged = false; // Set the initial L/U values for the initial approximation initialize_LU(thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, @@ -900,7 +844,8 @@ struct IlutWrap { // // main loop // - while (!converged && itr < max_iter) { + bool stop = nrows == 0; // Don't iterate at all if nrows=0 + while (!stop && itr < max_iter) { // LU = L*U if (prev_residual == std::numeric_limits::max()) { multiply_matrices(kh, thandle, L_row_map, L_entries, L_values, @@ -923,7 +868,7 @@ struct IlutWrap { compute_l_u_factors( thandle, A_row_map, A_entries, A_values, L_new_row_map, L_new_entries, L_new_values, U_new_row_map, U_new_entries, U_new_values, - Ut_new_row_map, Ut_new_entries, Ut_new_values, deterministic); + Ut_new_row_map, Ut_new_entries, Ut_new_values, async_update); // Filter smallest elements from L_new and U_new. Store result back // in L and U. 
@@ -957,18 +902,28 @@ struct IlutWrap { compute_l_u_factors(thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, Ut_new_row_map, Ut_new_entries, Ut_new_values, - deterministic); + async_update); - // Compute residual and terminate if converged + // Compute residual and check stop conditions { - const auto curr_residual = compute_residual_norm( + curr_residual = compute_residual_norm( kh, thandle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, R_row_map, R_entries, R_values, LU_row_map, LU_entries, LU_values); - if (karith::abs(prev_residual - curr_residual) <= - karith::abs(residual_norm_delta_stop)) { - converged = true; + if (verbose) { + std::cout << "Completed itr " << itr + << ", residual is: " << curr_residual << std::endl; + } + + const auto curr_delta = karith::abs(prev_residual - curr_residual); + if (curr_delta <= residual_norm_delta_stop) { + if (verbose) { + std::cout << " Itr-to-itr residual change has dropped below " + "residual_norm_delta_stop, stop" + << std::endl; + } + stop = true; } else { prev_residual = curr_residual; } @@ -977,6 +932,13 @@ struct IlutWrap { ++itr; } + curr_residual = nrows == 0 ? scalar_t(0.) 
: curr_residual; + if (verbose) { + std::cout << "PAR_ILUT stopped in " << itr << " iterations with residual " + << curr_residual << std::endl; + } + thandle.set_stats(itr, curr_residual); + kh.destroy_spadd_handle(); } // end ilut_numeric diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp index 72130a07ad..fd3bc2b8bb 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp @@ -114,8 +114,7 @@ struct PAR_ILUT_NUMERIC { const AValuesType &A_values, LRowMapType &L_row_map, LEntriesType &L_entries, LValuesType &L_values, URowMapType &U_row_map, - UEntriesType &U_entries, UValuesType &U_values, - bool deterministic = false); + UEntriesType &U_entries, UValuesType &U_values); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -135,15 +134,14 @@ struct PAR_ILUT_NUMERICget_par_ilut_handle(); using Ilut = Experimental::IlutWrap< typename std::remove_pointer::type>; Ilut::ilut_numeric(*handle, *par_ilut_handle, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values, deterministic); + U_entries, U_values); } }; diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp index 573f550832..4e331d2f11 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_impl.hpp @@ -44,6 +44,10 @@ void ilut_symbolic(IlutHandle& thandle, const ARowMapType& A_row_map_d, using size_type = typename IlutHandle::size_type; using Ilut = IlutWrap; + const size_type a_nrows = A_row_map_d.extent(0); + const size_type nrows = a_nrows > 0 ? 
(a_nrows - 1) : 0; + thandle.set_nrows(nrows); + const auto policy = thandle.get_default_team_policy(); // Sizing for the initial L/U approximation @@ -83,8 +87,10 @@ void ilut_symbolic(IlutHandle& thandle, const ARowMapType& A_row_map_d, const size_type nnzsL = Ilut::prefix_sum(L_row_map_d); const size_type nnzsU = Ilut::prefix_sum(U_row_map_d); + // Set symbolic info on handle thandle.set_nnzL(nnzsL); thandle.set_nnzU(nnzsU); + thandle.set_symbolic_complete(); } // end ilut_symbolic diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp index a27a685a00..b822d12ab0 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp @@ -108,7 +108,6 @@ struct PAR_ILUT_SYMBOLICset_symbolic_complete(); } }; #endif diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 6beaab6cba..1f080b7bce 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -867,12 +867,17 @@ class KokkosKernelsHandle { } PAR_ILUTHandleType *get_par_ilut_handle() { return this->par_ilutHandle; } - void create_par_ilut_handle(const size_type nrows, const size_type nnzL = 0, - const size_type nnzU = 0, - const size_type max_iter = 1) { + void create_par_ilut_handle( + const size_type max_iter = 20, + const typename PAR_ILUTHandleType::float_t residual_norm_delta_stop = + 1e-2, + const typename PAR_ILUTHandleType::float_t fill_in_limit = 0.75, + const bool async_update = true, const bool verbose = false) { this->destroy_par_ilut_handle(); this->is_owner_of_the_par_ilut_handle = true; - this->par_ilutHandle = new PAR_ILUTHandleType(nrows, nnzL, nnzU, max_iter); + this->par_ilutHandle = + new PAR_ILUTHandleType(max_iter, residual_norm_delta_stop, + fill_in_limit, async_update, verbose); this->par_ilutHandle->set_team_size(this->team_work_size); this->par_ilutHandle->set_vector_size(this->vector_size); } diff 
--git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index 21371792c0..c7def802b9 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -20,9 +20,13 @@ /// This file provides KokkosSparse::par_ilut. This function performs a /// local (no MPI) sparse ILU(t) on matrices stored in /// compressed row sparse ("Crs") format. It is expected that symbolic -/// is called before numeric. The numeric function offers a deterministic -/// flag that will force the function to have deterministic results. This -/// is useful for testing but incurs a big performance penalty. +/// is called before numeric. The handle offers an async_update +/// flag that controls whether asynchronous updates are allowed while computing +/// L U factors. This is useful for testing as it allows for repeatable +/// (deterministic) results but may cause the algorithm to take longer (more +/// iterations) to converge. The par_ilut algorithm will repeat (iterate) until +/// max_iters is hit or the improvement in the residual from iter to iter drops +/// below a certain threshold. 
/// /// This algorithm is described in the paper: /// PARILUT - A New Parallel Threshold ILU Factorization - Anzt, Chow, Dongarra @@ -114,6 +118,13 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, "par_ilut_symbolic: KernelHandle and Views have different execution " "spaces."); + if (A_rowmap.extent(0) != 0) { + KK_REQUIRE_MSG(A_rowmap.extent(0) == L_rowmap.extent(0), + "L row map size does not match A row map"); + KK_REQUIRE_MSG(A_rowmap.extent(0) == U_rowmap.extent(0), + "U row map size does not match A row map"); + } + using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; @@ -173,8 +184,7 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, - UEntriesType& U_entries, UValuesType& U_values, - bool deterministic) { + UEntriesType& U_entries, UValuesType& U_values) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; using scalar_type = typename KernelHandle::nnz_scalar_t; @@ -437,11 +447,16 @@ void par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, KokkosSparse::Impl::PAR_ILUT_NUMERIC< const_handle_type, ARowMap_Internal, AEntries_Internal, AValues_Internal, LRowMap_Internal, LEntries_Internal, LValues_Internal, URowMap_Internal, - UEntries_Internal, - UValues_Internal>::par_ilut_numeric(&tmp_handle, A_rowmap_i, A_entries_i, - A_values_i, L_rowmap_i, L_entries_i, - L_values_i, U_rowmap_i, U_entries_i, - U_values_i, deterministic); + UEntries_Internal, UValues_Internal>::par_ilut_numeric(&tmp_handle, + A_rowmap_i, + A_entries_i, + A_values_i, + L_rowmap_i, + L_entries_i, + L_values_i, + U_rowmap_i, + U_entries_i, + U_values_i); // These may have been resized L_entries = L_entries_i; diff --git 
a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 04f546cdd6..7ae11d297c 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -24,6 +24,12 @@ namespace KokkosSparse { namespace Experimental { +/** + * Handle for par_ilut. Contains useful types, par_ilut configuration settings, + * symbolic settings and scalar output info. + * + * For more info, see KokkosSparse_par_ilut.hpp doxygen + */ template class PAR_ILUTHandle { @@ -66,41 +72,59 @@ class PAR_ILUTHandle { typename nnz_row_view_t::memory_traits>; private: - size_type nrows; - size_type nnzL; - size_type nnzU; - size_type max_iter; - nnz_scalar_t residual_norm_delta_stop; - - bool symbolic_complete; - - int team_size; - int vector_size; - - float_t fill_in_limit; + // User inputs + size_type max_iter; /// Hard cap on the number of par_ilut iterations + float_t residual_norm_delta_stop; /// When the change in residual from + /// iteration to iteration drops below + /// this, the algorithm will stop (even if + /// max_iters has not been hit) + float_t fill_in_limit; /// The threshold for the ILU factorization + bool async_update; /// Whether compute LU factors should do asynchronous + /// updates. When ON, the algorithm will usually converge + /// faster but it makes the algorithm non-deterministic. + /// This will always be OFF for GPU since it doesn't work + /// there. + bool verbose; /// Print information while executing par_ilut + + // Stored by parent KokkosKernelsHandle + int team_size; /// Kokkos team size. Set by the parent handle. -1 implies + /// AUTO + int vector_size; /// Kokkos vector size. Set by the parent handle.
+ + // Stored by symbolic phase + size_type + nrows; /// Number of rows in the CSRs given to the symbolic par_ilut + size_type nnzL; /// Number of non-zero entries in the L part of A in the CSRs + /// given to the symbolic par_ilut + size_type nnzU; /// Number of non-zero entries in the U part of A in the CSRs + /// given to the symbolic par_ilut + bool symbolic_complete; /// Whether symbolic par_ilut has been called + + // Outputs + int num_iters; /// The number of iterations par_ilut took to finish + nnz_scalar_t end_rel_res; /// The A - LU residual norm at the time the + /// algorithm finished public: - PAR_ILUTHandle(const size_type nrows_, const size_type nnzL_ = 0, - const size_type nnzU_ = 0, const size_type max_iter_ = 1) - : nrows(nrows_), - nnzL(nnzL_), - nnzU(nnzU_), - max_iter(max_iter_), - residual_norm_delta_stop(0.), - symbolic_complete(false), + // See KokkosKernelsHandle::create_par_ilut_handle for default user input + // values + PAR_ILUTHandle(const size_type max_iter_, + const float_t residual_norm_delta_stop_, + const float_t fill_in_limit_, const bool async_update_, + const bool verbose_) + : max_iter(max_iter_), + residual_norm_delta_stop(residual_norm_delta_stop_), + fill_in_limit(fill_in_limit_), + async_update(async_update_), + verbose(verbose_), team_size(-1), vector_size(-1), - fill_in_limit(0.75) {} - - void reset_handle(const size_type nrows_, const size_type nnzL_, - const size_type nnzU_) { - set_nrows(nrows_); - set_nnzL(nnzL_); - set_nnzU(nnzU_); - set_residual_norm_delta_stop(0.); - reset_symbolic_complete(); - set_fill_in_limit(0.75); - } + nrows(0), + nnzL(0), + nnzU(0), + symbolic_complete(false), + num_iters(-1), + end_rel_res(-1) {} KOKKOS_INLINE_FUNCTION ~PAR_ILUTHandle() {} @@ -137,11 +161,10 @@ class PAR_ILUTHandle { void set_max_iter(const size_type max_iter_) { this->max_iter = max_iter_; } int get_max_iter() const { return this->max_iter; } - void set_residual_norm_delta_stop( - const nnz_scalar_t 
residual_norm_delta_stop_) { + void set_residual_norm_delta_stop(const float_t residual_norm_delta_stop_) { this->residual_norm_delta_stop = residual_norm_delta_stop_; } - nnz_scalar_t get_residual_norm_delta_stop() const { + float_t get_residual_norm_delta_stop() const { return this->residual_norm_delta_stop; } @@ -150,6 +173,16 @@ class PAR_ILUTHandle { } float_t get_fill_in_limit() const { return this->fill_in_limit; } + bool get_verbose() const { return verbose; } + + void set_verbose(const bool verbose_) { this->verbose = verbose_; } + + bool get_async_update() const { return async_update; } + + void set_async_update(const bool async_update_) { + this->async_update = async_update_; + } + TeamPolicy get_default_team_policy() const { if (team_size == -1) { return TeamPolicy(nrows, Kokkos::AUTO); @@ -157,6 +190,15 @@ class PAR_ILUTHandle { return TeamPolicy(nrows, team_size); } } + + int get_num_iters() const { return num_iters; } + + nnz_scalar_t get_end_rel_res() const { return end_rel_res; } + + void set_stats(int num_iters_, nnz_scalar_t end_rel_res_) { + num_iters = num_iters_; + end_rel_res = end_rel_res_; + } }; } // namespace Experimental diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 377d8127ec..9b99c1000d 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -176,9 +176,10 @@ void run_test_par_ilut() { // Make kernel handle KernelHandle kh; - kh.create_par_ilut_handle(nrows); + kh.create_par_ilut_handle(); auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_async_update(false); // Allocate L and U CRS views as outputs RowMapType L_row_map("L_row_map", nrows + 1); @@ -190,8 +191,8 @@ void run_test_par_ilut() { const size_type nnzL = par_ilut_handle->get_nnzL(); const size_type nnzU = par_ilut_handle->get_nnzU(); - EXPECT_EQ(nnzL, 10); - EXPECT_EQ(nnzU, 8); + ASSERT_EQ(nnzL, 10); + ASSERT_EQ(nnzU, 8); EntriesType 
L_entries("L_entries", nnzL); ValuesType L_values("L_values", nnzL); @@ -199,13 +200,7 @@ void run_test_par_ilut() { ValuesType U_values("U_values", nnzU); par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, -#ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ -#else - false /*cannot ask for determinism*/ -#endif - ); + L_values, U_row_map, U_entries, U_values); // Use this to check LU // std::vector > expected_LU = { @@ -259,10 +254,6 @@ void run_test_par_ilut() { // check_matrix("U numeric", U_row_map, U_entries, U_values, // expected_U_candidates); - // Serial is required for deterministic mode and the checks below cannot - // reliably pass without determinism. -#ifdef KOKKOS_ENABLE_SERIAL - // Use these fixtures to test full numeric std::vector> expected_L_candidates = { {1., 0., 0., 0.}, @@ -283,10 +274,7 @@ void run_test_par_ilut() { check_matrix("U numeric", U_row_map, U_entries, U_values, expected_U_candidates); - // Checking - kh.destroy_par_ilut_handle(); -#endif } template ::mag_type; // Create a diagonally dominant sparse matrix to test: + // par_ilut settings max_iters, res_delta_stop, fill_in_limit, and + // async_update are all left as defaults constexpr auto n = 5000; constexpr auto m = 15; constexpr auto tol = ParIlut::TolMeta::value; @@ -330,8 +320,9 @@ void run_test_par_ilut_precond() { typename std::remove_reference::type; using ViewVectorType = typename GMRESHandle::nnz_value_view_t; - kh.create_par_ilut_handle(numRows); + kh.create_par_ilut_handle(); auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_verbose(verbose); // Pull out views from CRS auto row_map = A.graph.row_map; @@ -354,13 +345,7 @@ void run_test_par_ilut_precond() { ValuesType U_values("U_values", nnzU); par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, -#ifdef KOKKOS_ENABLE_SERIAL - true /*deterministic*/ -#else - false /*cannot ask 
for determinism*/ -#endif - ); + L_values, U_row_map, U_entries, U_values); // Create CRSs sp_matrix_type L("L", numRows, numCols, L_values.extent(0), L_values, @@ -396,10 +381,7 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); } - // Solve Ax = b with LU preconditioner. Currently only works - // when deterministic mode in par_ilut is on, which is only - // possible when Kokkos::Serial has been enabled. -#ifdef KOKKOS_ENABLE_SERIAL + // Solve Ax = b with LU preconditioner. { gmres_handle->reset_handle(m, tol); gmres_handle->set_verbose(verbose); @@ -426,9 +408,65 @@ void run_test_par_ilut_precond() { EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); EXPECT_LT(num_iters_precond, num_iters_plain); } -#else - EXPECT_EQ(num_iters_precond, 0); -#endif +} + +template +void run_test_par_ilut_zerorow_A() { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space>; + + const size_type nrows = 0; + + // Allocate device CRS views for A + RowMapType row_map("row_map", 0); + EntriesType entries("entries", 0); + ValuesType values("values", 0); + + // Create host mirror views for CRS A + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + // Make kernel handle + KernelHandle kh; + + kh.create_par_ilut_handle(); + + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", nrows + 1); + RowMapType U_row_map("U_row_map", nrows + 1); + + // Initial L/U approximations for A + par_ilut_symbolic(&kh, row_map, entries, L_row_map, U_row_map); + + const size_type nnzL = par_ilut_handle->get_nnzL(); + const size_type nnzU = 
par_ilut_handle->get_nnzU(); + + ASSERT_EQ(nnzL, 0); + ASSERT_EQ(nnzU, 0); + + EntriesType L_entries("L_entries", nnzL); + ValuesType L_values("L_values", nnzL); + EntriesType U_entries("U_entries", nnzU); + ValuesType U_values("U_values", nnzU); + + par_ilut_numeric(&kh, row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values); + + const auto itrs = par_ilut_handle->get_num_iters(); + const auto end_rel_res = par_ilut_handle->get_end_rel_res(); + + EXPECT_EQ(itrs, 0); + EXPECT_EQ(end_rel_res, scalar_t(0.)); + + kh.destroy_par_ilut_handle(); } } // namespace Test @@ -445,15 +483,26 @@ void test_par_ilut_precond() { Test::run_test_par_ilut_precond(); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut(); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_par_ilut_precond(); \ +template +void test_par_ilut_zerorow_A() { + Test::run_test_par_ilut_zerorow_A(); +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##par_ilut##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_zerorow_A##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_zerorow_A(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##par_ilut_precond##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_par_ilut_precond(); \ } #define NO_TEST_COMPLEX From a975fa3e0c04355305e5cbb1da955dae04f35c54 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Wed, 5 Apr 2023 16:32:22 +0200 Subject: [PATCH 196/442] #8 Updated FindTPLMKL.cmake to support SYCL option from kokkos --- cmake/Modules/FindTPLMKL.cmake | 150 +++++++++++++++++---------------- 1 file changed, 78 insertions(+), 72 deletions(-) diff --git 
a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index ee769bc4fa..163d3c280d 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -1,76 +1,82 @@ find_package(MKL) -if(TARGET MKL::MKL) - MESSAGE("TARGET MKL::MKL FOUND") - SET(TPL_MKL_IMPORTED_NAME MKL::MKL) - SET(TPL_IMPORTED_NAME MKL::MKL) - ADD_LIBRARY(MKL INTERFACE) - TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL) - ADD_LIBRARY(KokkosKernels::MKL ALIAS MKL ) - GET_TARGET_PROPERTY(LIB_TYPE ${TPL_IMPORTED_NAME} TYPE) - MESSAGE("LIB_TYPE: ${LIB_TYPE}") -# kokkoskernels_export_imported_tpl install MKL with target name MKL instead of -# MKL::MKL or KokkosKernels::MKL, so we need to install a specific ALIAS one - if(TARGET MKL) - MESSAGE("TARGET MKL CREATED") - ENDIF() +IF(TARGET MKL::MKL) + # MKL version >= 2021 (see kokkos wiki and intel documentation. MKL CMake module file has been introduced starting MKL >= 2021) + MESSAGE("TARGET MKL::MKL FOUND") + IF (KOKKOS_ENABLE_SYCL) #get from kokkos-core + # MKL version >= 2022 (see kokkos wiki) + MESSAGE("KOKKOS_ENABLE_SYCL Detected") + IF (TARGET MKL::MKL_DPCPP) + MESSAGE("TARGET MKL::MKL_DPCPP FOUND") + ENDIF() + MESSAGE(FATAL_ERROR "KOKKOS_ENABLE_SYCL activated but the target MKL_DPCPP wasn't found") + ENDIF() + + SET(TPL_MKL_IMPORTED_NAME MKL::MKL) + SET(TPL_IMPORTED_NAME MKL::MKL) + ADD_LIBRARY(MKL INTERFACE) + IF(KOKKOS_ENABLE_SYCL) + TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL MKL::MKL_DPCPP) + ELSE() + TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL ) + ENDIF() + ADD_LIBRARY(KokkosKernels::MKL ALIAS MKL ) + GET_TARGET_PROPERTY(LIB_TYPE ${TPL_IMPORTED_NAME} TYPE) + MESSAGE("LIB_TYPE: ${LIB_TYPE}") + # kokkoskernels_export_imported_tpl install MKL with target name MKL instead of + # MKL::MKL or KokkosKernels::MKL, so we need to install a specific ALIAS one + if(TARGET MKL) + MESSAGE("TARGET MKL CREATED") + ENDIF() +ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") # Regular wary with MKL version < 2021 (Where MKL doesn't 
provide cmake module file) + TRY_COMPILE(KOKKOSKERNELS_HAS_MKL_ARG + ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests + ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/mkl.cpp + LINK_LIBRARIES -mkl + COMPILE_DEFINITIONS -mkl) + KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE COMPILE_OPTIONS -mkl LINK_OPTIONS -mkl) + INCLUDE(FindPackageHandleStandardArgs) + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLMKL DEFAULT_MSG KOKKOSKERNELS_HAS_MKL_ARG) +ELSEIF(WIN32) + SET(BLA_VENDOR Intel10_64lp) + FIND_PACKAGE(BLAS REQUIRED) + IF (NOT DEFINED ENV{MKLROOT}) + SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") + MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE + LINK_OPTIONS ${BLAS_LINKER_FLAGS} + LINK_LIBRARIES ${BLAS_LIBRARIES} + ) + ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Package MKL requested but not found") + #TODO: old version plus small modif on header : + # ${MKL_ROOT}/include => ${MKL_ROOT}/include/mkl + IF (NOT DEFINED ENV{MKLROOT}) + SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") + MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) + ELSE() + SET(MKL_ROOT $ENV{MKLROOT}) + #go ahead and use LD_LIBRARY_PATH to find certain libs + LIST(APPEND ENV_LIBDIRS ENV LD_LIBRARY_PATH) + #override what CMake looks for + #gnu_thread does not work on some platforms + #just always use intel_thread + KOKKOSKERNELS_FIND_IMPORTED(MKL INTERFACE + LIBRARIES + mkl_intel_lp64 + mkl_intel_thread + mkl_core + iomp5 + LIBRARY_PATHS + ${MKL_ROOT}/lib/intel64 + ${ENV_LIBDIRS} + HEADER + mkl.h + HEADER_PATHS + ${MKL_ROOT}/include + ) + ENDIF() ENDIF() - - - -#IF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") -# TRY_COMPILE(KOKKOSKERNELS_HAS_MKL_ARG -# ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests -# 
${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/mkl.cpp -# LINK_LIBRARIES -mkl -# COMPILE_DEFINITIONS -mkl) -# KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE COMPILE_OPTIONS -mkl LINK_OPTIONS -mkl) -# INCLUDE(FindPackageHandleStandardArgs) -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLMKL DEFAULT_MSG KOKKOSKERNELS_HAS_MKL_ARG) -#ELSEIF(WIN32) -# SET(BLA_VENDOR Intel10_64lp) -# FIND_PACKAGE(BLAS REQUIRED) -# IF (NOT DEFINED ENV{MKLROOT}) -# SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") -# MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) -# ELSE() -# KOKKOSKERNELS_CREATE_IMPORTED_TPL(MKL INTERFACE -# LINK_OPTIONS ${BLAS_LINKER_FLAGS} -# LINK_LIBRARIES ${BLAS_LIBRARIES} -# ) -# ENDIF() -#ELSE() - -##find_package(mkl) -# #V1: old version plus small modif on header : -## ${MKL_ROOT}/include => ${MKL_ROOT}/include/mkl -# IF (NOT DEFINED ENV{MKLROOT}) -# SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") -# MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") -# FIND_PACKAGE_HANDLE_STANDARD_ARGS(MKL NO_MKL_ROOT_GIVEN) -# ELSE() -# SET(MKL_ROOT $ENV{MKLROOT}) -# #go ahead and use LD_LIBRARY_PATH to find certain libs -# LIST(APPEND ENV_LIBDIRS ENV LD_LIBRARY_PATH) -# #override what CMake looks for -# #gnu_thread does not work on some platforms -# #just always use intel_thread -# KOKKOSKERNELS_FIND_IMPORTED(MKL INTERFACE -# LIBRARIES -# mkl_intel_lp64 -# mkl_intel_thread -# mkl_core -# iomp5 -# LIBRARY_PATHS -# ${MKL_ROOT}/lib/intel64 -# ${ENV_LIBDIRS} -# HEADER -# mkl.h -# HEADER_PATHS -# ${MKL_ROOT}/include -# ) -# ENDIF() -#ENDIF() From 8f3574e333d0cb3bc2047fc3816cbdb3f14aab78 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 5 Apr 2023 14:08:44 -0600 Subject: [PATCH 197/442] spgemm handle: check that A,B,C graphs never change (#1742) * spgemm handle: check that A,B,C graphs never change Resolves #1738. 
The graphs (rowptrs/entries) of A, B and C can't change for a given spgemm handle (over all symbolic and numeric calls). This adds checks to symbolic and numeric for this (the raw pointers of rowptrs/entries are recorded in the handle, and then compared against in all subsequent calls). * spgemm: check A/B graphs don't change (debug only) In the spgemm handle, use hashes to make sure that the graphs of A and B don't change between the first symbolic call, and all subsequent symbolic and numeric calls. This is an O(N) test so it's only enabled in a debug build. In spgemm test, trigger these checks on purpose to make sure they work. * spgemm: remove checks for identical A/B graph pointers * Improved checks, doxy comments for hashView(v) Make sure that v's element type can contain no padding bytes. For example, struct{double, double} would be fine, but struct{char, double} is not. * Guard use of std::has_unique_object_representations (because sometimes it's not available from icpc, even with C++17) --- common/src/KokkosKernels_SimpleUtils.hpp | 43 ++++++++++++++ sparse/src/KokkosSparse_spgemm_handle.hpp | 54 +++++++++++++++++ sparse/src/KokkosSparse_spgemm_numeric.hpp | 19 +++++- sparse/src/KokkosSparse_spgemm_symbolic.hpp | 19 +++++- sparse/unit_test/Test_Sparse_spgemm.hpp | 66 +++++++++++++++++++-- 5 files changed, 195 insertions(+), 6 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index f24ffb49f1..a271695246 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -412,6 +412,49 @@ KOKKOS_FORCEINLINE_FUNCTION Value xorshiftHash(Value v) { : static_cast(x * 2685821657736338717ULL - 1); } +struct ViewHashFunctor { + ViewHashFunctor(const uint8_t *data_) : data(data_) {} + + KOKKOS_INLINE_FUNCTION void operator()(size_t i, uint32_t &lhash) const { + // Compute a hash/digest of both the index i, and data[i]. Then add that to + // overall hash. 
+ uint32_t x = uint32_t(i); + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + x ^= uint32_t(data[i]); + x ^= x << 13; + x ^= x >> 17; + x ^= x << 5; + lhash += x; + } + + const uint8_t *data; +}; + +/// \brief Compute a hash of a view. +/// \param v: the view to hash. Must be contiguous, and its element type must +/// not contain any padding bytes. +template +uint32_t hashView(const View &v) { + assert(v.span_is_contiguous()); + // Note: This type trait is supposed to be part of C++17, + // but it's not defined on Intel 19 (with GCC 7.2.0 standard library). + // So just check if it's available before using. +#ifdef __cpp_lib_has_unique_object_representations + static_assert(std::has_unique_object_representations< + typename View::non_const_value_type>::value, + "KokkosKernels::Impl::hashView: the view's element type must " + "not have any padding bytes."); +#endif + size_t nbytes = v.span() * sizeof(typename View::value_type); + uint32_t h; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nbytes), + ViewHashFunctor(reinterpret_cast(v.data())), h); + return h; +} + template struct SequentialFillFunctor { using size_type = typename V::size_type; diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 197fa5ec14..7cdba6030c 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -776,6 +776,60 @@ class SPGEMMHandle { } bool get_compression_step() { return is_compression_single_step; } + + private: + // An SpGEMM handle can be reused for multiple products C = A*B, but only if + // the sparsity patterns of A and B do not change. Enforce this (in debug + // builds only) by recording hashes of the graphs, and then checking they + // match in later calls. 
+ bool computedInputHashes = false; + uint32_t a_graph_hash = 0U; + uint32_t b_graph_hash = 0U; + + public: + template + bool checkMatrixIdentitiesSymbolic(const a_rowptrs_t &a_rowptrsIn, + const a_entries_t &a_entriesIn, + const b_rowptrs_t &b_rowptrsIn, + const b_entries_t &b_entriesIn) { +#ifndef NDEBUG + // If this is the first symbolic call, assign the handle's CRS pointers to + // check against later + if (!computedInputHashes) { + a_graph_hash = KokkosKernels::Impl::hashView(a_rowptrsIn) ^ + KokkosKernels::Impl::hashView(a_entriesIn); + b_graph_hash = KokkosKernels::Impl::hashView(b_rowptrsIn) ^ + KokkosKernels::Impl::hashView(b_entriesIn); + computedInputHashes = true; + } else { + if (a_graph_hash != (KokkosKernels::Impl::hashView(a_rowptrsIn) ^ + KokkosKernels::Impl::hashView(a_entriesIn))) + return false; + if (b_graph_hash != (KokkosKernels::Impl::hashView(b_rowptrsIn) ^ + KokkosKernels::Impl::hashView(b_entriesIn))) + return false; + } +#endif + return true; + } + + template + bool checkMatrixIdentitiesNumeric(const a_rowptrs_t &a_rowptrsIn, + const a_entries_t &a_entriesIn, + const b_rowptrs_t &b_rowptrsIn, + const b_entries_t &b_entriesIn) { +#ifndef NDEBUG + if (a_graph_hash != (KokkosKernels::Impl::hashView(a_rowptrsIn) ^ + KokkosKernels::Impl::hashView(a_entriesIn))) + return false; + if (b_graph_hash != (KokkosKernels::Impl::hashView(b_rowptrsIn) ^ + KokkosKernels::Impl::hashView(b_entriesIn))) + return false; +#endif + return true; + } }; inline SPGEMMAlgorithm StringToSPGEMMAlgorithm(std::string &name) { diff --git a/sparse/src/KokkosSparse_spgemm_numeric.hpp b/sparse/src/KokkosSparse_spgemm_numeric.hpp index 043a01b9f1..e0930c04ee 100644 --- a/sparse/src/KokkosSparse_spgemm_numeric.hpp +++ b/sparse/src/KokkosSparse_spgemm_numeric.hpp @@ -236,7 +236,24 @@ void spgemm_numeric(KernelHandle *handle, return; } - auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type(); + auto spgemmHandle = tmp_handle.get_spgemm_handle(); + + if 
(!spgemmHandle) { + throw std::invalid_argument( + "KokkosSparse::spgemm_numeric: the given KernelHandle does not have " + "an SpGEMM handle associated with it."); + } + + if (!spgemmHandle->checkMatrixIdentitiesNumeric(const_a_r, const_a_l, + const_b_r, const_b_l)) { + throw std::invalid_argument( + "KokkosSparse::spgemm_numeric: once used, an spgemm handle cannot be " + "reused for a product with a different sparsity pattern.\n" + "The rowptrs and entries of A and B must be identical to those " + "passed to the first spgemm_symbolic and spgemm_numeric calls."); + } + + auto algo = spgemmHandle->get_algorithm_type(); if (algo == SPGEMM_DEBUG || algo == SPGEMM_SERIAL) { // Never call a TPL if serial/debug is requested (this is needed for diff --git a/sparse/src/KokkosSparse_spgemm_symbolic.hpp b/sparse/src/KokkosSparse_spgemm_symbolic.hpp index 486d999e41..2bde5f6e20 100644 --- a/sparse/src/KokkosSparse_spgemm_symbolic.hpp +++ b/sparse/src/KokkosSparse_spgemm_symbolic.hpp @@ -162,7 +162,24 @@ void spgemm_symbolic(KernelHandle *handle, } #endif - auto algo = tmp_handle.get_spgemm_handle()->get_algorithm_type(); + auto spgemmHandle = tmp_handle.get_spgemm_handle(); + + if (!spgemmHandle) { + throw std::invalid_argument( + "KokkosSparse::spgemm_symbolic: the given KernelHandle does not have " + "an SpGEMM handle associated with it."); + } + + if (!spgemmHandle->checkMatrixIdentitiesSymbolic(const_a_r, const_a_l, + const_b_r, const_b_l)) { + throw std::invalid_argument( + "KokkosSparse::spgemm_symbolic: once used, an spgemm handle cannot be " + "reused for a product with a different sparsity pattern.\n" + "The rowptrs and entries of A and B must be identical to those " + "passed to the first spgemm_symbolic call."); + } + + auto algo = spgemmHandle->get_algorithm_type(); if (algo == SPGEMM_DEBUG || algo == SPGEMM_SERIAL) { // Never call a TPL if serial/debug is requested (this is needed for diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp 
b/sparse/unit_test/Test_Sparse_spgemm.hpp index 4d53b1e126..bd1e68c370 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -78,12 +78,13 @@ void run_spgemm_noreuse(crsMat_t A, crsMat_t B, crsMat_t &C) { } template -int run_spgemm(crsMat_t A, crsMat_t B, +int run_spgemm(crsMat_t &A, crsMat_t &B, KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &C, bool testReuse) { typedef typename crsMat_t::size_type size_type; typedef typename crsMat_t::ordinal_type lno_t; typedef typename crsMat_t::value_type scalar_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, @@ -113,7 +114,14 @@ int run_spgemm(crsMat_t A, crsMat_t B, EXPECT_TRUE(sh->is_numeric_called()); if (testReuse) { - // Give A and B completely new random values, and re-run just numeric + // Give A and B completely new random values (changing both the pointer + // and contents), and re-run just numeric. 
+ A.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new A values"), + A.nnz()); + B.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new B values"), + B.nnz()); randomize_matrix_values(A.values); randomize_matrix_values(B.values); KokkosSparse::spgemm_numeric(kh, A, false, B, false, C); @@ -127,7 +135,7 @@ int run_spgemm(crsMat_t A, crsMat_t B, } template -int run_spgemm_old_interface(crsMat_t A, crsMat_t B, +int run_spgemm_old_interface(crsMat_t &A, crsMat_t &B, KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &result, bool testReuse) { typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -188,7 +196,14 @@ int run_spgemm_old_interface(crsMat_t A, crsMat_t B, EXPECT_TRUE(sh->is_numeric_called()); if (testReuse) { - // Give A and B completely new random values, and re-run just numeric + // Give A and B completely new random values (changing both the pointer + // and contents), and re-run just numeric. + A.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new A values"), + A.nnz()); + B.values = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new B values"), + B.nnz()); randomize_matrix_values(A.values); randomize_matrix_values(B.values); KokkosSparse::Experimental::spgemm_numeric( @@ -468,6 +483,48 @@ void test_issue402() { << "SpGEMM still has issue 402 bug; C=AA' is incorrect!\n"; } +template +void test_issue1738() { + // Make sure that std::invalid_argument is thrown if you: + // - call numeric where an input matrix's entries have changed. + // - try to reuse an spgemm handle by calling symbolic with new input + // matrices + // This check is only enabled in debug builds. 
+#ifndef NDEBUG + using crsMat_t = CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space>; + crsMat_t A1 = KokkosSparse::Impl::kk_generate_diag_matrix(100); + crsMat_t B1 = KokkosSparse::Impl::kk_generate_diag_matrix(100); + crsMat_t A2 = KokkosSparse::Impl::kk_generate_diag_matrix(50); + crsMat_t B2 = KokkosSparse::Impl::kk_generate_diag_matrix(50); + { + KernelHandle kh; + kh.create_spgemm_handle(); + crsMat_t C1; + KokkosSparse::spgemm_symbolic(kh, A1, false, B1, false, C1); + KokkosSparse::spgemm_numeric(kh, A1, false, B1, false, C1); + crsMat_t C2; + EXPECT_THROW(KokkosSparse::spgemm_symbolic(kh, A2, false, B2, false, C2), + std::invalid_argument); + } + { + KernelHandle kh; + kh.create_spgemm_handle(); + crsMat_t C1; + KokkosSparse::spgemm_symbolic(kh, A1, false, B1, false, C1); + // Note: A1 is a 100x100 diagonal matrix, so the first entry in the first + // row is 0. Change it to a 1 and make sure spgemm_numeric notices that it + // changed. 
+ Kokkos::deep_copy(Kokkos::subview(A1.graph.entries, 0), 1); + EXPECT_THROW(KokkosSparse::spgemm_numeric(kh, A1, false, B1, false, C1), + std::invalid_argument); + } +#endif +} + #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -513,6 +570,7 @@ void test_issue402() { test_spgemm_symbolic(true, false); \ test_spgemm_symbolic(false, false); \ test_issue402(); \ + test_issue1738(); \ } // test_spgemm(50000, 50000 * 30, 100, 10); From 70db534be52bc4afe7d6759e33c2ecd953e55099 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Wed, 5 Apr 2023 14:45:10 -0600 Subject: [PATCH 198/442] add support for complex data types in MDF --- sparse/impl/KokkosSparse_mdf_impl.hpp | 350 +++++++++++++++----------- sparse/src/KokkosSparse_mdf.hpp | 11 +- sparse/unit_test/Test_Sparse_mdf.hpp | 3 - 3 files changed, 215 insertions(+), 149 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index b8a25485f5..d5194ee6dd 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -17,9 +17,36 @@ #ifndef KOKKOSSPARSE_MDF_IMPL_HPP_ #define KOKKOSSPARSE_MDF_IMPL_HPP_ +#include +#include +#include "Kokkos_ArithTraits.hpp" + namespace KokkosSparse { namespace Impl { +template +struct add_N_pointers { + using type = typename add_N_pointers, N - 1>::type; +}; +template +struct add_N_pointers { + using type = T; +}; + +template +auto create_mag_mirror_view(const Kokkos::View& v) { + using src_view_t = Kokkos::View; + using KAS = Kokkos::ArithTraits; + using mag_type = typename KAS::mag_type; + using data_type = typename add_N_pointers::type; + return Kokkos::View( + Kokkos::ViewAllocateWithoutInitializing(v.label() + "::Magnitude"), + v.layout()); +} + +template +using mag_mirror_view_t = decltype(create_mag_mirror_view(SrcView())); + template struct MDF_count_lower { using col_ind_type = typename 
crs_matrix_type::StaticCrsGraphType:: @@ -54,27 +81,30 @@ struct MDF_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; - using KAS = typename Kokkos::ArithTraits; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; + using size_type = typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + using scalar_mag_type = typename KAS::mag_type; + using KAM = typename Kokkos::ArithTraits; - const scalar_type zero = KAS::zero(); + const scalar_mag_type zero = KAM::zero(); crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type deficiency; int verbosity; MDF_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, ordinal_type factorization_step_, col_ind_type permutation_, - values_type discarded_fill_, col_ind_type deficiency_, - int verbosity_) + values_mag_type discarded_fill_, + col_ind_type deficiency_, int verbosity_) : A(A_), At(At_), factorization_step(factorization_step_), @@ -85,10 +115,11 @@ struct MDF_discarded_fill_norm { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(i); - scalar_type discard_norm = zero, diag_val = zero; - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; + ordinal_type rowIdx = permutation(i); + scalar_mag_type discard_norm = zero; + scalar_type 
diag_val = zero; + bool entryIsDiscarded = true; + ordinal_type numFillEntries = 0; for (size_type alphaIdx = At.graph.row_map(rowIdx); alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { ordinal_type fillRowIdx = At.graph.entries(alphaIdx); @@ -125,13 +156,15 @@ struct MDF_discarded_fill_norm { KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - int(At.graph.entries(alphaIdx)), - int(A.graph.entries(betaIdx)), - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), - int(rowIdx)); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Adding value A[%d,%d]=%f to discard norm of row %d\n", + int(At.graph.entries(alphaIdx)), + int(A.graph.entries(betaIdx)), + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), + int(rowIdx)); + } } } } @@ -139,25 +172,34 @@ struct MDF_discarded_fill_norm { } else if (fillRowIdx == rowIdx) { diag_val = At.values(alphaIdx); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, values(%d)=%f\n", int(rowIdx), - int(alphaIdx), At.values(alphaIdx)); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value detected, values(%d)=%f\n", int(rowIdx), + int(alphaIdx), At.values(alphaIdx)); + } else if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value detected, |values(%d)|=%f\n", + int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); + } } } } // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / (diag_val * diag_val); + discard_norm = discard_norm / KAS::abs(diag_val * diag_val); discarded_fill(rowIdx) = discard_norm; deficiency(rowIdx) = numFillEntries; - if (verbosity > 0) { - const ordinal_type degree = 
ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAS::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); + + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - + A.graph.row_map(rowIdx) - 1); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", + static_cast(rowIdx), + static_cast(KAM::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); + } } } @@ -168,20 +210,23 @@ struct MDF_selective_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; - using KAS = typename Kokkos::ArithTraits; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using size_type = typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + using scalar_mag_type = typename KAS::mag_type; + using KAM = typename Kokkos::ArithTraits; + using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; - const scalar_type zero = KAS::zero(); + const scalar_mag_type zero = KAS::abs(KAS::zero()); crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; col_ind_type update_list; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type deficiency; int verbosity; @@ 
-189,7 +234,7 @@ struct MDF_selective_discarded_fill_norm { ordinal_type factorization_step_, col_ind_type permutation_, col_ind_type update_list_, - values_type discarded_fill_, + values_mag_type discarded_fill_, col_ind_type deficiency_, int verbosity_) : A(A_), At(At_), @@ -202,10 +247,11 @@ struct MDF_selective_discarded_fill_norm { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(update_list(i)); - scalar_type discard_norm = zero, diag_val = zero; - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; + ordinal_type rowIdx = permutation(update_list(i)); + scalar_mag_type discard_norm = zero; + scalar_type diag_val = zero; + bool entryIsDiscarded = true; + ordinal_type numFillEntries = 0; for (size_type alphaIdx = At.graph.row_map(rowIdx); alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { ordinal_type fillRowIdx = At.graph.entries(alphaIdx); @@ -242,14 +288,16 @@ struct MDF_selective_discarded_fill_norm { KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - static_cast(At.graph.entries(alphaIdx)), - static_cast(A.graph.entries(betaIdx)), - static_cast( - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), - static_cast(rowIdx)); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Adding value A[%d,%d]=%f to discard norm of row %d\n", + static_cast(At.graph.entries(alphaIdx)), + static_cast(A.graph.entries(betaIdx)), + static_cast( + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), + static_cast(rowIdx)); + } } } } @@ -257,26 +305,36 @@ struct MDF_selective_discarded_fill_norm { } else if (fillRowIdx == rowIdx) { diag_val = At.values(alphaIdx); if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( 
- "Row %d diagonal value dected, values(%d)=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(At.values(alphaIdx))); + if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value dected, values(%d)=%f\n", + static_cast(rowIdx), static_cast(alphaIdx), + static_cast(At.values(alphaIdx))); + } else if constexpr (std::is_arithmetic_v) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value dected, |values(%d)|=%f\n", + static_cast(rowIdx), static_cast(alphaIdx), + static_cast(KAS::abs(At.values(alphaIdx)))); + } } } } // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / (diag_val * diag_val); + discard_norm = discard_norm / KAS::abs(diag_val * diag_val); discarded_fill(rowIdx) = discard_norm; deficiency(rowIdx) = numFillEntries; - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAS::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); + + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - + A.graph.row_map(rowIdx) - 1); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", + static_cast(rowIdx), + static_cast(KAM::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); + } } } @@ -289,23 +347,24 @@ struct MDF_select_row { entries_type::non_const_type; using row_map_type = typename crs_matrix_type::StaticCrsGraphType::row_map_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; + using size_type = typename crs_matrix_type::size_type; + using ordinal_type = typename 
crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; // type used to perform the reduction // do not confuse it with scalar_type! using value_type = typename crs_matrix_type::ordinal_type; value_type factorization_step; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type deficiency; row_map_type row_map; col_ind_type permutation; - MDF_select_row(value_type factorization_step_, values_type discarded_fill_, - col_ind_type deficiency_, row_map_type row_map_, - col_ind_type permutation_) + MDF_select_row(value_type factorization_step_, + values_mag_type discarded_fill_, col_ind_type deficiency_, + row_map_type row_map_, col_ind_type permutation_) : factorization_step(factorization_step_), discarded_fill(discarded_fill_), deficiency(deficiency_), @@ -399,10 +458,12 @@ struct MDF_factorize_row { row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using size_type = typename crs_matrix_type::size_type; - using value_type = typename crs_matrix_type::value_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + using value_type = typename crs_matrix_type::value_type; + using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; + using value_mag_type = typename values_mag_type::value_type; crs_matrix_type A, At; @@ -415,7 +476,7 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; - values_type discarded_fill; + values_mag_type discarded_fill; col_ind_type factored; ordinal_type selected_row_idx, factorization_step; @@ -426,7 +487,7 @@ struct 
MDF_factorize_row { values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, - values_type discarded_fill_, col_ind_type factored_, + values_mag_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, ordinal_type factorization_step_, int verbosity_) : A(A_), @@ -448,7 +509,7 @@ struct MDF_factorize_row { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type /* idx */) const { const ordinal_type selected_row = permutation(selected_row_idx); - discarded_fill(selected_row) = Kokkos::ArithTraits::max(); + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); // Swap entries in permutation vectors permutation(selected_row_idx) = permutation(factorization_step); @@ -481,32 +542,34 @@ struct MDF_factorize_row { } } row_mapU(factorization_step + 1) = U_entryIdx; - - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - static_cast(selected_row), - static_cast(diag)); - } - - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); - for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(row_mapU(rowIdx))); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", + static_cast(selected_row), + static_cast(diag)); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesU(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesU(entryIdx))); + + if (verbosity > 2) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); + for (ordinal_type 
rowIdx = 0; rowIdx < factorization_step + 1; + ++rowIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(row_mapU(rowIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); + for (size_type entryIdx = row_mapU(0); + entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesU(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); + for (size_type entryIdx = row_mapU(0); + entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesU(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } // Insert the lower part of the selected column of A @@ -526,26 +589,28 @@ struct MDF_factorize_row { } row_mapL(factorization_step + 1) = L_entryIdx; - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - static_cast(factorization_step), - static_cast(factorization_step), - static_cast(factorization_step + 1), - static_cast(row_mapL(factorization_step)), - static_cast(row_mapL(factorization_step + 1))); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesL(entryIdx))); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 2) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", + static_cast(factorization_step), + static_cast(factorization_step), + static_cast(factorization_step + 1), + static_cast(row_mapL(factorization_step)), + static_cast(row_mapL(factorization_step + 1))); + for (size_type entryIdx = 
row_mapL(factorization_step); + entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesL(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); + for (size_type entryIdx = row_mapL(factorization_step); + entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesL(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } // If this was the last row no need to update A and At! @@ -599,13 +664,14 @@ struct MDF_factorize_row { if (A.graph.entries(entryIdx) == fillColIdx) { A.values(entryIdx) -= At.values(alphaIdx) * A.values(betaIdx) / diag_val; - - if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", static_cast(fillRowIdx), - static_cast(fillColIdx), - static_cast(At.values(alphaIdx) * - A.values(betaIdx) / diag_val)); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "A[%d, %d] -= %f\n", static_cast(fillRowIdx), + static_cast(fillColIdx), + static_cast(At.values(alphaIdx) * + A.values(betaIdx) / diag_val)); + } } } } @@ -624,19 +690,21 @@ struct MDF_factorize_row { factored(selected_row) = 1; - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); - for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(A.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); - for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(At.values(entryIdx))); + if constexpr (std::is_arithmetic_v) { + if (verbosity > 0) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); + for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "%f ", static_cast(A.values(entryIdx))); + } + 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); + for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "%f ", static_cast(At.values(entryIdx))); + } + KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } } // operator() diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 672da5b4de..c49c291926 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -66,9 +66,10 @@ template void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using value_type = typename crs_matrix_type::value_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using value_mag_type = typename values_mag_type::value_type; using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; @@ -82,14 +83,14 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { crs_matrix_type Atmp = crs_matrix_type("A fill", A); crs_matrix_type At = KokkosSparse::Impl::transpose_matrix(A); KokkosSparse::sort_crs_matrix(At); - values_type discarded_fill("discarded fill", A.numRows()); + values_mag_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); col_ind_type update_list_length("update list length", 1); typename col_ind_type::HostMirror update_list_length_host = Kokkos::create_mirror_view(update_list_length); col_ind_type update_list("update list", A.numRows()); col_ind_type factored("factored rows", A.numRows()); - 
Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); + Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index 41204c9b4d..f6e4d0bc84 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -192,9 +192,6 @@ void test_mdf() { test_mdf(); \ } -#define NO_TEST_COMPLEX - #include #undef KOKKOSKERNELS_EXECUTE_TEST -#undef NO_TEST_COMPLEX From 8ef7d05e81e99733f53dc363efcbe586a2c19175 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Wed, 5 Apr 2023 12:28:05 -0600 Subject: [PATCH 199/442] Move TeamSpmv and TeamVectorSpmv to KokkosSparse --- .../sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp | 2 +- batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp | 2 +- .../impl/KokkosSparse_spmv_team_impl.hpp | 4 ++-- .../impl/KokkosSparse_spmv_team_spec.hpp | 6 +++--- .../src/KokkosSparse_spmv_team.hpp | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) rename blas/impl/KokkosBlas2_team_spmv_impl.hpp => sparse/impl/KokkosSparse_spmv_team_impl.hpp (98%) rename blas/impl/KokkosBlas2_team_spmv_spec.hpp => sparse/impl/KokkosSparse_spmv_team_spec.hpp (95%) rename blas/src/KokkosBlas2_team_spmv.hpp => sparse/src/KokkosSparse_spmv_team.hpp (97%) diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 9dadce2e78..3b6dbf9769 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -19,7 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBlas2_team_spmv.hpp" +#include "KokkosSparse_spmv_team.hpp" namespace KokkosBatched { diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp 
b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index 5c35feccdd..a508c14cce 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -19,7 +19,7 @@ /// \author Kim Liegeois (knliege@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBlas2_team_spmv.hpp" +#include "KokkosSparse_spmv_team.hpp" namespace KokkosBatched { diff --git a/blas/impl/KokkosBlas2_team_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_team_impl.hpp similarity index 98% rename from blas/impl/KokkosBlas2_team_spmv_impl.hpp rename to sparse/impl/KokkosSparse_spmv_team_impl.hpp index 36313d51b3..1c6efd14f0 100644 --- a/blas/impl/KokkosBlas2_team_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_team_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOSBLAS2_TEAM_SPMV_IMPL_HPP_ -#define KOKKOSBLAS2_TEAM_SPMV_IMPL_HPP_ +#ifndef KOKKOSSPARSE_SPMV_TEAM_IMPL_HPP_ +#define KOKKOSSPARSE_SPMV_TEAM_IMPL_HPP_ #include #include diff --git a/blas/impl/KokkosBlas2_team_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_team_spec.hpp similarity index 95% rename from blas/impl/KokkosBlas2_team_spmv_spec.hpp rename to sparse/impl/KokkosSparse_spmv_team_spec.hpp index 6967d91571..a148833a4a 100644 --- a/blas/impl/KokkosBlas2_team_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_team_spec.hpp @@ -14,14 +14,14 @@ // //@HEADER -#ifndef KOKKOSBLAS2_TEAM_SPMV_SPEC_HPP_ -#define KOKKOSBLAS2_TEAM_SPMV_SPEC_HPP_ +#ifndef KOKKOSSPARSE_SPMV_TEAM_SPEC_HPP_ +#define KOKKOSSPARSE_SPMV_TEAM_SPEC_HPP_ #include #include #include #include -#include +#include namespace KokkosBlas { diff --git a/blas/src/KokkosBlas2_team_spmv.hpp b/sparse/src/KokkosSparse_spmv_team.hpp similarity index 97% rename from blas/src/KokkosBlas2_team_spmv.hpp rename to sparse/src/KokkosSparse_spmv_team.hpp index 8e56553b0c..e62f2807ad 100644 --- a/blas/src/KokkosBlas2_team_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -13,16 +13,16 @@ // SPDX-License-Identifier: 
Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOSBLAS2_TEAM_SPMV_HPP_ -#define KOKKOSBLAS2_TEAM_SPMV_HPP_ +#ifndef KOKKOSSPARSE_SPMV_TEAM_HPP_ +#define KOKKOSSPARSE_SPMV_TEAM_HPP_ -/// \file KokkosBlas2_team_spmv.hpp +/// \file KokkosSparse_spmv_team.hpp #include #include #include #include // requires C++11, but so does Kokkos -#include +#include namespace KokkosBlas { namespace Experimental { From 1c2105bb183ea05d835b0a1fa61551e520040ce9 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Wed, 5 Apr 2023 15:46:49 -0600 Subject: [PATCH 200/442] remove deprecated Rank call --- sparse/impl/KokkosSparse_mdf_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index d5194ee6dd..884a678b85 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -38,7 +38,7 @@ auto create_mag_mirror_view(const Kokkos::View& v) { using src_view_t = Kokkos::View; using KAS = Kokkos::ArithTraits; using mag_type = typename KAS::mag_type; - using data_type = typename add_N_pointers::type; + using data_type = typename add_N_pointers::type; return Kokkos::View( Kokkos::ViewAllocateWithoutInitializing(v.label() + "::Magnitude"), v.layout()); From 51ac81620724247769ba8b5f6167115934add8cf Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Wed, 5 Apr 2023 18:59:34 -0600 Subject: [PATCH 201/442] use crs_matrix view traits for magnitude view --- sparse/impl/KokkosSparse_mdf_impl.hpp | 49 +++++++++------------------ sparse/src/KokkosSparse_mdf.hpp | 8 ++--- 2 files changed, 20 insertions(+), 37 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 884a678b85..d8754e591c 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -24,29 +24,16 @@ namespace KokkosSparse { namespace Impl { -template -struct add_N_pointers { - using type = typename 
add_N_pointers, N - 1>::type; -}; -template -struct add_N_pointers { - using type = T; +template +struct MDF_types { + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + using scalar_mag_type = typename KAS::mag_type; + using values_mag_type = Kokkos::View; }; -template -auto create_mag_mirror_view(const Kokkos::View& v) { - using src_view_t = Kokkos::View; - using KAS = Kokkos::ArithTraits; - using mag_type = typename KAS::mag_type; - using data_type = typename add_N_pointers::type; - return Kokkos::View( - Kokkos::ViewAllocateWithoutInitializing(v.label() + "::Magnitude"), - v.layout()); -} - -template -using mag_mirror_view_t = decltype(create_mag_mirror_view(SrcView())); - template struct MDF_count_lower { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -82,7 +69,7 @@ struct MDF_discarded_fill_norm { using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; using values_type = typename crs_matrix_type::values_type::non_const_type; - using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; + using values_mag_type = typename MDF_types::values_mag_type; using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; using scalar_type = typename crs_matrix_type::value_type; @@ -90,8 +77,6 @@ struct MDF_discarded_fill_norm { using scalar_mag_type = typename KAS::mag_type; using KAM = typename Kokkos::ArithTraits; - const scalar_mag_type zero = KAM::zero(); - crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; @@ -116,8 +101,8 @@ struct MDF_discarded_fill_norm { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { ordinal_type rowIdx = permutation(i); - scalar_mag_type discard_norm = zero; - scalar_type diag_val = zero; + scalar_mag_type discard_norm = KAM::zero(); + scalar_type diag_val = KAS::zero(); bool entryIsDiscarded = true; ordinal_type numFillEntries = 0; 
for (size_type alphaIdx = At.graph.row_map(rowIdx); @@ -217,9 +202,7 @@ struct MDF_selective_discarded_fill_norm { using KAS = typename Kokkos::ArithTraits; using scalar_mag_type = typename KAS::mag_type; using KAM = typename Kokkos::ArithTraits; - using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; - - const scalar_mag_type zero = KAS::abs(KAS::zero()); + using values_mag_type = typename MDF_types::values_mag_type; crs_matrix_type A, At; ordinal_type factorization_step; @@ -248,8 +231,8 @@ struct MDF_selective_discarded_fill_norm { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { ordinal_type rowIdx = permutation(update_list(i)); - scalar_mag_type discard_norm = zero; - scalar_type diag_val = zero; + scalar_mag_type discard_norm = KAM::zero(); + scalar_type diag_val = KAS::zero(); bool entryIsDiscarded = true; ordinal_type numFillEntries = 0; for (size_type alphaIdx = At.graph.row_map(rowIdx); @@ -350,7 +333,7 @@ struct MDF_select_row { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; using scalar_type = typename crs_matrix_type::value_type; - using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; + using values_mag_type = typename MDF_types::values_mag_type; // type used to perform the reduction // do not confuse it with scalar_type! 
@@ -462,7 +445,7 @@ struct MDF_factorize_row { using ordinal_type = typename crs_matrix_type::ordinal_type; using size_type = typename crs_matrix_type::size_type; using value_type = typename crs_matrix_type::value_type; - using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; + using values_mag_type = typename MDF_types::values_mag_type; using value_mag_type = typename values_mag_type::value_type; crs_matrix_type A, At; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index c49c291926..1c5216bfe5 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -66,10 +66,10 @@ template void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using values_mag_type = KokkosSparse::Impl::mag_mirror_view_t; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using value_mag_type = typename values_mag_type::value_type; + using values_mag_type = + typename KokkosSparse::Impl::MDF_types::values_mag_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using value_mag_type = typename values_mag_type::value_type; using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; From 9ff29b38d7fa2f7e36291dfce33907fa720b3cb8 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 27 Mar 2023 13:15:58 -0600 Subject: [PATCH 202/442] ODE: adding new component for time integration Currently we are only providing explicit time integration with ButchedTableau as input parameters. These are mostly the explicit Runge-Kutta methods. 
Adding performance test for Runge-Kutta methods --- CMakeLists.txt | 9 + cmake/KokkosKernels_config.h.in | 1 + cmake/kokkoskernels_components.cmake | 10 +- ode/CMakeLists.txt | 15 + ode/impl/KokkosODE_RungeKuttaTables_impl.hpp | 231 +++++++++++ ode/impl/KokkosODE_RungeKutta_impl.hpp | 110 +++++ ode/unit_test/CMakeLists.txt | 108 +++++ ode/unit_test/Test_ODE.hpp | 22 + ode/unit_test/Test_ODE_RK.hpp | 392 ++++++++++++++++++ ode/unit_test/Test_ODE_RK_chem.hpp | 174 ++++++++ ode/unit_test/backends/Test_Cuda_ODE.cpp | 22 + ode/unit_test/backends/Test_HIP_ODE.cpp | 22 + .../backends/Test_OpenMPTarget_ODE.cpp | 22 + ode/unit_test/backends/Test_OpenMP_ODE.cpp | 22 + ode/unit_test/backends/Test_SYCL_ODE.cpp | 22 + ode/unit_test/backends/Test_Serial_ODE.cpp | 22 + ode/unit_test/backends/Test_Threads_ODE.cpp | 22 + perf_test/CMakeLists.txt | 1 + perf_test/ode/CMakeLists.txt | 28 ++ perf_test/ode/KokkosODE_RK.cpp | 318 ++++++++++++++ 20 files changed, 1572 insertions(+), 1 deletion(-) create mode 100644 ode/CMakeLists.txt create mode 100644 ode/impl/KokkosODE_RungeKuttaTables_impl.hpp create mode 100644 ode/impl/KokkosODE_RungeKutta_impl.hpp create mode 100644 ode/unit_test/CMakeLists.txt create mode 100644 ode/unit_test/Test_ODE.hpp create mode 100644 ode/unit_test/Test_ODE_RK.hpp create mode 100644 ode/unit_test/Test_ODE_RK_chem.hpp create mode 100644 ode/unit_test/backends/Test_Cuda_ODE.cpp create mode 100644 ode/unit_test/backends/Test_HIP_ODE.cpp create mode 100644 ode/unit_test/backends/Test_OpenMPTarget_ODE.cpp create mode 100644 ode/unit_test/backends/Test_OpenMP_ODE.cpp create mode 100644 ode/unit_test/backends/Test_SYCL_ODE.cpp create mode 100644 ode/unit_test/backends/Test_Serial_ODE.cpp create mode 100644 ode/unit_test/backends/Test_Threads_ODE.cpp create mode 100644 perf_test/ode/CMakeLists.txt create mode 100644 perf_test/ode/KokkosODE_RK.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 346a329d82..2baa77084f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -89,6 +89,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) set(KokkosKernels_ENABLE_PERFTESTS ON CACHE BOOL "Whether to build tests including Perfsuite. Default: OFF" FORCE) ENDIF() IF(KokkosKernels_ENABLE_BENCHMARK) + SET(KOKKOSKERNELS_ENABLE_BENCHMARK ON CACHE BOOL "Benchmark enabled") INCLUDE(cmake/kokkoskernels_benchmarks.cmake) ENDIF() ENDIF () @@ -118,6 +119,7 @@ IF (KokkosKernels_INSTALL_TESTING) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(blas/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(graph/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(ode/unit_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) ELSE() @@ -239,6 +241,7 @@ ELSE() MESSAGE(" BLAS: ${KokkosKernels_ENABLE_COMPONENT_BLAS}") MESSAGE(" GRAPH: ${KokkosKernels_ENABLE_COMPONENT_GRAPH}") MESSAGE(" SPARSE: ${KokkosKernels_ENABLE_COMPONENT_SPARSE}") + MESSAGE(" ODE: ${KokkosKernels_ENABLE_COMPONENT_ODE}") MESSAGE("") MESSAGE("Kokkos Kernels TPLs") IF(KOKKOSKERNELS_TPL_LIST) @@ -286,6 +289,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) INCLUDE(sparse/CMakeLists.txt) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_ODE) + INCLUDE(ode/CMakeLists.txt) + ENDIF() FOREACH(DIR ${KK_INCLUDE_DIRS}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${DIR}) @@ -398,6 +404,9 @@ ELSE() IF (KokkosKernels_ENABLE_COMPONENT_SPARSE) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(sparse/unit_test) ENDIF() + IF (KokkosKernels_ENABLE_COMPONENT_ODE) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(ode/unit_test) + ENDIF() IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) IF (KokkosKernels_ENABLE_PERFTESTS) MESSAGE(STATUS "Enabling perf tests.") diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 22a6cd9416..22b7a196fc 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -31,6 +31,7 @@ #cmakedefine HAVE_KOKKOSKERNELS_MKL #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE +#cmakedefine 
KOKKOSKERNELS_ENABLE_BENCHMARK /* Define this macro if experimental features of Kokkoskernels are enabled */ #cmakedefine HAVE_KOKKOSKERNELS_EXPERIMENTAL diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index 56ab1a7c31..1feb5bb8b8 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -44,6 +44,12 @@ KOKKOSKERNELS_ADD_OPTION( BOOL "Whether to build the graph component. Default: OFF" ) +KOKKOSKERNELS_ADD_OPTION( + "ENABLE_COMPONENT_ODE" + OFF + BOOL + "Whether to build the ode component. Default: OFF" +) # Graph depends on everything else because it depends @@ -70,6 +76,7 @@ IF (KokkosKernels_ENABLE_ALL_COMPONENTS) SET(KokkosKernels_ENABLE_COMPONENT_BLAS ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_SPARSE ON CACHE BOOL "" FORCE) SET(KokkosKernels_ENABLE_COMPONENT_GRAPH ON CACHE BOOL "" FORCE) + SET(KokkosKernels_ENABLE_COMPONENT_ODE ON CACHE BOOL "" FORCE) ENDIF() # KOKKOSKERNELS_ALL_COMPONENTS_ENABLED says whether all components are on, @@ -79,7 +86,8 @@ ENDIF() IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED AND KokkosKernels_ENABLE_COMPONENT_BLAS AND KokkosKernels_ENABLE_COMPONENT_GRAPH - AND KokkosKernels_ENABLE_COMPONENT_SPARSE) + AND KokkosKernels_ENABLE_COMPONENT_SPARSE + AND KokkosKernels_ENABLE_COMPONENT_ODE) SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED ON CACHE BOOL "" FORCE) ELSE() SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) diff --git a/ode/CMakeLists.txt b/ode/CMakeLists.txt new file mode 100644 index 0000000000..9d92dc07ba --- /dev/null +++ b/ode/CMakeLists.txt @@ -0,0 +1,15 @@ +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/ode/src) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/ode/impl) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/ode/unit_test) + +IF (NOT KokkosKernels_ENABLE_COMPONENT_BATCHED) + MESSAGE("blas enabled and batched not enabled, we need to include some headers manually!") + LIST(APPEND 
KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched) + LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/src) + LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/impl) +ENDIF() + + +# Adding unit-tests +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode) diff --git a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp new file mode 100644 index 0000000000..3458bdf7b0 --- /dev/null +++ b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp @@ -0,0 +1,231 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP +#define KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP + +#include + +namespace KokkosODE { +namespace Impl { +//===================================================================== +// Generalized RK Explicit ODE solver with embedded error estimation +//===================================================================== + +// Methods supported: +// Euler-Heun Method (RKEH) +// Fehlberg 1-2 (RK12) +// Bogacki-Shampine (BS) +// Fehlberg Method (RKF45) +// Cash-Karp Method (CashKarp) +// Dormand-Prince Method (DormandPrince) + +// Format follows form of Butcher Tableau + +// c1| a00 +// c2| a10 a11 +// c3| a20 a21 a22 +// c4| a30 a31 a32 +// . | . . . +// . | . . . +// . | . . . +// cs| as0 as1 . . . . . . 
ass
+//--------------------------------
+//   | b0 b1 b2 b3 . . . bs
+//   | e0 e1 e2 e3 . . . es
+//
+// And is always in lower triangular form for explicit methods
+// For explicit methods the coefficients on the diagonal will always be zero.
+//
+// Here, nstages = s = number of stages.
+// 'order' refers to the accuracy of the method.
+// The array of aij coefficients is ordered by rows as: a =
+// {a00,a10,a11,a20,a21,a22....}
+// e contains the coefficients used for error estimation (b minus embedded b)
+
+template <int order, int stage, int variant = 0>
+struct ButcherTableau { };
+
+template <>
+struct ButcherTableau<0, 0>  // Forward Euler
+{
+  static constexpr int order   = 1;
+  static constexpr int nstages = 1;
+
+  Kokkos::Array<double, 1> a{{1}};
+  Kokkos::Array<double, 1> b{{1}};
+  Kokkos::Array<double, 1> c{{0}};
+};
+
+template <>
+struct ButcherTableau<1, 1>  // Euler-Heun Method
+{
+  static constexpr int order   = 2;
+  static constexpr int nstages = 2;  // total dimensions, nstagesxnstages system
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{
+      {0.0, 1.0,
+       0.0}};  //(nstages*nstages+nstages)/2 size of lower triangular matrix
+  Kokkos::Array<double, nstages> b{{0.5, 0.5}};
+  Kokkos::Array<double, nstages> c{{0.0, 1.0}};
+  Kokkos::Array<double, nstages> e{{-0.5, 0.5}};
+};
+
+template <>
+struct ButcherTableau<1, 2>  // Known as Fehlberg 1-2 method
+{
+  static constexpr int order   = 2;
+  static constexpr int nstages = 3;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{
+      {0.0, 0.5, 0.0, 1.0 / 256.0, 255.0 / 256.0, 0.0}};
+  Kokkos::Array<double, nstages> b{{1.0 / 512.0, 255.0 / 256.0, 1. / 512}};
+  Kokkos::Array<double, nstages> c{{0.0, 1.0 / 2.0, 1.0}};
+  Kokkos::Array<double, nstages> e{
+      {1.0 / 256.0 - 1.0 / 512.0, 0.0, -1.0 / 512.0}};
+};
+
+template <>
+struct ButcherTableau<2, 3>  // Bogacki-Shampine method
+{
+  static constexpr int order   = 3;
+  static constexpr int nstages = 4;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{
+      {0.0, 0.5, 0.0, 0.0, 3.0 / 4.0, 0.0, 2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0,
+       0.0}};
+  Kokkos::Array<double, nstages> b{{2.0 / 9.0, 1.0 / 3.0, 4.0 / 9.0, 0.0}};
+  Kokkos::Array<double, nstages> c{{0.0, 0.5, 0.75, 1.0}};
+  Kokkos::Array<double, nstages> e{{2.0 / 9.0 - 7.0 / 24.0, 1.0 / 3.0 - 0.25,
+                                    4.0 / 9.0 - 1.0 / 3.0, -1.0 / 8.0}};
+};
+
+template <>
+struct ButcherTableau<4, 5>  // Fehlberg Method
+{
+  static constexpr int order   = 5;
+  static constexpr int nstages = 6;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{{0.0,
+                                                              0.25,
+                                                              0.0,
+                                                              3.0 / 32.0,
+                                                              9.0 / 32.0,
+                                                              0.0,
+                                                              1932.0 / 2197.0,
+                                                              -7200.0 / 2197.0,
+                                                              7296.0 / 2197.0,
+                                                              0.0,
+                                                              439.0 / 216.0,
+                                                              -8.0,
+                                                              3680.0 / 513.0,
+                                                              -845.0 / 4104.0,
+                                                              0.0,
+                                                              -8.0 / 27.0,
+                                                              2.0,
+                                                              -3544.0 / 2565.0,
+                                                              1859.0 / 4104.0,
+                                                              -11.0 / 40.0,
+                                                              0.0}};
+  Kokkos::Array<double, nstages> b{{16.0 / 135.0, 0.0, 6656.0 / 12825.0,
+                                    28561.0 / 56430.0, -9.0 / 50.0,
+                                    2.0 / 55.0}};
+  Kokkos::Array<double, nstages> c{
+      {0.0, 0.25, 3.0 / 8.0, 12.0 / 13.0, 1.0, 0.5}};
+  Kokkos::Array<double, nstages> e{
+      {16.0 / 135.0 - 25.0 / 216.0, 0.0, 6656.0 / 12825.0 - 1408.0 / 2565.0,
+       28561.0 / 56430.0 - 2197.0 / 4104.0, -9.0 / 50.0 + 0.2, 2.0 / 55.0}};
+};
+
+template <>
+struct ButcherTableau<4, 5, 1>  // Cash-Karp
+{
+  static constexpr int order   = 5;
+  static constexpr int nstages = 6;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{
+      {0.0,
+       0.2,
+       0.0,
+       3.0 / 40.0,
+       9.0 / 40.0,
+       0.0,
+       0.3,
+       -0.9,
+       1.2,
+       0.0,
+       -11.0 / 54.0,
+       2.5,
+       -70.0 / 27.0,
+       35.0 / 27.0,
+       0.0,
+       1631.0 / 55296.0,
+       175.0 / 512.0,
+       575.0 / 13824.0,
+       44275.0 / 110592.0,
+       253.0 / 4096.0,
+       0.0}};
+  Kokkos::Array<double, nstages> b{
+      {37.0 / 378.0, 0.0, 250.0 / 621.0, 125.0 / 594.0, 0.0, 512.0 / 1771.0}};
+  Kokkos::Array<double, nstages> c{{0.0, 0.2, 0.3, 0.6, 1.0, 7.0 / 8.0}};
+  Kokkos::Array<double, nstages> e{{37.0 / 378.0 - 2825.0 / 27648.0, 0.0,
+                                    250.0 / 621.0 - 18575.0 / 48384.0,
+                                    125.0 / 594.0 - 13525.0 / 55296.0,
+                                    -277.0 / 14336.0, 512.0 / 1771.0 - 0.25}};
+};
+
+template <>
+struct ButcherTableau<4, 6>  // Referred to as DOPRI5 or RKDP
+{
+  static constexpr int order   = 5;
+  static constexpr int nstages = 7;
+  Kokkos::Array<double, (nstages * nstages + nstages) / 2> a{{0.0,
+                                                              0.2,
+                                                              0.0,
+                                                              3.0 / 40.0,
+                                                              9.0 / 40.0,
+                                                              0.0,
+                                                              44.0 / 45.0,
+                                                              -56.0 / 15.0,
+                                                              32.0 / 9.0,
+                                                              0.0,
+                                                              19372.0 / 6561.0,
+                                                              -25360.0 / 2187.0,
+                                                              64448.0 / 6561.0,
+                                                              -212.0 / 729.0,
+                                                              0.0,
+                                                              9017.0 / 3168.0,
+                                                              -355.0 / 33.0,
+                                                              46732.0 / 5247.0,
+                                                              49.0 / 176.0,
+                                                              -5103.0 / 18656.0,
+                                                              0.0,
+                                                              35.0 / 384.0,
+                                                              0.0,
+                                                              500.0 / 1113.0,
+                                                              125.0 / 192.0,
+                                                              -2187.0 / 6784.0,
+                                                              11.0 / 84.0,
+                                                              0.0}};
+  Kokkos::Array<double, nstages> b{{35.0 / 384.0, 0.0, 500.0 / 1113.0,
+                                    125.0 / 192.0, -2187.0 / 6784.0,
+                                    11.0 / 84.0, 0.0}};
+  Kokkos::Array<double, nstages> c{{0.0, 0.2, 0.3, 0.8, 8.0 / 9.0, 1.0, 1.0}};
+  Kokkos::Array<double, nstages> e{
+      {35.0 / 384.0 - 5179.0 / 57600.0, 0.0, 500.0 / 1113.0 - 7571.0 / 16695.0,
+       125.0 / 192.0 - 393.0 / 640.0, -2187.0 / 6784.0 + 92097.0 / 339200.0,
+       11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}};
+};
+
+}  // namespace Impl
+}  // namespace KokkosODE
+
+#endif  // KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP
diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp
new file mode 100644
index 0000000000..6b9b09c9f7
--- /dev/null
+++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp
@@ -0,0 +1,110 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP
+#define KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP
+
+#include "Kokkos_Core.hpp"
+#include "KokkosBlas1_scal.hpp"
+#include "KokkosBlas1_axpby.hpp"
+
+namespace KokkosODE {
+namespace Impl {
+
+// y_new = y_old + dt*sum(b_i*k_i)                    i in [0, nstages-1]
+// k_i   = f(t+c_i*dt, y_old+dt*sum(a_{ij}*k_j))      j in [0, i-1]
+// we need to compute the k_i and store them (in k_vecs) as we go
+// to use them for k_{i+1} computation; temp is scratch for the stage state.
+template <class ode_type, class table_type, class vec_type, class mv_type, class scalar_type>
+KOKKOS_FUNCTION
+void RKStep(ode_type& ode, const table_type& table, scalar_type t, scalar_type dt,
+            const vec_type& y_old, const vec_type& y_new, const vec_type& temp, const mv_type& k_vecs) {
+  const int neqs    = ode.neqs;
+  const int nstages = table.nstages;
+
+  // first set y_new = y_old
+  for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+    y_new(eqIdx) = y_old(eqIdx);
+  }
+
+  // now accumulate y_new += dt*b_i*k_i
+  {
+    // we always start with y_new += dt*b_0*k0
+    auto k0 = Kokkos::subview(k_vecs, Kokkos::ALL, 0);
+    ode.evaluate_function(t + table.c[0]*dt, dt, y_old, k0);
+    for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+      y_new(eqIdx) += dt*table.b[0]*k0(eqIdx);
+    }
+  }
+
+  // Now that we have k0, we can compute all other k_i
+  // and accumulate them in y_new.
+  for(int stageIdx = 1; stageIdx < nstages; ++stageIdx) {
+    for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+      temp(eqIdx) = 0;
+    }
+
+    for(int idx = 0; idx < stageIdx; ++idx) {
+      for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+        temp(eqIdx) += table.a[stageIdx*(stageIdx + 1)/2 + idx]*k_vecs(eqIdx, idx);
+      }
+    }
+    KokkosBlas::SerialScale::invoke(dt, temp);
+    KokkosBlas::serial_axpy(1, y_old, temp);
+    auto k = Kokkos::subview(k_vecs, Kokkos::ALL, stageIdx);
+    ode.evaluate_function(t + table.c[stageIdx]*dt, dt, temp, k);
+    for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) {
+      y_new(eqIdx) += dt*table.b[stageIdx]*k(eqIdx);
+    }
+  }
+}  // RKStep
+
+
+template <class ode_type, class table_type, class vec_type, class mv_type, class scalar_type>
+KOKKOS_FUNCTION
+void RKSolve(const ode_type& ode, const table_type& table, const scalar_type t_start,
+             const scalar_type t_end, const scalar_type dt, const int max_steps,
+             const vec_type& y0, const vec_type& y, const vec_type& temp, const mv_type& k_vecs) {
+
+  scalar_type t = t_start;
+  for(int stepIdx = 0; (stepIdx < max_steps) && (t < t_end); ++stepIdx) {
+    RKStep(ode, table, t, dt, y0, y, temp, k_vecs);
+    for(int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) {
+      y0(eqIdx) = y(eqIdx);
+    }
+
+    if(t < t_end) {
+      // We may want to print the evolution of the solution over time
+      // with something similar to the statement below but will need
+      // to generalize it and make it GPU friendly first, also it
+      // should be guarded when not doing a debug run, this prints
+      // a lot...
+      // std::cout << " step " << stepIdx << " t=" << t << ", y={";
+      // for(int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) {
+      //   std::cout << y(eqIdx) << " ";
+      // }
+      // std::cout << "}" << std::endl;
+      t += dt;
+    } else {
+      break;
+    }
+  }
+}  // RKSolve
+
+}  // namespace Impl
+}  // namespace KokkosODE
+
+#endif  // KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP
diff --git a/ode/unit_test/CMakeLists.txt b/ode/unit_test/CMakeLists.txt
new file mode 100644
index 0000000000..90d6c45c49
--- /dev/null
+++ b/ode/unit_test/CMakeLists.txt
@@ -0,0 +1,108 @@
+#####################
+#                   #
+# Add include files #
+#                   #
+#####################
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/test_common)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${PACKAGE_SOURCE_DIR}/test_common)
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/src)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${PACKAGE_SOURCE_DIR}/src)
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode)
+
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode/src)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode/src)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/ode/impl)
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/ode/impl)
+
+#####################
+#                   #
+# Define unit-tests #
+#                   #
+#####################
+
+#####################
+#                   #
+# Add GPU backends  #
+#                   #
+#####################
+IF (KOKKOS_ENABLE_CUDA)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_cuda
+    SOURCES
+      ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+      backends/Test_Cuda_ODE.cpp
+    COMPONENTS ode
+  )
+ENDIF ()
+
+IF (KOKKOS_ENABLE_HIP)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_hip
+    SOURCES
+      ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+      backends/Test_HIP_ODE.cpp
+    COMPONENTS ode
+  )
+ENDIF ()
+
+IF (KOKKOS_ENABLE_SYCL)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_sycl
+    SOURCES
+      ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+      backends/Test_SYCL_ODE.cpp
+    COMPONENTS ode
+  )
+ENDIF ()
+
+IF (KOKKOS_ENABLE_OPENMPTARGET)
+  # KOKKOSKERNELS_ADD_UNIT_TEST(
+  #   ode_openmptarget
+  #   SOURCES
+  #     ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+  #     backends/Test_OpenMPTarget_ODE.cpp
+  #   COMPONENTS ode
+  # )
+ENDIF ()
+
+
+
+#####################
+#                   #
+# Add CPU backends  #
+#                   #
+#####################
+IF (KOKKOS_ENABLE_SERIAL)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_serial
+    SOURCES
+      ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+      backends/Test_Serial_ODE.cpp
+    COMPONENTS ode
+  )
+ENDIF ()
+
+IF (KOKKOS_ENABLE_OPENMP)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_openmp
+    SOURCES
+      ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+      backends/Test_OpenMP_ODE.cpp
+    COMPONENTS ode
+  )
+ENDIF ()
+
+IF (KOKKOS_ENABLE_THREADS)
+  KOKKOSKERNELS_ADD_UNIT_TEST(
+    ode_threads
+    SOURCES
+      ${PACKAGE_SOURCE_DIR}/test_common/Test_Main.cpp
+      backends/Test_Threads_ODE.cpp
+    COMPONENTS ode
+  )
+ENDIF ()
+
diff --git a/ode/unit_test/Test_ODE.hpp b/ode/unit_test/Test_ODE.hpp
new file mode 100644
index 0000000000..dd929c48fc
--- /dev/null
+++ b/ode/unit_test/Test_ODE.hpp
@@ -0,0 +1,22 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+#ifndef TEST_ODE_HPP
+#define TEST_ODE_HPP
+
+#include "Test_ODE_RK.hpp"
+#include "Test_ODE_RK_chem.hpp"
+
+#endif  // TEST_ODE_HPP
diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp
new file mode 100644
index 0000000000..4043130c3d
--- /dev/null
+++ b/ode/unit_test/Test_ODE_RK.hpp
@@ -0,0 +1,392 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <gtest/gtest.h>
+#include "KokkosKernels_TestUtils.hpp"
+
+#include "KokkosODE_RungeKutta_impl.hpp"
+#include "KokkosODE_RungeKuttaTables_impl.hpp"
+
+namespace Test {
+
+// damped harmonic undriven oscillator
+// m y'' + c y' + k y = 0
+// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + phi)
+// omega_0 = sqrt(k/m); xi = c / sqrt(4*m*k)
+// A and phi depend on y(0) and y'(0);
+// Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * omega_0 * t)
+// Change of variables: X = [x ]
+//                          [x']
+// Leads to X' = A*X  with A = [ 0  1]
+//                             [-d  0]
+// with d = k/m - (c/(2m))^2 = (1 - xi^2)*omega_0^2
+struct duho {
+
+  constexpr static int neqs = 2;
+  const double m, c, k, d;
+  const double a11 = 0, a12 = 1, a21, a22;
+
+  duho(const double m_, const double c_, const double k_) : m(m_), c(c_), k(k_), d(k_ / m_ - (c_*c_) / (4*m_*m_)), a21(-k / m), a22(-c / m) {};
+
+  template <class vec_type1, class vec_type2>
+  KOKKOS_FUNCTION
+  void evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const {
+    f(0) = a11*y(0) + a12*y(1);
+    f(1) = a21*y(0) + a22*y(1);
+  }
+
+  template <class vec_type>
+  KOKKOS_FUNCTION
+  void solution(const double t, const vec_type& y0, const vec_type& y) const {
+    using KAT = Kokkos::ArithTraits<double>;
+
+    const double gamma = c / (2 * m);
+    const double omega = KAT::sqrt(k / m - gamma * gamma);
+    const double phi   = KAT::atan((y0(1) + gamma * y0(0)) / (y0(0) * omega));
+    const double A     = y0(0) / KAT::cos(phi);
+
+    y(0) = A * KAT::cos(omega * t - phi) * KAT::exp(-t * gamma);
+    y(1) = -y(0) * gamma - omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma);
+  }
+
+}; // duho
+
+template <class ode_type, class vec_type, class scalar_type>
+struct solution_wrapper{
+
+  ode_type ode;
+  scalar_type t;
+  vec_type y_old, y_ref;
+
+  solution_wrapper(const ode_type& ode_, const scalar_type t_, const vec_type& y_old_, const vec_type& y_ref_)
+    : ode(ode_), t(t_), y_old(y_old_), y_ref(y_ref_) {};
+
+  KOKKOS_FUNCTION
+  void operator() (const int /*idx*/) const {
+    ode.solution(t, y_old, y_ref);
+  }
+};
+
+template <class ode_type, class table_type, class vec_type, class mv_type, class scalar_type>
+struct RKSolve_wrapper {
+
+  ode_type my_ode;
+  table_type table;
+  scalar_type tstart, tend, dt;
+  int max_steps;
+  vec_type y_old, y_new, tmp;
+  mv_type kstack;
+
+  RKSolve_wrapper(const ode_type& my_ode_, const table_type& table_,
+                  const scalar_type tstart_, const scalar_type tend_, const scalar_type dt_,
+                  const int max_steps_, const vec_type& y_old_, const vec_type& y_new_,
+                  const vec_type& tmp_, const mv_type& kstack_) :
+    my_ode(my_ode_), table(table_), tstart(tstart_), tend(tend_), dt(dt_), max_steps(max_steps_),
+    y_old(y_old_), y_new(y_new_), tmp(tmp_), kstack(kstack_) {}
+
+  KOKKOS_FUNCTION
+  void operator() (const int /*idx*/) const {
+    KokkosODE::Impl::RKSolve(my_ode, table, tstart, tend, dt, max_steps, y_old, y_new, tmp, kstack);
+  }
+};
+
+template <class ode_type, class table_type, class vec_type, class mv_type, class scalar_type>
+void test_method(const std::string label, ode_type& my_ode,
+                 const scalar_type& tstart, const scalar_type& tend, scalar_type& dt,
+                 const int max_steps, vec_type& y_old, vec_type& y_new,
+                 const Kokkos::View<double**, Kokkos::HostSpace>& ks,
+                 const Kokkos::View<double*, Kokkos::HostSpace>& sol,
+                 typename vec_type::HostMirror y_ref_h) {
+  using execution_space = typename vec_type::execution_space;
+
+  table_type table;
+  vec_type tmp("tmp vector", my_ode.neqs);
+  mv_type kstack("k stack", my_ode.neqs, table.nstages);
+
+  Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+  RKSolve_wrapper<ode_type, table_type, vec_type, mv_type, scalar_type> solve_wrapper(my_ode, table, tstart, tend, dt, max_steps, y_old, y_new, tmp, kstack);
+  Kokkos::parallel_for(my_policy, solve_wrapper);
+
+  auto y_new_h = Kokkos::create_mirror_view(y_new);
+  Kokkos::deep_copy(y_new_h, y_new);
+  auto kstack_h = Kokkos::create_mirror_view(kstack);
+  Kokkos::deep_copy(kstack_h, kstack);
+
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+  std::cout << "\n" << label << std::endl;
+#endif
+  for(int stageIdx = 0; stageIdx < table.nstages; ++stageIdx) {
+    EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(0, stageIdx), 1e-8);
+    EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(1, stageIdx), 1e-8);
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    std::cout << "  k" << stageIdx << "={" << kstack_h(0, stageIdx) << ", " << kstack_h(1, stageIdx) << "}" << std::endl;
+#endif
+  }
+  EXPECT_NEAR_KK(sol(0), y_new_h(0), 1e-8);
+  EXPECT_NEAR_KK(sol(1), y_new_h(1), 1e-8);
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+  std::cout << "  y={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl;
+  std::cout << "  error={" << Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0))
+            << ", " << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) << "}" << std::endl;
+#endif
+
+} // test_method
+
+template <class execution_space>
+void test_RK() {
+  using vec_type = Kokkos::View<double*,  execution_space>;
+  using mv_type  = Kokkos::View<double**, execution_space>;
+
+  duho my_oscillator(1, 1, 4);
+  const int neqs = my_oscillator.neqs;
+
+  vec_type y("solution", neqs), f("function", neqs);
+  auto y_h = Kokkos::create_mirror(y);
+  y_h(0) = 1; y_h(1) = 0;
+  Kokkos::deep_copy(y, y_h);
+
+  constexpr double tstart = 0, tend = 10;
+  constexpr int max_steps = 1000;
+  double dt = (tend - tstart) / max_steps;
+  vec_type y_new("y new", neqs), y_old("y old", neqs);
+
+  // Since y_old_h will be reused to set initial conditions
+  // for each method tested we do not want to use
+  // create_mirror_view which would not do a copy
+  // when y_old is in HostSpace.
+  typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old);
+  y_old_h(0) = 1; y_old_h(1) = 0;
+
+  // First compute analytical solution as reference
+  // and to evaluate the error from each RK method.
+  vec_type y_ref("reference value", neqs);
+  auto y_ref_h = Kokkos::create_mirror(y_ref);
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+    solution_wrapper<duho, vec_type, double> wrapper(my_oscillator, tstart + dt, y_old, y_ref);
+    Kokkos::parallel_for(my_policy, wrapper);
+
+    Kokkos::deep_copy(y_ref_h, y_ref);
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    std::cout << "\nAnalytical solution" << std::endl;
+    std::cout << "  y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl;
+#endif
+  }
+
+  // We perform a single step using a RK method
+  // and check the values for ki and y_new against
+  // expected values.
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    double ks_raw[2] = {0, -4};
+    Kokkos::View<double**, Kokkos::HostSpace> ks(ks_raw, 2, 1);
+    double sol_raw[2] = {1, -0.04};
+    Kokkos::View<double*, Kokkos::HostSpace> sol(sol_raw, 2);
+    test_method<duho, KokkosODE::Impl::ButcherTableau<0, 0>, vec_type, mv_type, double>("Euler-Forward", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h);
+  }
+
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    double ks_raw[4] = {0, -0.04,
+                        -4, -3.96};
+    Kokkos::View<double**, Kokkos::HostSpace> ks(ks_raw, 2, 2);
+    double sol_raw[2] = {0.9998, -0.0398};
+    Kokkos::View<double*, Kokkos::HostSpace> sol(sol_raw, 2);
+    test_method<duho, KokkosODE::Impl::ButcherTableau<1, 1>, vec_type, mv_type, double>("Euler-Heun", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h);
+  }
+
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    double ks_raw[6] = {0, -0.02, -0.03980078,
+                        -4, -3.98, -3.95940234};
+    Kokkos::View<double**, Kokkos::HostSpace> ks(ks_raw, 2, 3);
+    double sol_raw[2] = {0.9998, -0.03979999};
+    Kokkos::View<double*, Kokkos::HostSpace> sol(sol_raw, 2);
+    test_method<duho, KokkosODE::Impl::ButcherTableau<1, 2>, vec_type, mv_type, double>("RKF-12", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h);
+  }
+
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    double ks_raw[8] = {0, -0.02, -0.02985, -0.039798,
+                        -4, -3.98, -3.96955, -3.95940467};
+    Kokkos::View<double**, Kokkos::HostSpace> ks(ks_raw, 2, 4);
+    double sol_raw[2] = {0.99980067, -0.039798};
+    Kokkos::View<double*, Kokkos::HostSpace> sol(sol_raw, 2);
+    test_method<duho, KokkosODE::Impl::ButcherTableau<2, 3>, vec_type, mv_type, double>("RKBS", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h);
+  }
+
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, -0.03979499, -0.0199505,
+                         -4, -3.99, -3.98491562, -3.96257222, -3.95941166, -3.97984883};
+    Kokkos::View<double**, Kokkos::HostSpace> ks(ks_raw, 2, 6);
+    double sol_raw[2] = { 0.99980067, -0.03979801};
+    Kokkos::View<double*, Kokkos::HostSpace> sol(sol_raw, 2);
+    test_method<duho, KokkosODE::Impl::ButcherTableau<4, 5>, vec_type, mv_type, double>("RKF-45", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h);
+  }
+
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, -0.03979862, -0.03484563,
+                         -4, -3.992, -3.987946, -3.97578551, -3.95940328, -3.96454357};
+    Kokkos::View<double**, Kokkos::HostSpace> ks(ks_raw, 2, 6);
+    double sol_raw[2] = { 0.99980067, -0.03979801};
+    Kokkos::View<double*, Kokkos::HostSpace> sol(sol_raw, 2);
+    test_method<duho, KokkosODE::Impl::ButcherTableau<4, 5, 1>, vec_type, mv_type, double>("Cash-Karp", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h);
+  }
+
+} // test_RK
+
+template <class ode_type, class table_type, class vec_type, class mv_type, class scalar_type>
+void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& tend,
+               Kokkos::View<double*, Kokkos::HostSpace> dt, const int max_steps,
+               typename vec_type::HostMirror& y_old_h, typename vec_type::HostMirror& y_ref_h,
+               typename vec_type::HostMirror& error) {
+  using execution_space = typename vec_type::execution_space;
+
+  table_type table;
+  vec_type tmp("tmp vector", my_ode.neqs);
+  mv_type kstack("k stack", my_ode.neqs, table.nstages);
+
+  vec_type y_new("solution", my_ode.neqs);
+  vec_type y_old("intial conditions", my_ode.neqs);
+  auto y_new_h = Kokkos::create_mirror(y_new);
+
+  Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+  for(int idx = 0; idx < dt.extent_int(0); ++idx) {
+    Kokkos::deep_copy(y_old, y_old_h);
+    Kokkos::deep_copy(y_new, y_old_h);
+    RKSolve_wrapper<ode_type, table_type, vec_type, mv_type, scalar_type> solve_wrapper(my_ode, table, tstart, tend, dt(idx), max_steps, y_old, y_new, tmp, kstack);
+    Kokkos::parallel_for(my_policy, solve_wrapper);
+
+    Kokkos::deep_copy(y_new_h, y_new);
+    error(idx) = Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0));
+
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    std::cout << "dt=" << dt(idx) << ", error=" << error(idx)
+              << ", solution: {" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl;
+#endif
+  }
+
+} // test_rate
+
+template <class execution_space>
+void test_convergence_rate() {
+  using vec_type = Kokkos::View<double*,  execution_space>;
+  using mv_type  = Kokkos::View<double**, execution_space>;
+
+  duho my_oscillator(1, 1, 4);
+  const int neqs = my_oscillator.neqs;
+
+  vec_type y("solution", neqs), f("function", neqs);
+  auto y_h = Kokkos::create_mirror(y);
+  y_h(0) = 1; y_h(1) = 0;
+  Kokkos::deep_copy(y, y_h);
+
+  constexpr double tstart = 0, tend = 1.024;
+  constexpr int max_steps = 1024;
+  Kokkos::View<double*, Kokkos::HostSpace> dt("Time Steps", 8);
+  dt(0) = 0.002; dt(1) = 0.004; dt(2) = 0.008; dt(3) = 0.016;
+  dt(4) = 0.032; dt(5) = 0.064; dt(6) = 0.128; dt(7) = 0.256;
+  vec_type y_new("y new", neqs), y_old("y old", neqs);
+
+  // Since y_old_h will be reused to set initial conditions
+  // for each method tested we do not want to use
+  // create_mirror_view which would not do a copy
+  // when y_old is in HostSpace.
+  typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old);
+  y_old_h(0) = 1; y_old_h(1) = 0;
+
+  // First compute analytical solution as reference
+  // and to evaluate the error from each RK method.
+  vec_type y_ref("reference value", neqs);
+  auto y_ref_h = Kokkos::create_mirror(y_ref);
+  {
+    Kokkos::deep_copy(y_old, y_old_h);
+    Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+    solution_wrapper<duho, vec_type, double> wrapper(my_oscillator, tend, y_old, y_ref);
+    Kokkos::parallel_for(my_policy, wrapper);
+
+    Kokkos::deep_copy(y_ref_h, y_ref);
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    std::cout << "\nAnalytical solution" << std::endl;
+    std::cout << "  y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl;
+#endif
+  }
+
+  typename vec_type::HostMirror error("error", dt.extent(0));
+  test_rate<duho, KokkosODE::Impl::ButcherTableau<1, 1>, vec_type, mv_type, double>(my_oscillator, tstart, tend, dt, max_steps, y_old_h, y_ref_h, error);
+
+  for(int idx = 1; idx < dt.extent_int(0) - 2; ++idx) {
+    double expected_ratio = Kokkos::pow(dt(idx + 1) / dt(idx), KokkosODE::Impl::ButcherTableau<1, 1>::order);
+    double actual_ratio = error(idx+1) / error(idx);
+    EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.15);
+
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio);
+    std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio
+              << ", rel diff: " << rel_ratio_diff << std::endl;
+#endif
+  }
+
+  Kokkos::deep_copy(error, 0);
+  test_rate<duho, KokkosODE::Impl::ButcherTableau<2, 3>, vec_type, mv_type, double>(my_oscillator, tstart, tend, dt, max_steps, y_old_h, y_ref_h, error);
+
+  for(int idx = 1; idx < dt.extent_int(0) - 2; ++idx) {
+    double expected_ratio = Kokkos::pow(dt(idx + 1) / dt(idx), KokkosODE::Impl::ButcherTableau<2, 3>::order);
+    double actual_ratio = error(idx+1) / error(idx);
+    EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05);
+
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio);
+    std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio
+              << ", rel diff: " << rel_ratio_diff << std::endl;
+#endif
+  }
+
+  Kokkos::deep_copy(error, 0);
+  test_rate<duho, KokkosODE::Impl::ButcherTableau<4, 5>, vec_type, mv_type, double>(my_oscillator, tstart, tend, dt, max_steps, y_old_h, y_ref_h, error);
+
+  for(int idx = 1; idx < dt.extent_int(0) - 2; ++idx) {
+    double expected_ratio = Kokkos::pow(dt(idx + 1) / dt(idx), KokkosODE::Impl::ButcherTableau<4, 5>::order);
+    double actual_ratio = error(idx+1) / error(idx);
+    EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05);
+
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio);
+    std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio
+              << ", rel diff: " << rel_ratio_diff << std::endl;
+#endif
+  }
+} // test_convergence_rate
+} // namespace Test
+
+int test_RK() {
+  Test::test_RK<TestExecSpace>();
+
+  return 1;
+}
+
+int test_RK_conv_rate() {
+  Test::test_convergence_rate<TestExecSpace>();
+  return 1;
+}
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, RKSolve_serial) { test_RK(); }
+TEST_F(TestCategory, RK_conv_rate) { test_RK_conv_rate(); }
+#endif
diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp
new file mode 100644
index 0000000000..e3fee4b461
--- /dev/null
+++ b/ode/unit_test/Test_ODE_RK_chem.hpp
@@ -0,0 +1,174 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+namespace Test {
+
+// R1 = 1e-6*1.85e10 * exp(-15618 / T) * (reac) * (1 - (1 - 10^-9) * reac)
+// d(reac)/dt = -R1
+// d(prod)/dt = R1
+struct chem_model_1 {
+
+  constexpr static int neqs = 2;
+  // constexpr static double alpha = 1e-6*1.85e10;
+  constexpr static double alpha = 1.85e10;
+  constexpr static double beta  = 15618;
+  constexpr static double gamma = 1 - 1e-9;
+
+  const double tstart, tend, T0, T1;
+
+  chem_model_1(const double tstart_ = 0, const double tend_ = 100,
+               const double T0_ = 300, const double T1_ = 800) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {};
+
+  template <class vec_type1, class vec_type2>
+  KOKKOS_FUNCTION
+  void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const {
+    // First compute the temperature
+    // using linear ramp from T0 to T1
+    // between tstart and tend.
+    double T = (T1 - T0) * (t - tstart) / (tend - tstart) + T0;
+
+    // Evaluate the chemical reaction rate
+    f(0) = -alpha * Kokkos::exp(-beta / T) * y(0) * (1 - gamma * y(0));
+    f(1) = -f(0);
+  }
+
+};
+
+struct chem_model_2 {
+
+  constexpr static int neqs = 7;
+  constexpr static double alpha1 = 1e-6*3334169440721739.0*1500;
+  constexpr static double beta1  = 207850000.0 / 8314.0;
+  constexpr static double alpha2 = 1e-6*49997793980831.89*1500;
+  constexpr static double beta2  = 207850000.0 / 8314.0;
+
+  const double tstart, tend, T0, T1;
+
+  chem_model_2(const double tstart_ = 0, const double tend_ = 1200,
+               const double T0_ = 300, const double T1_ = 1000) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {};
+
+  template <class vec_type1, class vec_type2>
+  KOKKOS_FUNCTION
+  void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const {
+    // Temperature: linear ramp that starts from T0 at t = tstart.
+    // NOTE(review): the ramp end is hard-coded to 1500 rather than tend
+    // (chem_model_1 uses tend) -- confirm this is intended.
+    double T = (T1 - T0) * (t - tstart) / (1500 - tstart) + T0;
+
+    // Evaluate the chemical reaction rates
+    double R1 = y(0) * alpha1 * Kokkos::exp(-beta1 / T);
+    double R2 = y(1) * alpha2 * Kokkos::exp(-beta2 / T);
+
+    // Evaluate the chemical reaction rate
+    f(0) = -R1;
+    f(1) = -R2;
+    f(2) = R1 + 0.08 * R2;
+    f(3) = 0.147 * R2;
+    f(4) = 0.453 * R2;
+    f(5) = 0.187 * R2;
+    f(6) = 0.133 * R2;
+  }
+};
+
+template <class execution_space>
+void test_chem() {
+  using vec_type   = Kokkos::View<double*,  execution_space>;
+  using mv_type    = Kokkos::View<double**, execution_space>;
+  using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>;
+
+  {
+    chem_model_1 chem_model;
+    const int neqs = chem_model.neqs;
+    const int max_steps = 15000;
+    const double dt = 0.1;
+
+    table_type table;
+    vec_type tmp("tmp vector", neqs);
+    mv_type kstack("k stack", neqs, table.nstages);
+
+    // Set initial conditions
+    vec_type y_new("solution", neqs);
+    vec_type y_old("initial conditions", neqs);
+    auto y_old_h = Kokkos::create_mirror(y_old);
+    y_old_h(0) = 1; y_old_h(1) = 0;
+    Kokkos::deep_copy(y_old, y_old_h);
+    Kokkos::deep_copy(y_new, y_old_h);
+
+    Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+    RKSolve_wrapper<chem_model_1, table_type, vec_type, mv_type, double> solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend,
+                  dt, max_steps, y_old, y_new, tmp, kstack);
+    Kokkos::parallel_for(my_policy, solve_wrapper);
+
+    auto y_new_h = Kokkos::create_mirror(y_new);
+    Kokkos::deep_copy(y_new_h, y_new);
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    std::cout << "\nChem model 1" << std::endl;
+    std::cout << "  t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl;
+    std::cout << "  T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl;
+    std::cout << "  dt=" << dt << std::endl;
+    std::cout << "  y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl;
+    std::cout << "  y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl;
+#endif
+  }
+
+  {
+    chem_model_2 chem_model;
+    const int neqs = chem_model.neqs;
+    const int max_steps = 1500;
+    const double dt = 1;
+
+    table_type table;
+    vec_type tmp("tmp vector", neqs);
+    mv_type kstack("k stack", neqs, table.nstages);
+
+    // Set initial conditions
+    vec_type y_new("solution", neqs);
+    vec_type y_old("initial conditions", neqs);
+    auto y_old_h = Kokkos::create_mirror(y_old);
+    y_old_h(0) = 0.25; y_old_h(1) = 0.25; y_old_h(2) = 0; y_old_h(3) = 0; y_old_h(4) = 0; y_old_h(5) = 0; y_old_h(6) = 0;
+    Kokkos::deep_copy(y_old, y_old_h);
+    Kokkos::deep_copy(y_new, y_old_h);
+
+    Kokkos::RangePolicy<execution_space> my_policy(0, 1);
+    RKSolve_wrapper<chem_model_2, table_type, vec_type, mv_type, double> solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend,
+                  dt, max_steps, y_old, y_new, tmp, kstack);
+    Kokkos::parallel_for(my_policy, solve_wrapper);
+
+    auto y_new_h = Kokkos::create_mirror(y_new);
+    Kokkos::deep_copy(y_new_h, y_new);
+#if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
+    std::cout << "\nChem model 2" << std::endl;
+    std::cout << "  t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl;
+    std::cout << "  T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl;
+    std::cout << "  dt=" << dt << std::endl;
+    std::cout << "  y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " << y_old_h(2)
+              << ", " << y_old_h(3) << ", " << y_old_h(4) << ", " << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl;
+    std::cout << "  y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " << y_new_h(2)
+              << ", " << y_new_h(3) << ", " << y_new_h(4) << ", " << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl;
+#endif
+  }
+} // test_chem
+} // namespace Test
+
+int test_chem_models() {
+  Test::test_chem<TestExecSpace>();
+
+  return 1;
+}
+
+#if defined(KOKKOSKERNELS_INST_DOUBLE)
+TEST_F(TestCategory, RK_chem_models) { test_chem_models(); }
+#endif
diff --git a/ode/unit_test/backends/Test_Cuda_ODE.cpp b/ode/unit_test/backends/Test_Cuda_ODE.cpp
new file mode 100644
index 0000000000..c901a6b116
--- /dev/null
+++ b/ode/unit_test/backends/Test_Cuda_ODE.cpp
@@ -0,0 +1,22 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v.
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_CUDA_ODE_CPP +#define TEST_CUDA_ODE_CPP + +#include +#include + +#endif // TEST_CUDA_ODE_CPP diff --git a/ode/unit_test/backends/Test_HIP_ODE.cpp b/ode/unit_test/backends/Test_HIP_ODE.cpp new file mode 100644 index 0000000000..8f0d8838dc --- /dev/null +++ b/ode/unit_test/backends/Test_HIP_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_HIP_ODE_CPP +#define TEST_HIP_ODE_CPP + +#include "Test_HIP.hpp" +#include "Test_ODE.hpp" + +#endif // TEST_HIP_ODE_CPP diff --git a/ode/unit_test/backends/Test_OpenMPTarget_ODE.cpp b/ode/unit_test/backends/Test_OpenMPTarget_ODE.cpp new file mode 100644 index 0000000000..049f0778da --- /dev/null +++ b/ode/unit_test/backends/Test_OpenMPTarget_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMPTARGET_ODE_CPP +#define TEST_OPENMPTARGET_ODE_CPP + +#include "Test_OpenMPTarget.hpp" +#include "Test_ODE.hpp" + +#endif // TEST_OPENMPTARGET_ODE_CPP diff --git a/ode/unit_test/backends/Test_OpenMP_ODE.cpp b/ode/unit_test/backends/Test_OpenMP_ODE.cpp new file mode 100644 index 0000000000..3cefeb4666 --- /dev/null +++ b/ode/unit_test/backends/Test_OpenMP_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_OPENMP_ODE_CPP +#define TEST_OPENMP_ODE_CPP + +#include +#include + +#endif // TEST_OPENMP_ODE_CPP diff --git a/ode/unit_test/backends/Test_SYCL_ODE.cpp b/ode/unit_test/backends/Test_SYCL_ODE.cpp new file mode 100644 index 0000000000..9fd7b8f034 --- /dev/null +++ b/ode/unit_test/backends/Test_SYCL_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SYCL_ODE_CPP +#define TEST_SYCL_ODE_CPP + +#include +#include + +#endif // TEST_SYCL_ODE_CPP diff --git a/ode/unit_test/backends/Test_Serial_ODE.cpp b/ode/unit_test/backends/Test_Serial_ODE.cpp new file mode 100644 index 0000000000..31ef4b0489 --- /dev/null +++ b/ode/unit_test/backends/Test_Serial_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_SERIAL_ODE_CPP +#define TEST_SERIAL_ODE_CPP + +#include +#include + +#endif // TEST_SERIAL_ODE_CPP diff --git a/ode/unit_test/backends/Test_Threads_ODE.cpp b/ode/unit_test/backends/Test_Threads_ODE.cpp new file mode 100644 index 0000000000..ff438a5883 --- /dev/null +++ b/ode/unit_test/backends/Test_Threads_ODE.cpp @@ -0,0 +1,22 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef TEST_THREADS_ODE_CPP +#define TEST_THREADS_ODE_CPP + +#include +#include + +#endif // TEST_THREADS_ODE_CPP diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index a74e6043fc..d46b85b4d7 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -48,6 +48,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) ADD_COMPONENT_SUBDIRECTORY(graph) ADD_COMPONENT_SUBDIRECTORY(sparse) ADD_COMPONENT_SUBDIRECTORY(blas) + ADD_COMPONENT_SUBDIRECTORY(ode) ADD_SUBDIRECTORY(performance) #ADD_SUBDIRECTORY(common) diff --git a/perf_test/ode/CMakeLists.txt b/perf_test/ode/CMakeLists.txt new file mode 100644 index 0000000000..24ddf592bc --- /dev/null +++ b/perf_test/ode/CMakeLists.txt @@ -0,0 +1,28 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +if(KOKKOSKERNELS_ENABLE_BENCHMARK) + SET(BENCHMARK_NAME ${PACKAGE_NAME}_ode_runge_kutta) + + ADD_EXECUTABLE( + ${BENCHMARK_NAME} + KokkosODE_RK.cpp + ) + TARGET_LINK_LIBRARIES( + ${BENCHMARK_NAME} + PRIVATE benchmark::benchmark Kokkos::kokkoskernels + ) + TARGET_INCLUDE_DIRECTORIES( + ${BENCHMARK_NAME} + SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include + ) + ADD_TEST( + NAME ${BENCHMARK_NAME} + COMMAND ${BENCHMARK_NAME} + ) +else() + KOKKOSKERNELS_ADD_EXECUTABLE( + ${PACKAGE_NAME}_ode_runge_kutta + SOURCES KokkosODE_RK.cpp + ) +endif() diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp new file mode 100644 index 0000000000..987cd0610c --- /dev/null +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -0,0 +1,318 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosODE_RungeKuttaTables_impl.hpp" +#include "KokkosODE_RungeKutta_impl.hpp" +#include "KokkosKernels_TestUtils.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK +#include +#include "Benchmark_Context.hpp" +#else +#include "KokkosKernels_perf_test_utilities.hpp" +#endif + +namespace { +// R1 = 1e-6*1.85e10 * exp(-15618 / T) * (reac) ( 1 – (1- 10^-9) reac) +// d(reac)/dt = -R1 +// d(prod)/dt = R1 +struct chem_model_1 { + + constexpr static int neqs = 2; + // constexpr static double alpha = 1e-6*1.85e10; + constexpr static double alpha = 1.85e10; + constexpr static double beta = 15618; + constexpr static double gamma = 1 - 10^-9; + + const double tstart, tend, T0, T1; + + chem_model_1(const double tstart_ = 0, const double tend_ = 300, + const double T0_ = 300, const double T1_ = 800) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {}; + + template + KOKKOS_FUNCTION + void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { + // First compute the temperature + // using linear ramp from T0 to T1 + // between tstart and tend. + double T = (T1 - T0) * (t - tstart) / (tend - tstart) + T0; + + // Evaluate the chemical reaction rate + f(0) = -alpha * Kokkos::exp(-beta / T) * y(0) * (1 - gamma * y(0)); + f(1) = -f(0); + } + +}; + +// More complex chemical reaction involving two reacting +// species foam A and foam B, that become 5 products. 
+// The temperature is capped at 1000K once t reaches 1500s +struct chem_model_2 { + + constexpr static int neqs = 7; + constexpr static double alpha1 = 1e-6*3334169440721739.0*1500; + constexpr static double beta1 = 207850000.0 / 8314.0; + constexpr static double alpha2 = 1e-6*49997793980831.89*1500; + constexpr static double beta2 = 207850000.0 / 8314.0; + + const double tstart, tend, T0, T1; + + chem_model_2(const double tstart_ = 0, const double tend_ = 2000, + const double T0_ = 300, const double T1_ = 1000) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {}; + + template + KOKKOS_FUNCTION + void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { + // First compute the temperature + // using linear ramp from T0 to T1 + // between tstart and tend. + double T = ((T1 - T0) * (t - tstart) / (1500 - tstart) + T0 < 1000) ? (T1 - T0) * (t - tstart) / (1500 - tstart) + T0 : 1000; + + // Evaluate the chemical reaction rates + double R1 = y(0) * alpha1 * Kokkos::exp(-beta1 / T); + double R2 = y(1) * alpha2 * Kokkos::exp(-beta2 / T); + + // Evaluate the chemical reaction rate + f(0) = -R1; + f(1) = -R2; + f(2) = R1 + 0.08 * R2; + f(3) = 0.147 * R2; + f(4) = 0.453 * R2; + f(5) = 0.187 * R2; + f(6) = 0.133 * R2; + } + +}; + +template +struct RKSolve_wrapper { + + ode_type my_ode; + table_type table; + scalar_type tstart, tend, dt; + int max_steps; + vec_type y_old, y_new, tmp; + mv_type kstack; + + RKSolve_wrapper(const ode_type& my_ode_, const table_type& table_, + const scalar_type tstart_, const scalar_type tend_, const scalar_type dt_, + const int max_steps_, const vec_type& y_old_, const vec_type& y_new_, + const vec_type& tmp_, const mv_type& kstack_) : + my_ode(my_ode_), table(table_), tstart(tstart_), tend(tend_), dt(dt_), max_steps(max_steps_), + y_old(y_old_), y_new(y_new_), tmp(tmp_), kstack(kstack_) {} + + KOKKOS_FUNCTION + void operator() (const int idx) const { + + // Take subviews to create the local 
problem + auto local_y_old = Kokkos::subview( y_old, Kokkos::pair(2*idx, 2*idx + 1)); + auto local_y_new = Kokkos::subview( y_new, Kokkos::pair(2*idx, 2*idx + 1)); + auto local_tmp = Kokkos::subview( tmp, Kokkos::pair(2*idx, 2*idx + 1)); + auto local_kstack = Kokkos::subview(kstack, Kokkos::pair(2*idx, 2*idx + 1), Kokkos::ALL()); + + // Run Runge-Kutta time integrator + KokkosODE::Impl::RKSolve(my_ode, table, tstart, tend, dt, max_steps, + local_y_old, local_y_new, local_tmp, local_kstack); + } +}; + +struct rk_input_parameters{ + + int num_odes; + int model; + + rk_input_parameters(const int num_odes_, const int model_) : num_odes(num_odes_), model(model_) {}; + +}; + +} // namespace (anonymous) + + +#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK +void run_ode_chem(benchmark::State& state, const rk_input_parameters& params) { +#else +void run_ode_chem(const std::vector& state, const rk_input_parameters& params) { +#endif + using execution_space = Kokkos::DefaultExecutionSpace; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>; + + const int num_odes = params.num_odes; + const int model = params.model; + + switch (model) { + case 1: { + chem_model_1 chem_model; + const int neqs = chem_model.neqs; + const int max_steps = 15000; + const double dt = 0.1; + + table_type table; + vec_type tmp("tmp vector", neqs*num_odes); + mv_type kstack("k stack", neqs*num_odes, table.nstages); + + // Set initial conditions + vec_type y_new("solution", neqs*num_odes); + vec_type y_old("initial conditions", neqs*num_odes); + auto y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 1; y_old_h(1) = 0; + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + + Kokkos::RangePolicy my_policy(0, num_odes); + RKSolve_wrapper solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend, + dt, max_steps, y_old, y_new, tmp, kstack); + + Kokkos::Timer time; + time.reset(); + for(auto _ : state) { + 
(void) _; + Kokkos::parallel_for(my_policy, solve_wrapper); + Kokkos::fence(); + } + double run_time = time.seconds(); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); + std::cout << "\nChem model 1" << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; + std::cout << " dt=" << dt << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; + std::cout << " num odes: " << num_odes << std::endl; + std::cout << " time elapsed: " << run_time << std::endl; + break; + } + case 2: { + chem_model_2 chem_model; + const int neqs = chem_model.neqs; + const int max_steps = 15000; + const double dt = 0.1; + + table_type table; + vec_type tmp("tmp vector", neqs*num_odes); + mv_type kstack("k stack", neqs*num_odes, table.nstages); + + // Set initial conditions + vec_type y_new("solution", neqs*num_odes); + vec_type y_old("initial conditions", neqs*num_odes); + auto y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 0.25; y_old_h(1) = 0.25; y_old(2) = 0; + y_old(3) = 0; y_old(4) = 0; y_old(5) = 0; y_old(6) = 0; + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + + Kokkos::RangePolicy my_policy(0, num_odes); + RKSolve_wrapper solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend, + dt, max_steps, y_old, y_new, tmp, kstack); + + Kokkos::Timer time; + time.reset(); + for(auto _ : state) { + (void) _; + Kokkos::parallel_for(my_policy, solve_wrapper); + Kokkos::fence(); + } + double run_time = time.seconds(); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); + std::cout << "\nChem model 2" << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; + std::cout << " T0=" << chem_model.T0 << 
", Tn=" << chem_model.T1 << std::endl; + std::cout << " dt=" << dt << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; + std::cout << " num odes: " << num_odes << std::endl; + std::cout << " time elapsed: " << run_time << std::endl; + break; + } + } +} + +#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK +void run_benchmark_wrapper(benchmark::State& state) { + rk_input_parameters params(state.range(0), state.range(1)); + + run_ode_chem(state, params); +} + +#else +template + void run_perftest_wrapper(int argc, char** argv, perf_test::CommonInputParams) { + rk_input_parameters params(1000, 1); + + // Loop over arguments, parse them and fill params struct + for(int argIdx = 1; argIdx < argc - 1; ++argIdx) { + if((0 == Test::string_compare_no_case(argv[argIdx], "-N"))) { + params.num_odes = atoi(argv[argIdx + 1]); + } + if((0 == Test::string_compare_no_case(argv[argIdx], "--model"))) { + params.model = atoi(argv[argIdx + 1]); + } + } + + // Basically we call the run routine + // with std::vector state = {1} which + // means we will do the perf test once. + // we could change that with command + // line argument logic in the future. 
+ // std::vector tmp({1}); + run_ode_chem({1}, params); +} +#endif + + +#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK +// Benchmark style call for performance +// monitoring over time in a highly controlled +// environment +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + + std::string bench_name = "KokkosODE_chem_models"; + benchmark::RegisterBenchmark(bench_name.c_str(), run_benchmark_wrapper) + ->UseRealTime() + ->ArgNames({"n", "model"}) + ->Args({1000, 1}); + + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + + return 0; +} + +#else + +// perf test style call for testing and +// tunning of algorithms in a convenient way +#define KOKKOSKERNELS_PERF_TEST_NAME run_perftest_wrapper +#include "KokkosKernels_perf_test_instantiation.hpp" +int main(int argc, char** argv) { + return main_instantiation(argc, argv); +} // main + +#endif // KOKKOSKERNELS_ENABLE_BENCHMARK From 22cd43ce1f059d1a459d5c37d0d73cd085a76fc9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 29 Mar 2023 17:58:33 -0600 Subject: [PATCH 203/442] ODE: adding support for adaptive time stepping Adding new ODE_params struct that is used to control adaptivity in ODE solver and to set basic parameters such as the number of steps desired, max number of steps allowed, etc... Adding RK4 in case we need it to jump start some implicit methods in the upcoming work... 
--- ode/impl/KokkosODE_RungeKuttaTables_impl.hpp | 69 ++- ode/impl/KokkosODE_RungeKutta_impl.hpp | 139 ++++-- ode/src/KokkosODE_RungeKutta.hpp | 141 ++++++ ode/src/KokkosODE_Types.hpp | 56 +++ ode/unit_test/Test_ODE_RK.hpp | 373 ++++++++++------ ode/unit_test/Test_ODE_RK_chem.hpp | 117 +++-- .../KokkosKernels_perf_test_utilities.hpp | 4 + perf_test/ode/CMakeLists.txt | 5 - perf_test/ode/KokkosODE_RK.cpp | 420 ++++++++++-------- 9 files changed, 900 insertions(+), 424 deletions(-) create mode 100644 ode/src/KokkosODE_RungeKutta.hpp create mode 100644 ode/src/KokkosODE_Types.hpp diff --git a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp index 3458bdf7b0..85a8ec0b45 100644 --- a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp +++ b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp @@ -26,15 +26,16 @@ namespace Impl { //===================================================================== // Methods supported: +// Forward Euler (RKFE) // Euler-Heun Method (RKEH) -// Fehlberg 1-2 (RK12) -// Bogacki-Shampine (BS) +// Fehlberg 1-2 (RKF12) +// Bogacki-Shampine (RKBS) +// Runge-Kutta 4th order (RK4) // Fehlberg Method (RKF45) -// Cash-Karp Method (CashKarp) -// Dormand-Prince Method (DormandPrince) +// Cash-Karp Method (RKCK) +// Dormand-Prince Method (RKDP) // Format follows form of Butcher Tableau - // c1| a00 // c2| a10 a11 // c3| a20 a21 a22 @@ -57,19 +58,25 @@ namespace Impl { // e contains coefficient for error estimation template -struct ButcherTableau { }; +struct ButcherTableau {}; template <> -struct ButcherTableau<0, 0> // Forward Euler +struct ButcherTableau<0, 0> // Forward Euler { static constexpr int order = 1; static constexpr int nstages = 1; - Kokkos::Arraya{{1}}; + Kokkos::Array a{{1}}; Kokkos::Array b{{1}}; Kokkos::Array c{{0}}; + Kokkos::Array e{{0}}; }; +// Coefficients obtained from: (see page 39) +// Iserles, A. +// A First Course in the Numerical Analysis of Differential Equations." 
+// Cambridge: Cambridge University Press. (2008). +// https://doi:10.1017/CBO9780511995569 template <> struct ButcherTableau<1, 1> // Euler-Heun Method { @@ -83,6 +90,11 @@ struct ButcherTableau<1, 1> // Euler-Heun Method Kokkos::Array e{{-0.5, 0.5}}; }; +// Coefficients obtained from: +// Fehlberg, E. +// "Klassische Runge-Kutta-Formeln vierter und niedrigerer Ordnung mit +// Schrittweiten-Kontrolle und ihre Anwendung auf Wärmeleitungsprobleme." +// Computing 6, 61–71 (1970). https://doi.org/10.1007/BF02241732 template <> struct ButcherTableau<1, 2> // Known as Fehlberg 1-2 method { @@ -96,6 +108,11 @@ struct ButcherTableau<1, 2> // Known as Fehlberg 1-2 method {1.0 / 256.0 - 1.0 / 512.0, 0.0, -1.0 / 512.0}}; }; +// Coefficients obtained from: +// P. Bogacki, L.F. Shampine, +// "A 3(2) pair of Runge - Kutta formulas," +// Applied Mathematics Letters, Volume 2, Issue 4, 1989, +// https://doi.org/10.1016/0893-9659(89)90079-7. template <> struct ButcherTableau<2, 3> // Bogacki-Shampine method { @@ -110,6 +127,28 @@ struct ButcherTableau<2, 3> // Bogacki-Shampine method 4.0 / 9.0 - 1.0 / 3.0, -1.0 / 8.0}}; }; +// Coefficients obtained from: +// Hull, David G. +// "Fourth-order Runge-Kutta integration with stepsize control." +// AIAA Journal 15.10 (1977): 1505-1507. +template <> +struct ButcherTableau<3, 3> // RK4 +{ + static constexpr int order = 4; + static constexpr int nstages = 4; + Kokkos::Array a{ + {0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 1.0, 0.0}}; + Kokkos::Array b{ + {1.0 / 6.0, 1.0 / 3.0, 1.0 / 3.0, 1.0 / 6.0}}; + Kokkos::Array c{{0.0, 0.5, 0.5, 1.0}}; + Kokkos::Array e{{1.0 / 6.0, 0.0, -1.0 / 3.0, 1.0 / 6.0}}; +}; + +// Coefficients obtained from: +// Fehlberg, E. +// "Klassische Runge-Kutta-Formeln vierter und niedrigerer Ordnung mit +// Schrittweiten-Kontrolle und ihre Anwendung auf Wärmeleitungsprobleme." +// Computing 6, 61–71 (1970). 
https://doi.org/10.1007/BF02241732 template <> struct ButcherTableau<4, 5> // Fehlberg Method { @@ -146,6 +185,11 @@ struct ButcherTableau<4, 5> // Fehlberg Method 28561.0 / 56430.0 - 2197.0 / 4104.0, -9.0 / 50.0 + 0.2, 2.0 / 55.0}}; }; +// Coefficients obtained from: +// J. R. Cash and Alan H. Karp. +// "A variable order Runge-Kutta method for initial value problems with rapidly +// varying right-hand sides." ACM Trans. Math. Softw. 16, 3 (Sept. 1990), +// 201–222. https://doi.org/10.1145/79505.79507 template <> struct ButcherTableau<4, 5, 1> // Cash-Karp { @@ -182,8 +226,13 @@ struct ButcherTableau<4, 5, 1> // Cash-Karp -277.0 / 14336.0, 512.0 / 1771.0 - 0.25}}; }; +// Coefficients obtained from: +// J.R. Dormand, P.J. Prince, +// "A family of embedded Runge-Kutta formulae", +// Journal of Computational and Applied Mathematics, Volume 6, Issue 1, 1980, +// https://doi.org/10.1016/0771-050X(80)90013-3. template <> -struct ButcherTableau<4, 6> // Referred to as DOPRI5 or RKDP +struct ButcherTableau<4, 6> // Referred to as DOPRI5 or RKDP { static constexpr int order = 5; static constexpr int nstages = 7; @@ -228,4 +277,4 @@ struct ButcherTableau<4, 6> // Referred to as DOPRI5 or RKDP } // namespace Impl } // namespace KokkosODE -#endif // KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP +#endif // KOKKOSBLAS_RUNGEKUTTATABLES_IMPL_HPP diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp index 6b9b09c9f7..791093c8db 100644 --- a/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -20,6 +20,8 @@ #include "Kokkos_Core.hpp" #include "KokkosBlas1_scal.hpp" #include "KokkosBlas1_axpby.hpp" +#include "KokkosODE_RungeKuttaTables_impl.hpp" +#include "KokkosODE_Types.hpp" namespace KokkosODE { namespace Impl { @@ -28,65 +30,119 @@ namespace Impl { // k_i = f(t+c_i*dt, y_old+sum(a_{ij}*k_i)) j in [1, i-1] // we need to compute the k_i and store them as we go // to use them for k_{i+1} computation. 
-template -KOKKOS_FUNCTION -void RKStep(ode_type& ode, const table_type& table, scalar_type t, scalar_type dt, - const vec_type& y_old, const vec_type& y_new, const vec_type& temp, const mv_type& k_vecs) { +template +KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, + const bool adaptivity, scalar_type t, + scalar_type dt, const vec_type& y_old, + const vec_type& y_new, const vec_type& temp, + const mv_type& k_vecs) { const int neqs = ode.neqs; const int nstages = table.nstages; // first set y_new = y_old - for(int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { - y_new(eqIdx) = y_old(eqIdx); + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y_new(eqIdx) = y_old(eqIdx); } // now accumulate y_new += dt*b_i*k_i { // we always start with y_new += dt*b_0*k0 auto k0 = Kokkos::subview(k_vecs, Kokkos::ALL, 0); - ode.evaluate_function(t + table.c[0]*dt, dt, y_old, k0); - for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) { - y_new(eqIdx) += dt*table.b[0]*k0(eqIdx); + ode.evaluate_function(t + table.c[0] * dt, dt, y_old, k0); + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + y_new(eqIdx) += dt * table.b[0] * k0(eqIdx); } } // Now that we have k0, we can compute all other k_i // and accumulate them in y_new. 
- for(int stageIdx = 1; stageIdx < nstages; ++stageIdx) { - for(int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { - temp(eqIdx) = 0; + for (int stageIdx = 1; stageIdx < nstages; ++stageIdx) { + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + temp(eqIdx) = 0; } - for(int idx = 0; idx < stageIdx; ++idx) { - for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) { - temp(eqIdx) += table.a[stageIdx*(stageIdx + 1)/2 + idx]*k_vecs(eqIdx, idx); + for (int idx = 0; idx < stageIdx; ++idx) { + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + temp(eqIdx) += + table.a[stageIdx * (stageIdx + 1) / 2 + idx] * k_vecs(eqIdx, idx); } } KokkosBlas::SerialScale::invoke(dt, temp); KokkosBlas::serial_axpy(1, y_old, temp); auto k = Kokkos::subview(k_vecs, Kokkos::ALL, stageIdx); - ode.evaluate_function(t + table.c[stageIdx]*dt, dt, temp, k); - for(int eqIdx = 0; eqIdx < neqs; ++eqIdx) { - y_new(eqIdx) += dt*table.b[stageIdx]*k(eqIdx); + ode.evaluate_function(t + table.c[stageIdx] * dt, dt, temp, k); + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + y_new(eqIdx) += dt * table.b[stageIdx] * k(eqIdx); } } -} // RKStep + // Compute estimation of the error using k_vecs and table.e + if (adaptivity == true) { + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + temp(eqIdx) = 0; + for (int stageIdx = 0; stageIdx < nstages; ++stageIdx) { + temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(eqIdx, stageIdx); + } + } + } +} // RKStep + +template +KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve( + const ode_type& ode, const table_type& table, + const KokkosODE::Experimental::ODE_params& params, + const scalar_type t_start, const scalar_type t_end, const vec_type& y0, + const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { + constexpr scalar_type error_threshold = 1; + bool adapt = params.adaptivity; + if (std::is_same_v>) { + adapt = false; + } + + scalar_type dt = (t_end - t_start) / params.max_steps; + scalar_type t = t_start; + for (int stepIdx = 0; (stepIdx < params.max_steps) && (t < t_end); + 
++stepIdx) { + // Set err to be arbitrarily larger than our threshold of 1 + scalar_type error = 2 * error_threshold; + scalar_type tol = 0; + while (error_threshold < error) { + // Take a step of Runge-Kutta integrator + RKStep(ode, table, adapt, t, dt, y0, y, temp, k_vecs); -template -KOKKOS_FUNCTION -void RKSolve(const ode_type& ode, const table_type& table, const scalar_type t_start, - const scalar_type t_end, const scalar_type dt, const int max_steps, - const vec_type& y0, const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { + // Compute the largest error and decide on + // the size of the next time step to take. + error = 0; + if (adapt) { + // Compute the error + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + error = Kokkos::max(error, Kokkos::abs(temp(eqIdx))); + tol = Kokkos::max( + tol, params.abs_tol + + params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), + Kokkos::abs(y0(eqIdx)))); + } + error = error / tol; - scalar_type t = t_start; - for(int stepIdx = 0; (stepIdx < max_steps) && (t < t_end); ++stepIdx) { - RKStep(ode, table, t, dt, y0, y, temp, k_vecs); - for(int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + // Reduce the time step if error + // is too large and current step + // is rejected. + if (error > 1) { + dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); + } + if (dt < params.min_step_size) + return Experimental::ode_solver_status::MIN_SIZE; + } + } + + // Update y0 to stage the next time step. 
+ for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y0(eqIdx) = y(eqIdx); } - if(t < t_end) { + if (t < t_end) { // We may want to print the evolution of the solution over time // with something similar to the statement below but will need // to generalize it and make it GPU friendly first, also it @@ -97,14 +153,27 @@ void RKSolve(const ode_type& ode, const table_type& table, const scalar_type t_s // std::cout << y(eqIdx) << " "; // } // std::cout << "}" << std::endl; - t += dt; + if (adapt) { + // Compute new time increment + dt = dt * + Kokkos::min( + 10.0, + Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); + } else { + // Use same increment + t += dt; + } } else { - break; + return Experimental::ode_solver_status::SUCCESS; } } -} // RKSolve -} // namespace Impl -} // namespace KokkosODE + if (t < t_end) return Experimental::ode_solver_status::MAX_STEP; + + return Experimental::ode_solver_status::SUCCESS; +} // RKSolve + +} // namespace Impl +} // namespace KokkosODE -#endif // KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP +#endif // KOKKOSBLAS_RUNGEKUTTA_IMPL_HPP diff --git a/ode/src/KokkosODE_RungeKutta.hpp b/ode/src/KokkosODE_RungeKutta.hpp new file mode 100644 index 0000000000..c41d79c1ef --- /dev/null +++ b/ode/src/KokkosODE_RungeKutta.hpp @@ -0,0 +1,141 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_RUNGEKUTTA_HPP +#define KOKKOSODE_RUNGEKUTTA_HPP + +/// \author Luc Berger-Vergiat (lberg@sandia.gov) +/// \file KokkosODE_RungeKutta.hpp + +#include "Kokkos_Core.hpp" +#include "KokkosODE_Types.hpp" + +#include "KokkosODE_RungeKutta_impl.hpp" + +namespace KokkosODE { +namespace Experimental { + +/// \brief RK_type is an enum tye that conveniently +/// describes the Runge-Kutta methods implemented. +enum RK_type : int { + RKFE = 0, ///< Forward Euler method (no adaptivity available for this method) + RKEH = 1, ///< Euler-Heun method + RKF12 = 2, ///< Fehlberg order 2 method + RKBS = 3, ///< Bogacki-Shampine method + RK4 = 4, ///< Runge-Kutta classic order 4 method + RKF45 = 5, ///< Fehlberg order 5 method + RKCK = 6, ///< Cash-Karp method + RKDP = 7 ///< Dormand-Prince method +}; + +template +struct RK_Tableau_helper { + using table_type = void; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<0, 0>; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<1, 1>; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<1, 2>; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<2, 3>; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<3, 3>; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<4, 5>; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>; +}; + +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<4, 6>; +}; + +/// \brief Unspecialized version of the RungeKutta solvers +/// +/// \tparam RK_type an RK_type enum value used to specify +/// which Runge Kutta method is to be used. 
+template +struct RungeKutta { + using table_type = typename RK_Tableau_helper::table_type; + + /// \brief order returns the convergence order of the method + KOKKOS_FUNCTION + static int order() { return table_type::order; } + + /// \brief num_stages returns the number of stages used by the method + KOKKOS_FUNCTION + static int num_stages() { return table_type::nstages; } + + /// \brief Solve integrates an ordinary differential equation + /// + /// The integration is carried with the method specified as template + /// parameter to the RungeKutta struct. This method is static and + /// marked as KOKKOS_FUNCTION so it can be used on host and device. + /// + /// \tparam ode_type the type of the ode object to integrated + /// \tparam vec_type a rank-1 view + /// \tparam mv_type a rank-2 view + /// \tparam scalar_type a floating point type + /// + /// \param ode [in]: the ode to integrate + /// \param params [in]: standard input parameters of ODE integrators + /// \param t_start [in]: time at which the integration starts + /// \param t_end [in]: time at which the integration stops + /// \param y0 [in/out]: vector of initial conditions, set to the solution + /// at the end of the integration + /// \param y [out]: vector of solution at t_end + /// \param temp [in]: vector for temporary storage + /// \param k_vecs [in]: vectors for temporary storage + /// + /// \return ode_solver_status an enum that describes success of failure + /// of the integration method once it at terminated. 
+ template + KOKKOS_FUNCTION static ode_solver_status Solve( + const ode_type& ode, const KokkosODE::Experimental::ODE_params& params, + const scalar_type t_start, const scalar_type t_end, const vec_type& y0, + const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { + table_type table; + return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, + temp, k_vecs); + } +}; + +} // namespace Experimental +} // namespace KokkosODE +#endif // KOKKOSODE_RUNGEKUTTA_HPP diff --git a/ode/src/KokkosODE_Types.hpp b/ode/src/KokkosODE_Types.hpp new file mode 100644 index 0000000000..136ff75536 --- /dev/null +++ b/ode/src/KokkosODE_Types.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_TYPES_HPP +#define KOKKOSODE_TYPES_HPP + +namespace KokkosODE { +namespace Experimental { + +enum ode_solver_status { SUCCESS = 0, MAX_STEP = 1, MIN_SIZE = 2 }; + +struct ODE_params { + bool adaptivity; + int num_steps, max_steps; + double abs_tol, rel_tol, min_step_size; + + // Constructor that only specify the desired number of steps. + // In this case no adaptivity is provided, the time step will + // be constant such that dt = (tend - tstart) / num_steps; + KOKKOS_FUNCTION + ODE_params(const int num_steps_) + : adaptivity(false), + num_steps(num_steps_), + max_steps(num_steps_), + abs_tol(0), + rel_tol(0), + min_step_size(0) {} + + /// ODE_parms construtor for adaptive time stepping. 
+ KOKKOS_FUNCTION + ODE_params(const int num_steps_, const int max_steps_, const double abs_tol_, + const double rel_tol_, const double min_step_size_) + : adaptivity(true), + num_steps(num_steps_), + max_steps(max_steps_), + abs_tol(abs_tol_), + rel_tol(rel_tol_), + min_step_size(min_step_size_) {} +}; + +} // namespace Experimental +} // namespace KokkosODE +#endif // KOKKOSODE_TYPES_HPP diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 4043130c3d..1e851108f3 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ b/ode/unit_test/Test_ODE_RK.hpp @@ -17,107 +17,124 @@ #include #include "KokkosKernels_TestUtils.hpp" -#include "KokkosODE_RungeKutta_impl.hpp" -#include "KokkosODE_RungeKuttaTables_impl.hpp" +#include "KokkosODE_RungeKutta.hpp" namespace Test { // damped harmonic undriven oscillator // m y'' + c y' + k y = 0 -// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + phi) -// omega_0 = sqrt(k/m); xi = c / sqrt(4*m*k) -// A and phi depend on y(0) and y'(0); -// Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * omega_0 * t) -// Change of variables: X = [x ] +// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + +// phi) omega_0 = sqrt(k/m); xi = c / sqrt(4*m*k) A and phi depend on y(0) and +// y'(0); Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * +// omega_0 * t) Change of variables: X = [x ] // [x'] // Leads to X' = A*X with A = [ 0 1] // [-d 0] // with d = k/m - (c/(2m)^2) = (1 - xi^2)*omega_0^2 struct duho { - constexpr static int neqs = 2; const double m, c, k, d; const double a11 = 0, a12 = 1, a21, a22; - duho(const double m_, const double c_, const double k_) : m(m_), c(c_), k(k_), d(k_ / m_ - (c_*c_) / (4*m_*m_)), a21(-k / m), a22(-c / m) {}; + duho(const double m_, const double c_, const double k_) + : m(m_), + c(c_), + k(k_), + d(k_ / m_ - (c_ * c_) / (4 * m_ * m_)), + a21(-k / m), + a22(-c / m){}; template - KOKKOS_FUNCTION - void 
evaluate_function(const double /*t*/, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { - f(0) = a11*y(0) + a12*y(1); - f(1) = a21*y(0) + a22*y(1); + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = a11 * y(0) + a12 * y(1); + f(1) = a21 * y(0) + a22 * y(1); } template - KOKKOS_FUNCTION - void solution(const double t, const vec_type& y0, const vec_type& y) const { + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, + const vec_type& y) const { using KAT = Kokkos::ArithTraits; - const double gamma = c / (2 * m); - const double omega = KAT::sqrt(k / m - gamma * gamma); - const double phi = KAT::atan((y0(1) + gamma * y0(0)) / (y0(0) * omega)); - const double A = y0(0) / KAT::cos(phi); + const double gamma = c / (2 * m); + const double omega = KAT::sqrt(k / m - gamma * gamma); + const double phi = KAT::atan((y0(1) + gamma * y0(0)) / (y0(0) * omega)); + const double A = y0(0) / KAT::cos(phi); y(0) = A * KAT::cos(omega * t - phi) * KAT::exp(-t * gamma); - y(1) = -y(0) * gamma - omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma); + y(1) = -y(0) * gamma - + omega * A * KAT::sin(omega * t - phi) * KAT::exp(-t * gamma); } -}; // duho +}; // duho template -struct solution_wrapper{ - +struct solution_wrapper { ode_type ode; scalar_type t; vec_type y_old, y_ref; - solution_wrapper(const ode_type& ode_, const scalar_type t_, const vec_type& y_old_, const vec_type& y_ref_) - : ode(ode_), t(t_), y_old(y_old_), y_ref(y_ref_) {}; + solution_wrapper(const ode_type& ode_, const scalar_type t_, + const vec_type& y_old_, const vec_type& y_ref_) + : ode(ode_), t(t_), y_old(y_old_), y_ref(y_ref_){}; KOKKOS_FUNCTION - void operator() (const int /*idx*/) const { - ode.solution(t, y_old, y_ref); - } + void operator()(const int /*idx*/) const { ode.solution(t, y_old, y_ref); } }; -template +template struct RKSolve_wrapper { + using ode_params = 
KokkosODE::Experimental::ODE_params; ode_type my_ode; - table_type table; - scalar_type tstart, tend, dt; + ode_params params; + scalar_type tstart, tend; int max_steps; vec_type y_old, y_new, tmp; mv_type kstack; - RKSolve_wrapper(const ode_type& my_ode_, const table_type& table_, - const scalar_type tstart_, const scalar_type tend_, const scalar_type dt_, - const int max_steps_, const vec_type& y_old_, const vec_type& y_new_, - const vec_type& tmp_, const mv_type& kstack_) : - my_ode(my_ode_), table(table_), tstart(tstart_), tend(tend_), dt(dt_), max_steps(max_steps_), - y_old(y_old_), y_new(y_new_), tmp(tmp_), kstack(kstack_) {} + RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, + const scalar_type tstart_, const scalar_type tend_, + const vec_type& y_old_, const vec_type& y_new_, + const vec_type& tmp_, const mv_type& kstack_) + : my_ode(my_ode_), + params(params_), + tstart(tstart_), + tend(tend_), + y_old(y_old_), + y_new(y_new_), + tmp(tmp_), + kstack(kstack_) {} KOKKOS_FUNCTION - void operator() (const int /*idx*/) const { - KokkosODE::Impl::RKSolve(my_ode, table, tstart, tend, dt, max_steps, y_old, y_new, tmp, kstack); + void operator()(const int /*idx*/) const { + KokkosODE::Experimental::RungeKutta::Solve( + my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); } }; -template +template void test_method(const std::string label, ode_type& my_ode, - const scalar_type& tstart, const scalar_type& tend, scalar_type& dt, - const int max_steps, vec_type& y_old, vec_type& y_new, - const Kokkos::View& ks, - const Kokkos::View& sol, - typename vec_type::HostMirror y_ref_h) { + const scalar_type& tstart, const scalar_type& tend, + const int num_steps, vec_type& y_old, vec_type& y_new, + const int order, const int num_stages, + const Kokkos::View& ks, + const Kokkos::View& sol, + typename vec_type::HostMirror y_ref_h) { using execution_space = typename vec_type::execution_space; + using solver_type = KokkosODE::Experimental::RungeKutta; - 
table_type table; + KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", my_ode.neqs); - mv_type kstack("k stack", my_ode.neqs, table.nstages); + mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper(my_ode, table, tstart, tend, dt, max_steps, y_old, y_new, tmp, kstack); + RKSolve_wrapper + solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror_view(y_new); @@ -125,42 +142,56 @@ void test_method(const std::string label, ode_type& my_ode, auto kstack_h = Kokkos::create_mirror_view(kstack); Kokkos::deep_copy(kstack_h, kstack); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + EXPECT_EQ(solver_type::order(), order); + EXPECT_EQ(solver_type::num_stages(), num_stages); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\n" << label << std::endl; + std::cout << " order: " << solver_type::order() << std::endl; + std::cout << " number of stages: " << solver_type::num_stages() << std::endl; +#else + (void)label; #endif - for(int stageIdx = 0; stageIdx < table.nstages; ++stageIdx) { + for (int stageIdx = 0; stageIdx < solver_type::num_stages(); ++stageIdx) { EXPECT_NEAR_KK(ks(0, stageIdx), kstack_h(0, stageIdx), 1e-8); EXPECT_NEAR_KK(ks(1, stageIdx), kstack_h(1, stageIdx), 1e-8); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - std::cout << " k" << stageIdx << "={" << kstack_h(0, stageIdx) << ", " << kstack_h(1, stageIdx) << "}" << std::endl; +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << " k" << stageIdx << "={" << kstack_h(0, stageIdx) << ", " + << kstack_h(1, stageIdx) << "}" << std::endl; #endif } EXPECT_NEAR_KK(sol(0), y_new_h(0), 1e-8); EXPECT_NEAR_KK(sol(1), y_new_h(1), 1e-8); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << " y={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; - std::cout << " error={" << 
Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)) - << ", " << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) << "}" << std::endl; + std::cout << " error={" + << Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)) + << ", " + << Kokkos::abs(y_new_h(1) - y_ref_h(1)) / Kokkos::abs(y_ref_h(1)) + << "}" << std::endl; +#else + (void)y_ref_h; #endif -} // test_method +} // test_method template void test_RK() { - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; duho my_oscillator(1, 1, 4); - const int neqs = my_oscillator.neqs; - + const int neqs = my_oscillator.neqs; + vec_type y("solution", neqs), f("function", neqs); auto y_h = Kokkos::create_mirror(y); - y_h(0) = 1; y_h(1) = 0; + y_h(0) = 1; + y_h(1) = 0; Kokkos::deep_copy(y, y_h); - constexpr double tstart = 0, tend = 10; - constexpr int max_steps = 1000; - double dt = (tend - tstart) / max_steps; + constexpr double tstart = 0, tend = 0.01; + constexpr int num_steps = 1000; + double dt = (tend - tstart) / num_steps; vec_type y_new("y new", neqs), y_old("y old", neqs); // Since y_old_h will be reused to set initial conditions @@ -168,7 +199,8 @@ void test_RK() { // create_mirror_view which would not do a copy // when y_old is in HostSpace. typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); - y_old_h(0) = 1; y_old_h(1) = 0; + y_old_h(0) = 1; + y_old_h(1) = 0; // First compute analytical solution as reference // and to evaluate the error from each RK method. 
@@ -181,9 +213,10 @@ void test_RK() { Kokkos::parallel_for(my_policy, wrapper); Kokkos::deep_copy(y_ref_h, y_ref); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" + << std::endl; #endif } @@ -196,112 +229,151 @@ void test_RK() { Kokkos::View ks(ks_raw, 2, 1); double sol_raw[2] = {1, -0.04}; Kokkos::View sol(sol_raw, 2); - test_method, vec_type, mv_type, double>("Euler-Forward", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h); + test_method( + "Euler-Forward", my_oscillator, tstart, tend, 1, y_old, y_new, 1, 1, ks, + sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[4] = {0, -0.04, - -4, -3.96}; + double ks_raw[4] = {0, -0.04, -4, -3.96}; Kokkos::View ks(ks_raw, 2, 2); double sol_raw[2] = {0.9998, -0.0398}; Kokkos::View sol(sol_raw, 2); - test_method, vec_type, mv_type, double>("Euler-Heun", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h); + test_method( + "Euler-Heun", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 2, ks, + sol, y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[6] = {0, -0.02, -0.03980078, - -4, -3.98, -3.95940234}; + double ks_raw[6] = {0, -0.02, -0.03980078, -4, -3.98, -3.95940234}; Kokkos::View ks(ks_raw, 2, 3); double sol_raw[2] = {0.9998, -0.03979999}; Kokkos::View sol(sol_raw, 2); - test_method, vec_type, mv_type, double>("RKF-12", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h); + test_method( + "RKF-12", my_oscillator, tstart, tend, 1, y_old, y_new, 2, 3, ks, sol, + y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[8] = {0, -0.02, -0.02985, -0.039798, - -4, -3.98, -3.96955, -3.95940467}; + double ks_raw[8] = {0, -0.02, -0.02985, -0.039798, + -4, -3.98, -3.96955, -3.95940467}; Kokkos::View ks(ks_raw, 2, 4); 
double sol_raw[2] = {0.99980067, -0.039798}; Kokkos::View sol(sol_raw, 2); - test_method, vec_type, mv_type, double>("RKBS", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h); + test_method( + "RKBS", my_oscillator, tstart, tend, 1, y_old, y_new, 3, 4, ks, sol, + y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, -0.03979499, -0.0199505, - -4, -3.99, -3.98491562, -3.96257222, -3.95941166, -3.97984883}; + double ks_raw[12] = {0, -0.01, -0.01497188, -0.03674986, + -0.03979499, -0.0199505, -4, -3.99, + -3.98491562, -3.96257222, -3.95941166, -3.97984883}; Kokkos::View ks(ks_raw, 2, 6); - double sol_raw[2] = { 0.99980067, -0.03979801}; + double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method, vec_type, mv_type, double>("RKF-45", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h); + test_method( + "RKF-45", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, sol, + y_ref_h); } { Kokkos::deep_copy(y_old, y_old_h); - double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, -0.03979862, -0.03484563, - -4, -3.992, -3.987946, -3.97578551, -3.95940328, -3.96454357}; + double ks_raw[12] = {0, -0.008, -0.011982, -0.02392735, + -0.03979862, -0.03484563, -4, -3.992, + -3.987946, -3.97578551, -3.95940328, -3.96454357}; Kokkos::View ks(ks_raw, 2, 6); - double sol_raw[2] = { 0.99980067, -0.03979801}; + double sol_raw[2] = {0.99980067, -0.03979801}; Kokkos::View sol(sol_raw, 2); - test_method, vec_type, mv_type, double>("Cash-Karp", my_oscillator, tstart, tend, dt, 1, y_old, y_new, ks, sol, y_ref_h); + test_method( + "Cash-Karp", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 6, ks, + sol, y_ref_h); } -} // test_RK + { + Kokkos::deep_copy(y_old, y_old_h); + double ks_raw[14] = {0, -0.008, -0.011982, -0.03187008, + -0.03539333, -0.0397954, -0.03979801, -4, + -3.992, -3.987946, -3.96762048, -3.96398013, + -3.95941068, -3.95940467}; + Kokkos::View 
ks(ks_raw, 2, 7); + double sol_raw[2] = {0.99980067, -0.03979801}; + Kokkos::View sol(sol_raw, 2); + test_method( + "Dormand-Prince", my_oscillator, tstart, tend, 1, y_old, y_new, 5, 7, + ks, sol, y_ref_h); + } + +} // test_RK -template -void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& tend, - Kokkos::View dt, const int max_steps, - typename vec_type::HostMirror& y_old_h, typename vec_type::HostMirror& y_ref_h, - typename vec_type::HostMirror& error) { +template +void test_rate(ode_type& my_ode, const scalar_type& tstart, + const scalar_type& tend, + Kokkos::View num_steps, + typename vec_type::HostMirror& y_old_h, + typename vec_type::HostMirror& y_ref_h, + typename vec_type::HostMirror& error) { using execution_space = typename vec_type::execution_space; + using solver_type = KokkosODE::Experimental::RungeKutta; - table_type table; vec_type tmp("tmp vector", my_ode.neqs); - mv_type kstack("k stack", my_ode.neqs, table.nstages); + mv_type kstack("k stack", my_ode.neqs, solver_type::num_stages()); vec_type y_new("solution", my_ode.neqs); vec_type y_old("intial conditions", my_ode.neqs); auto y_new_h = Kokkos::create_mirror(y_new); Kokkos::RangePolicy my_policy(0, 1); - for(int idx = 0; idx < dt.extent_int(0); ++idx) { + for (int idx = 0; idx < num_steps.extent_int(0); ++idx) { + KokkosODE::Experimental::ODE_params params(num_steps(idx)); Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper solve_wrapper(my_ode, table, tstart, tend, dt(idx), max_steps, y_old, y_new, tmp, kstack); + RKSolve_wrapper + solve_wrapper(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(y_new_h, y_new); error(idx) = Kokkos::abs(y_new_h(0) - y_ref_h(0)) / Kokkos::abs(y_ref_h(0)); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - std::cout << "dt=" << dt(idx) << ", error=" << error(idx) - << ", solution: {" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; 
+#if defined(HAVE_KOKKOSKERNELS_DEBUG) + scalar_type dt = (tend - tstart) / num_steps(idx); + std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" + << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; #endif } -} // test_method +} // test_method -template +template void test_convergence_rate() { - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; + using RK_type = KokkosODE::Experimental::RK_type; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; duho my_oscillator(1, 1, 4); - const int neqs = my_oscillator.neqs; - + const int neqs = my_oscillator.neqs; + vec_type y("solution", neqs), f("function", neqs); auto y_h = Kokkos::create_mirror(y); - y_h(0) = 1; y_h(1) = 0; + y_h(0) = 1; + y_h(1) = 0; Kokkos::deep_copy(y, y_h); constexpr double tstart = 0, tend = 1.024; - constexpr int max_steps = 1024; - Kokkos::View dt("Time Steps", 8); - dt(0) = 0.002; dt(1) = 0.004; dt(2) = 0.008; dt(3) = 0.016; - dt(4) = 0.032; dt(5) = 0.064; dt(6) = 0.128; dt(7) = 0.256; + Kokkos::View num_steps("Max Steps", 8); + num_steps(0) = 512; + num_steps(1) = 256; + num_steps(2) = 128; + num_steps(3) = 64; + num_steps(4) = 32; + num_steps(5) = 16; + num_steps(6) = 8; + num_steps(7) = 4; vec_type y_new("y new", neqs), y_old("y old", neqs); // Since y_old_h will be reused to set initial conditions @@ -309,7 +381,8 @@ void test_convergence_rate() { // create_mirror_view which would not do a copy // when y_old is in HostSpace. typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); - y_old_h(0) = 1; y_old_h(1) = 0; + y_old_h(0) = 1; + y_old_h(1) = 0; // First compute analytical solution as reference // and to evaluate the error from each RK method. 
@@ -322,62 +395,78 @@ void test_convergence_rate() { Kokkos::parallel_for(my_policy, wrapper); Kokkos::deep_copy(y_ref_h, y_ref); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if defined(HAVE_KOKKOSKERNELS_DEBUG) std::cout << "\nAnalytical solution" << std::endl; - std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" << std::endl; + std::cout << " y={" << y_ref_h(0) << ", " << y_ref_h(1) << "}" + << std::endl; #endif } - typename vec_type::HostMirror error("error", dt.extent(0)); - test_rate, vec_type, mv_type, double>(my_oscillator, tstart, tend, dt, max_steps, y_old_h, y_ref_h, error); + typename vec_type::HostMirror error("error", num_steps.extent(0)); + test_rate( + my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); - for(int idx = 1; idx < dt.extent_int(0) - 2; ++idx) { - double expected_ratio = Kokkos::pow(dt(idx + 1) / dt(idx), KokkosODE::Impl::ButcherTableau<1, 1>::order); - double actual_ratio = error(idx+1) / error(idx); + for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { + double expected_ratio = + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), + KokkosODE::Impl::ButcherTableau<1, 1>::order); + double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.15); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio - << ", rel diff: " << rel_ratio_diff << std::endl; +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / + Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio + << ", expected ratio: " << expected_ratio + << ", rel diff: " << rel_ratio_diff << std::endl; #endif } Kokkos::deep_copy(error, 0); - test_rate, vec_type, mv_type, double>(my_oscillator, tstart, tend, dt, max_steps, y_old_h, y_ref_h, error); - - for(int idx = 1; idx < 
dt.extent_int(0) - 2; ++idx) { - double expected_ratio = Kokkos::pow(dt(idx + 1) / dt(idx), KokkosODE::Impl::ButcherTableau<2, 3>::order); - double actual_ratio = error(idx+1) / error(idx); + test_rate( + my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + + for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { + double expected_ratio = + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), + KokkosODE::Impl::ButcherTableau<2, 3>::order); + double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio - << ", rel diff: " << rel_ratio_diff << std::endl; +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / + Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio + << ", expected ratio: " << expected_ratio + << ", rel diff: " << rel_ratio_diff << std::endl; #endif } Kokkos::deep_copy(error, 0); - test_rate, vec_type, mv_type, double>(my_oscillator, tstart, tend, dt, max_steps, y_old_h, y_ref_h, error); - - for(int idx = 1; idx < dt.extent_int(0) - 2; ++idx) { - double expected_ratio = Kokkos::pow(dt(idx + 1) / dt(idx), KokkosODE::Impl::ButcherTableau<4, 5>::order); - double actual_ratio = error(idx+1) / error(idx); + test_rate( + my_oscillator, tstart, tend, num_steps, y_old_h, y_ref_h, error); + + for (int idx = 1; idx < num_steps.extent_int(0) - 2; ++idx) { + double expected_ratio = + Kokkos::pow(num_steps(idx) / num_steps(idx + 1), + KokkosODE::Impl::ButcherTableau<4, 5>::order); + double actual_ratio = error(idx + 1) / error(idx); EXPECT_NEAR_KK_REL(actual_ratio, expected_ratio, 0.05); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / 
Kokkos::abs(expected_ratio); - std::cout << "error ratio: " << actual_ratio << ", expected ratio: " << expected_ratio - << ", rel diff: " << rel_ratio_diff << std::endl; +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + double rel_ratio_diff = Kokkos::abs(actual_ratio - expected_ratio) / + Kokkos::abs(expected_ratio); + std::cout << "error ratio: " << actual_ratio + << ", expected ratio: " << expected_ratio + << ", rel diff: " << rel_ratio_diff << std::endl; #endif } -} // test_convergence_rate -} // namespace Test +} // test_convergence_rate + +} // namespace Test int test_RK() { Test::test_RK(); - return 1; } diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index e3fee4b461..24ad69f3ac 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -14,27 +14,33 @@ // //@HEADER +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_RungeKutta.hpp" + namespace Test { // R1 = 1e-6*1.85e10 * exp(-15618 / T) * (reac) ( 1 – (1- 10^-9) reac) // d(reac)/dt = -R1 // d(prod)/dt = R1 struct chem_model_1 { - constexpr static int neqs = 2; // constexpr static double alpha = 1e-6*1.85e10; constexpr static double alpha = 1.85e10; constexpr static double beta = 15618; - constexpr static double gamma = 1 - 10^-9; + constexpr static double gamma = 1 - 10e-9; const double tstart, tend, T0, T1; chem_model_1(const double tstart_ = 0, const double tend_ = 100, - const double T0_ = 300, const double T1_ = 800) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {}; + const double T0_ = 300, const double T1_ = 800) + : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION - void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { // First compute the temperature // using linear ramp from T0 to T1 // between tstart 
and tend. @@ -44,25 +50,25 @@ struct chem_model_1 { f(0) = -alpha * Kokkos::exp(-beta / T) * y(0) * (1 - gamma * y(0)); f(1) = -f(0); } - }; struct chem_model_2 { - - constexpr static int neqs = 7; - constexpr static double alpha1 = 1e-6*3334169440721739.0*1500; + constexpr static int neqs = 7; + constexpr static double alpha1 = 1e-6 * 3334169440721739.0 * 1500; constexpr static double beta1 = 207850000.0 / 8314.0; - constexpr static double alpha2 = 1e-6*49997793980831.89*1500; + constexpr static double alpha2 = 1e-6 * 49997793980831.89 * 1500; constexpr static double beta2 = 207850000.0 / 8314.0; const double tstart, tend, T0, T1; chem_model_2(const double tstart_ = 0, const double tend_ = 1200, - const double T0_ = 300, const double T1_ = 1000) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {}; + const double T0_ = 300, const double T1_ = 1000) + : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION - void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { // First compute the temperature // using linear ramp from T0 to T1 // between tstart and tend. 
@@ -85,83 +91,102 @@ struct chem_model_2 { template void test_chem() { - using vec_type = Kokkos::View; - using mv_type = Kokkos::View; - using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using RK_type = KokkosODE::Experimental::RK_type; + using solver_type = KokkosODE::Experimental::RungeKutta; { chem_model_1 chem_model; - const int neqs = chem_model.neqs; - const int max_steps = 15000; - const double dt = 0.1; + const int neqs = chem_model.neqs; + const int num_steps = 15000; + const double dt = 0.1; - table_type table; + KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); - mv_type kstack("k stack", neqs, table.nstages); + mv_type kstack("k stack", neqs, solver_type::num_stages()); // Set initial conditions vec_type y_new("solution", neqs); vec_type y_old("initial conditions", neqs); auto y_old_h = Kokkos::create_mirror(y_old); - y_old_h(0) = 1; y_old_h(1) = 0; + y_old_h(0) = 1; + y_old_h(1) = 0; Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend, - dt, max_steps, y_old, y_new, tmp, kstack); + RKSolve_wrapper + solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, + y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); Kokkos::deep_copy(y_new_h, y_new); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; std::cout << "\nChem model 1" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend + << std::endl; + std::cout << " T0=" << 
chem_model.T0 << ", Tn=" << chem_model.T1 + << std::endl; std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" + << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" + << std::endl; #endif } { chem_model_2 chem_model; - const int neqs = chem_model.neqs; - const int max_steps = 1500; - const double dt = 1; + const int neqs = chem_model.neqs; + const int num_steps = 1500; - table_type table; + KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); - mv_type kstack("k stack", neqs, table.nstages); + mv_type kstack("k stack", neqs, solver_type::num_stages()); // Set initial conditions vec_type y_new("solution", neqs); vec_type y_old("initial conditions", neqs); auto y_old_h = Kokkos::create_mirror(y_old); - y_old_h(0) = 0.25; y_old_h(1) = 0.25; y_old_h(2) = 0; y_old_h(3) = 0; y_old_h(4) = 0; y_old_h(5) = 0; y_old_h(6) = 0; + y_old_h(0) = 0.25; + y_old_h(1) = 0.25; + y_old_h(2) = 0; + y_old_h(3) = 0; + y_old_h(4) = 0; + y_old_h(5) = 0; + y_old_h(6) = 0; Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend, - dt, max_steps, y_old, y_new, tmp, kstack); + RKSolve_wrapper + solve_wrapper(chem_model, params, chem_model.tstart, chem_model.tend, + y_old, y_new, tmp, kstack); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); Kokkos::deep_copy(y_new_h, y_new); -#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + const double dt = (chem_model.tend - chem_model.tstart) / params.num_steps; std::cout << "\nChem model 2" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << 
chem_model.tend << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend + << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 + << std::endl; std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " << y_old_h(2) - << ", " << y_old_h(3) << ", " << y_old_h(4) << ", " << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " << y_new_h(2) - << ", " << y_new_h(3) << ", " << y_new_h(4) << ", " << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << ", " + << y_old_h(2) << ", " << y_old_h(3) << ", " << y_old_h(4) << ", " + << y_old_h(5) << ", " << y_old_h(6) << "}" << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << ", " + << y_new_h(2) << ", " << y_new_h(3) << ", " << y_new_h(4) << ", " + << y_new_h(5) << ", " << y_new_h(6) << "}" << std::endl; #endif } -} // test_chem -} // namespace Test +} // test_chem +} // namespace Test int test_chem_models() { Test::test_chem(); diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index cc7f70ccec..0ebfd20578 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -30,6 +30,8 @@ struct CommonInputParams { int use_sycl = 0; int use_openmp = 0; int use_threads = 0; + + int repeat = 0; }; std::string list_common_options() { @@ -130,6 +132,8 @@ void parse_common_options(int& argc, char** argv, CommonInputParams& params) { } else if (check_arg_int(argIdx, argc, argv, "--sycl", params.use_sycl)) { params.use_sycl++; remove_flag = true; + } else if (check_arg_int(argIdx, argc, argv, "--repeat", params.repeat)) { + remove_flag = true; } if (remove_flag) { diff --git 
a/perf_test/ode/CMakeLists.txt b/perf_test/ode/CMakeLists.txt index 24ddf592bc..67d0c421fb 100644 --- a/perf_test/ode/CMakeLists.txt +++ b/perf_test/ode/CMakeLists.txt @@ -20,9 +20,4 @@ if(KOKKOSKERNELS_ENABLE_BENCHMARK) NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} ) -else() - KOKKOSKERNELS_ADD_EXECUTABLE( - ${PACKAGE_NAME}_ode_runge_kutta - SOURCES KokkosODE_RK.cpp - ) endif() diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp index 987cd0610c..617e7453cd 100644 --- a/perf_test/ode/KokkosODE_RK.cpp +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -14,37 +14,35 @@ // //@HEADER -#include "KokkosODE_RungeKuttaTables_impl.hpp" -#include "KokkosODE_RungeKutta_impl.hpp" +#include "KokkosODE_RungeKutta.hpp" + #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" -#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK #include #include "Benchmark_Context.hpp" -#else -#include "KokkosKernels_perf_test_utilities.hpp" -#endif namespace { // R1 = 1e-6*1.85e10 * exp(-15618 / T) * (reac) ( 1 – (1- 10^-9) reac) // d(reac)/dt = -R1 // d(prod)/dt = R1 struct chem_model_1 { - constexpr static int neqs = 2; // constexpr static double alpha = 1e-6*1.85e10; constexpr static double alpha = 1.85e10; constexpr static double beta = 15618; - constexpr static double gamma = 1 - 10^-9; + constexpr static double gamma = 1 - 10 ^ -9; const double tstart, tend, T0, T1; chem_model_1(const double tstart_ = 0, const double tend_ = 300, - const double T0_ = 300, const double T1_ = 800) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {}; + const double T0_ = 300, const double T1_ = 800) + : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION - void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { // First compute the temperature // using linear ramp from 
T0 to T1 // between tstart and tend. @@ -54,32 +52,34 @@ struct chem_model_1 { f(0) = -alpha * Kokkos::exp(-beta / T) * y(0) * (1 - gamma * y(0)); f(1) = -f(0); } - }; // More complex chemical reaction involving two reacting // species foam A and foam B, that become 5 products. // The temperature is capped at 1000K once t reaches 1500s struct chem_model_2 { - - constexpr static int neqs = 7; - constexpr static double alpha1 = 1e-6*3334169440721739.0*1500; + constexpr static int neqs = 7; + constexpr static double alpha1 = 1e-6 * 3334169440721739.0 * 1500; constexpr static double beta1 = 207850000.0 / 8314.0; - constexpr static double alpha2 = 1e-6*49997793980831.89*1500; + constexpr static double alpha2 = 1e-6 * 49997793980831.89 * 1500; constexpr static double beta2 = 207850000.0 / 8314.0; const double tstart, tend, T0, T1; chem_model_2(const double tstart_ = 0, const double tend_ = 2000, - const double T0_ = 300, const double T1_ = 1000) : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_) {}; + const double T0_ = 300, const double T1_ = 1000) + : tstart(tstart_), tend(tend_), T0(T0_), T1(T1_){}; template - KOKKOS_FUNCTION - void evaluate_function(const double t, const double /*dt*/, const vec_type1& y, const vec_type2& f) const { + KOKKOS_FUNCTION void evaluate_function(const double t, const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { // First compute the temperature // using linear ramp from T0 to T1 // between tstart and tend. - double T = ((T1 - T0) * (t - tstart) / (1500 - tstart) + T0 < 1000) ? (T1 - T0) * (t - tstart) / (1500 - tstart) + T0 : 1000; + double T = ((T1 - T0) * (t - tstart) / (1500 - tstart) + T0 < 1000) + ? 
(T1 - T0) * (t - tstart) / (1500 - tstart) + T0 + : 1000; // Evaluate the chemical reaction rates double R1 = y(0) * alpha1 * Kokkos::exp(-beta1 / T); @@ -94,194 +94,239 @@ struct chem_model_2 { f(5) = 0.187 * R2; f(6) = 0.133 * R2; } - }; -template +template struct RKSolve_wrapper { + using ode_params = KokkosODE::Experimental::ODE_params; ode_type my_ode; table_type table; - scalar_type tstart, tend, dt; - int max_steps; + ode_params params; + + scalar_type tstart, tend; vec_type y_old, y_new, tmp; mv_type kstack; RKSolve_wrapper(const ode_type& my_ode_, const table_type& table_, - const scalar_type tstart_, const scalar_type tend_, const scalar_type dt_, - const int max_steps_, const vec_type& y_old_, const vec_type& y_new_, - const vec_type& tmp_, const mv_type& kstack_) : - my_ode(my_ode_), table(table_), tstart(tstart_), tend(tend_), dt(dt_), max_steps(max_steps_), - y_old(y_old_), y_new(y_new_), tmp(tmp_), kstack(kstack_) {} + const ode_params& params_, const scalar_type tstart_, + const scalar_type tend_, const vec_type& y_old_, + const vec_type& y_new_, const vec_type& tmp_, + const mv_type& kstack_) + : my_ode(my_ode_), + table(table_), + params(params_), + tstart(tstart_), + tend(tend_), + y_old(y_old_), + y_new(y_new_), + tmp(tmp_), + kstack(kstack_) {} KOKKOS_FUNCTION - void operator() (const int idx) const { - + void operator()(const int idx) const { // Take subviews to create the local problem - auto local_y_old = Kokkos::subview( y_old, Kokkos::pair(2*idx, 2*idx + 1)); - auto local_y_new = Kokkos::subview( y_new, Kokkos::pair(2*idx, 2*idx + 1)); - auto local_tmp = Kokkos::subview( tmp, Kokkos::pair(2*idx, 2*idx + 1)); - auto local_kstack = Kokkos::subview(kstack, Kokkos::pair(2*idx, 2*idx + 1), Kokkos::ALL()); + auto local_y_old = + Kokkos::subview(y_old, Kokkos::pair(2 * idx, 2 * idx + 1)); + auto local_y_new = + Kokkos::subview(y_new, Kokkos::pair(2 * idx, 2 * idx + 1)); + auto local_tmp = Kokkos::subview(tmp, Kokkos::pair(2 * idx, 2 * idx + 1)); 
+ auto local_kstack = Kokkos::subview( + kstack, Kokkos::pair(2 * idx, 2 * idx + 1), Kokkos::ALL()); // Run Runge-Kutta time integrator - KokkosODE::Impl::RKSolve(my_ode, table, tstart, tend, dt, max_steps, - local_y_old, local_y_new, local_tmp, local_kstack); + KokkosODE::Impl::RKSolve( + my_ode, table, params, tstart, tend, local_y_old, local_y_new, + local_tmp, local_kstack); } }; -struct rk_input_parameters{ - +struct rk_input_parameters { int num_odes; int model; - - rk_input_parameters(const int num_odes_, const int model_) : num_odes(num_odes_), model(model_) {}; - + int repeat; + bool verbose; + + rk_input_parameters(const int num_odes_, const int model_, const int repeat_, + const bool verbose_) + : num_odes(num_odes_), + model(model_), + repeat(repeat_), + verbose(verbose_){}; }; -} // namespace (anonymous) - +} // namespace -#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK -void run_ode_chem(benchmark::State& state, const rk_input_parameters& params) { -#else -void run_ode_chem(const std::vector& state, const rk_input_parameters& params) { -#endif - using execution_space = Kokkos::DefaultExecutionSpace; - using vec_type = Kokkos::View; +template +void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { + using vec_type = Kokkos::View; using mv_type = Kokkos::View; using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>; + using ode_params = KokkosODE::Experimental::ODE_params; - const int num_odes = params.num_odes; - const int model = params.model; + const int num_odes = inputs.num_odes; + const int model = inputs.model; switch (model) { - case 1: { - chem_model_1 chem_model; - const int neqs = chem_model.neqs; - const int max_steps = 15000; - const double dt = 0.1; - - table_type table; - vec_type tmp("tmp vector", neqs*num_odes); - mv_type kstack("k stack", neqs*num_odes, table.nstages); - - // Set initial conditions - vec_type y_new("solution", neqs*num_odes); - vec_type y_old("initial conditions", neqs*num_odes); - auto y_old_h = 
Kokkos::create_mirror(y_old); - y_old_h(0) = 1; y_old_h(1) = 0; - Kokkos::deep_copy(y_old, y_old_h); - Kokkos::deep_copy(y_new, y_old_h); - - Kokkos::RangePolicy my_policy(0, num_odes); - RKSolve_wrapper solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend, - dt, max_steps, y_old, y_new, tmp, kstack); - - Kokkos::Timer time; - time.reset(); - for(auto _ : state) { - (void) _; - Kokkos::parallel_for(my_policy, solve_wrapper); - Kokkos::fence(); + case 1: { + chem_model_1 chem_model; + const int neqs = chem_model.neqs; + const int num_steps = 15000; + const double dt = 0.1; + + table_type table; + ode_params params(num_steps); + vec_type tmp("tmp vector", neqs * num_odes); + mv_type kstack("k stack", neqs * num_odes, table.nstages); + + // Set initial conditions + vec_type y_new("solution", neqs * num_odes); + vec_type y_old("initial conditions", neqs * num_odes); + auto y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 1; + y_old_h(1) = 0; + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + + Kokkos::RangePolicy my_policy(0, num_odes); + RKSolve_wrapper solve_wrapper(chem_model, table, params, + chem_model.tstart, chem_model.tend, y_old, + y_new, tmp, kstack); + + Kokkos::Timer time; + time.reset(); + for (auto _ : state) { + (void)_; + Kokkos::parallel_for(my_policy, solve_wrapper); + Kokkos::fence(); + } + double run_time = time.seconds(); + + if (inputs.verbose) { + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); + std::cout << "\nChem model 1" << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend + << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 + << std::endl; + std::cout << " dt=" << dt << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" + << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" + << std::endl; + std::cout << " num odes: " << num_odes << 
std::endl; + std::cout << " time elapsed: " << run_time << std::endl; + } + break; } - double run_time = time.seconds(); - - auto y_new_h = Kokkos::create_mirror(y_new); - Kokkos::deep_copy(y_new_h, y_new); - std::cout << "\nChem model 1" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; - std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; - std::cout << " num odes: " << num_odes << std::endl; - std::cout << " time elapsed: " << run_time << std::endl; - break; - } - case 2: { - chem_model_2 chem_model; - const int neqs = chem_model.neqs; - const int max_steps = 15000; - const double dt = 0.1; - - table_type table; - vec_type tmp("tmp vector", neqs*num_odes); - mv_type kstack("k stack", neqs*num_odes, table.nstages); - - // Set initial conditions - vec_type y_new("solution", neqs*num_odes); - vec_type y_old("initial conditions", neqs*num_odes); - auto y_old_h = Kokkos::create_mirror(y_old); - y_old_h(0) = 0.25; y_old_h(1) = 0.25; y_old(2) = 0; - y_old(3) = 0; y_old(4) = 0; y_old(5) = 0; y_old(6) = 0; - Kokkos::deep_copy(y_old, y_old_h); - Kokkos::deep_copy(y_new, y_old_h); - - Kokkos::RangePolicy my_policy(0, num_odes); - RKSolve_wrapper solve_wrapper(chem_model, table, chem_model.tstart, chem_model.tend, - dt, max_steps, y_old, y_new, tmp, kstack); - - Kokkos::Timer time; - time.reset(); - for(auto _ : state) { - (void) _; - Kokkos::parallel_for(my_policy, solve_wrapper); - Kokkos::fence(); + case 2: { + chem_model_2 chem_model; + const int neqs = chem_model.neqs; + const int num_steps = 15000; + const double dt = 0.1; + + table_type table; + ode_params params(num_steps); + vec_type tmp("tmp vector", neqs * num_odes); + mv_type kstack("k stack", neqs * num_odes, table.nstages); + + 
// Set initial conditions + vec_type y_new("solution", neqs * num_odes); + vec_type y_old("initial conditions", neqs * num_odes); + auto y_old_h = Kokkos::create_mirror(y_old); + y_old_h(0) = 0.25; + y_old_h(1) = 0.25; + y_old_h(2) = 0; + y_old_h(3) = 0; + y_old_h(4) = 0; + y_old_h(5) = 0; + y_old_h(6) = 0; + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + + Kokkos::RangePolicy my_policy(0, num_odes); + RKSolve_wrapper solve_wrapper(chem_model, table, params, + chem_model.tstart, chem_model.tend, y_old, + y_new, tmp, kstack); + + Kokkos::Timer time; + time.reset(); + for (auto _ : state) { + (void)_; + Kokkos::parallel_for(my_policy, solve_wrapper); + Kokkos::fence(); + } + double run_time = time.seconds(); + + if (inputs.verbose) { + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); + std::cout << "\nChem model 2" << std::endl; + std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend + << std::endl; + std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 + << std::endl; + std::cout << " dt=" << dt << std::endl; + std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" + << std::endl; + std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" + << std::endl; + std::cout << " num odes: " << num_odes << std::endl; + std::cout << " time elapsed: " << run_time << std::endl; + } + break; } - double run_time = time.seconds(); - - auto y_new_h = Kokkos::create_mirror(y_new); - Kokkos::deep_copy(y_new_h, y_new); - std::cout << "\nChem model 2" << std::endl; - std::cout << " t0=" << chem_model.tstart << ", tn=" << chem_model.tend << std::endl; - std::cout << " T0=" << chem_model.T0 << ", Tn=" << chem_model.T1 << std::endl; - std::cout << " dt=" << dt << std::endl; - std::cout << " y(t0)={" << y_old_h(0) << ", " << y_old_h(1) << "}" << std::endl; - std::cout << " y(tn)={" << y_new_h(0) << ", " << y_new_h(1) << "}" << std::endl; - std::cout << " num odes: " << num_odes << 
std::endl; - std::cout << " time elapsed: " << run_time << std::endl; - break; } - } -} - -#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK -void run_benchmark_wrapper(benchmark::State& state) { - rk_input_parameters params(state.range(0), state.range(1)); - - run_ode_chem(state, params); } -#else -template - void run_perftest_wrapper(int argc, char** argv, perf_test::CommonInputParams) { - rk_input_parameters params(1000, 1); - - // Loop over arguments, parse them and fill params struct - for(int argIdx = 1; argIdx < argc - 1; ++argIdx) { - if((0 == Test::string_compare_no_case(argv[argIdx], "-N"))) { - params.num_odes = atoi(argv[argIdx + 1]); - } - if((0 == Test::string_compare_no_case(argv[argIdx], "--model"))) { - params.model = atoi(argv[argIdx + 1]); +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --repeat :: how many times to repeat overall test" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\t[Optional] --n :: number of ode problems to solve" + << std::endl; + std::cerr + << "\t[Optional] --model :: chemical mode to be solved: 1 or 2" + << std::endl; +} // print_options + +int parse_inputs(rk_input_parameters& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--n", params.num_odes)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--model", + params.model)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--repeat", + params.repeat)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; } } + return 0; +} // parse_inputs - // Basically we call the run routine - // with std::vector state = {1} which - // means we will do the perf test 
once. - // we could change that with command - // line argument logic in the future. - // std::vector tmp({1}); - run_ode_chem({1}, params); +template +void run_benchmark_wrapper(benchmark::State& state, int argc, char** argv) { + rk_input_parameters params(state.range(0), state.range(1), 1, false); + parse_inputs(params, argc, argv); + run_ode_chem(state, params); } -#endif - -#ifdef KOKKOSKERNELS_ENABLE_BENCHMARK -// Benchmark style call for performance -// monitoring over time in a highly controlled -// environment int main(int argc, char** argv) { Kokkos::initialize(argc, argv); @@ -289,13 +334,28 @@ int main(int argc, char** argv) { benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); KokkosKernelsBenchmark::add_benchmark_context(true); + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); std::string bench_name = "KokkosODE_chem_models"; - benchmark::RegisterBenchmark(bench_name.c_str(), run_benchmark_wrapper) - ->UseRealTime() - ->ArgNames({"n", "model"}) - ->Args({1000, 1}); + if (0 < common_params.repeat) { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_benchmark_wrapper, argc, argv) + ->UseRealTime() + ->ArgNames({"n", "model"}) + ->Args({1000, 1}) + ->Iterations(common_params.repeat); + } else { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_benchmark_wrapper, argc, argv) + ->UseRealTime() + ->ArgNames({"n", "model"}) + ->Args({1000, 1}) + ->Iterations(common_params.repeat); + } benchmark::RunSpecifiedBenchmarks(); @@ -304,15 +364,3 @@ int main(int argc, char** argv) { return 0; } - -#else - -// perf test style call for testing and -// tunning of algorithms in a convenient way -#define KOKKOSKERNELS_PERF_TEST_NAME run_perftest_wrapper -#include "KokkosKernels_perf_test_instantiation.hpp" -int main(int argc, char** argv) { - return main_instantiation(argc, argv); -} // main - -#endif // KOKKOSKERNELS_ENABLE_BENCHMARK From 6b4b8bb17cba8f62c700832d5b27a684fa128ba0 Mon Sep 17 
00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 6 Apr 2023 09:27:04 -0600 Subject: [PATCH 204/442] ODE: fix small typo and rebase error --- ode/unit_test/Test_ODE_RK_chem.hpp | 1 - perf_test/ode/KokkosODE_RK.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 24ad69f3ac..2adc202ddc 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -100,7 +100,6 @@ void test_chem() { chem_model_1 chem_model; const int neqs = chem_model.neqs; const int num_steps = 15000; - const double dt = 0.1; KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp index 617e7453cd..4f6e53e143 100644 --- a/perf_test/ode/KokkosODE_RK.cpp +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -31,7 +31,7 @@ struct chem_model_1 { // constexpr static double alpha = 1e-6*1.85e10; constexpr static double alpha = 1.85e10; constexpr static double beta = 15618; - constexpr static double gamma = 1 - 10 ^ -9; + constexpr static double gamma = 1 - 10e-9; const double tstart, tend, T0, T1; From 788018fd43784fd8020e01a18dde0fe1bf7323c1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 6 Apr 2023 11:29:11 -0600 Subject: [PATCH 205/442] Batched Gesv: initializing variable to make compiler happy Explicitly initializing the reducer type avoids warning in the nightly GCC 910 build.
--- batched/dense/impl/KokkosBatched_Gesv_Impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index a72c1a04ce..3541ed246e 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -291,6 +291,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( for (size_t i = 0; i < n; ++i) { int row_index, col_index; reducer_value_type value; + value.val = Kokkos::reduction_identity::max(); + value.loc = Kokkos::reduction_identity::min(); Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, n), From 2bd997ae36987f0017ccec207d38d427944451f6 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 6 Apr 2023 19:42:09 +0200 Subject: [PATCH 206/442] #8 added SYCL path for MKL in FindTPLMKL.cmake file --- cmake/Modules/FindTPLMKL.cmake | 54 +++++++++++++++------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 163d3c280d..52f4571976 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -1,34 +1,30 @@ find_package(MKL) IF(TARGET MKL::MKL) - # MKL version >= 2021 (see kokkos wiki and intel documentation. MKL CMake module file has been introduced starting MKL >= 2021) - MESSAGE("TARGET MKL::MKL FOUND") - IF (KOKKOS_ENABLE_SYCL) #get from kokkos-core - # MKL version >= 2022 (see kokkos wiki) - MESSAGE("KOKKOS_ENABLE_SYCL Detected") - IF (TARGET MKL::MKL_DPCPP) - MESSAGE("TARGET MKL::MKL_DPCPP FOUND") - ENDIF() - MESSAGE(FATAL_ERROR "KOKKOS_ENABLE_SYCL activated but the target MKL_DPCPP wasn't found") + # MKL version >= 2021 (see kokkos wiki and intel documentation. 
MKL CMake module file has been introduced starting MKL >= 2021) + IF (KOKKOS_ENABLE_SYCL) #get from kokkos-core + # MKL version >= 2022 (see kokkos wiki) + IF (NOT TARGET MKL::MKL_DPCPP) + MESSAGE(FATAL_ERROR "KOKKOS_ENABLE_SYCL activated but the target MKL_DPCPP wasn't found") ENDIF() - - SET(TPL_MKL_IMPORTED_NAME MKL::MKL) - SET(TPL_IMPORTED_NAME MKL::MKL) - ADD_LIBRARY(MKL INTERFACE) - IF(KOKKOS_ENABLE_SYCL) - TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL MKL::MKL_DPCPP) - ELSE() - TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL ) - ENDIF() - - ADD_LIBRARY(KokkosKernels::MKL ALIAS MKL ) - GET_TARGET_PROPERTY(LIB_TYPE ${TPL_IMPORTED_NAME} TYPE) - MESSAGE("LIB_TYPE: ${LIB_TYPE}") - # kokkoskernels_export_imported_tpl install MKL with target name MKL instead of - # MKL::MKL or KokkosKernels::MKL, so we need to install a specific ALIAS one - if(TARGET MKL) - MESSAGE("TARGET MKL CREATED") - ENDIF() -ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") # Regular wary with MKL version < 2021 (Where MKL doesn't provide cmake module file) + ENDIF() + SET(TPL_MKL_IMPORTED_NAME MKL::MKL) + SET(TPL_IMPORTED_NAME MKL::MKL) + ADD_LIBRARY(MKL INTERFACE) + IF(KOKKOS_ENABLE_SYCL) + TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL MKL::MKL_DPCPP) + ELSE() + TARGET_LINK_LIBRARIES(MKL INTERFACE MKL::MKL ) + ENDIF() + ADD_LIBRARY(KokkosKernels::MKL ALIAS MKL ) + GET_TARGET_PROPERTY(LIB_TYPE ${TPL_IMPORTED_NAME} TYPE) + MESSAGE("LIB_TYPE: ${LIB_TYPE}") + # kokkoskernels_export_imported_tpl install MKL with target name MKL instead of + # MKL::MKL or KokkosKernels::MKL, so we need to install a specific ALIAS one + if(TARGET MKL) + MESSAGE("TARGET MKL CREATED") + ENDIF() +ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") +# Regular way with MKL version < 2021 (Where MKL doesn't provide cmake module file) TRY_COMPILE(KOKKOSKERNELS_HAS_MKL_ARG ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/mkl.cpp @@ -51,8 +47,6 @@ ELSEIF(WIN32) ) ENDIF() ELSE() - #TODO: old 
version plus small modif on header : - # ${MKL_ROOT}/include => ${MKL_ROOT}/include/mkl IF (NOT DEFINED ENV{MKLROOT}) SET(NO_MKL_ROOT_GIVEN "MKL-NOTFOUND") MESSAGE(WARNING "No MKLROOT environment variable specified - must source mklvars.sh to configure MKL path") From 397a3c6604c87280411150521134336956e09424 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 6 Apr 2023 15:05:56 -0600 Subject: [PATCH 207/442] Gesv: adding small comment for clarity --- batched/dense/impl/KokkosBatched_Gesv_Impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 3541ed246e..79c832579d 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -291,6 +291,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( for (size_t i = 0; i < n; ++i) { int row_index, col_index; reducer_value_type value; + // Note: reduction_identity::max() returns + // the samllest representable value of type value_type value.val = Kokkos::reduction_identity::max(); value.loc = Kokkos::reduction_identity::min(); Kokkos::MaxLoc reducer_value(value); From ec7ce2133f49470e695ca2a1a4420cb32842cc94 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 6 Apr 2023 16:04:10 -0600 Subject: [PATCH 208/442] Gesv: using a value-initialization after all --- batched/dense/impl/KokkosBatched_Gesv_Impl.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 79c832579d..0ef43ee4f8 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -290,11 +290,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( for (size_t i = 0; i < n; ++i) { int row_index, col_index; - reducer_value_type value; - // Note: reduction_identity::max() returns - // the samllest 
representable value of type value_type - value.val = Kokkos::reduction_identity::max(); - value.loc = Kokkos::reduction_identity::min(); + reducer_value_type value{}; Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, n), From f666fba99e1d5709998f6233760c4b28b4a78660 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 30 Mar 2023 17:20:35 -0600 Subject: [PATCH 209/442] Sort and merge improvements - Don't initialize views unnecessarily - Early exit if no entries were merged - View-based interface for matrices, like graph already had - Beef up unit tests (graph, empty cases, early-exit case with zero merges) --- sparse/src/KokkosSparse_SortCrs.hpp | 221 +++++++++++++++------ sparse/unit_test/Test_Sparse_SortCrs.hpp | 242 ++++++++++++++++++----- 2 files changed, 348 insertions(+), 115 deletions(-) diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 6cdfd9c1c9..912e7ad5ad 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -95,7 +95,9 @@ void sort_crs_graph(const typename crsGraph_t::execution_space& exec, // sort_and_merge_matrix produces a new matrix which is equivalent to A but is // sorted and has no duplicated entries: each (i, j) is unique. Values for // duplicated entries are summed. Each version either takes an execution space -// instance as a parameter, or uses the default instance. +// instance as a parameter, or uses the default instance. If there are no +// duplicated entries in A, A is sorted and returned (instead of a newly +// allocated matrix). 
template crsMat_t sort_and_merge_matrix(const crsMat_t& A); @@ -104,6 +106,21 @@ template crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, const crsMat_t& A); +template +void sort_and_merge_matrix(const exec_space& exec, + const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out); + +template +void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out); + template crsGraph_t sort_and_merge_graph(const crsGraph_t& G); @@ -267,8 +284,8 @@ struct MatrixMergedEntriesFunctor { using scalar_t = typename values_t::non_const_value_type; // Precondition: entries are sorted within each row - MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, - const values_t& values_, + MatrixMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, + const entries_t& entries_, const values_t& values_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_, const values_t& mergedValues_) @@ -308,7 +325,7 @@ struct MatrixMergedEntriesFunctor { mergedEntries(insertPos) = accumCol; } - rowmap_t rowmap; + typename rowmap_t::const_type rowmap; entries_t entries; values_t values; rowmap_t mergedRowmap; @@ -322,7 +339,8 @@ struct GraphMergedEntriesFunctor { using lno_t = typename entries_t::non_const_value_type; // Precondition: entries are sorted within each row - GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, + GraphMergedEntriesFunctor(const typename rowmap_t::const_type& rowmap_, + const entries_t& entries_, const rowmap_t& mergedRowmap_, const entries_t& mergedEntries_) : rowmap(rowmap_), @@ -352,7 +370,7 @@ struct GraphMergedEntriesFunctor { mergedEntries(insertPos) = accumCol; } - rowmap_t rowmap; + typename 
rowmap_t::const_type rowmap; entries_t entries; rowmap_t mergedRowmap; entries_t mergedEntries; @@ -566,46 +584,109 @@ void sort_crs_graph(const crsGraph_t& G) { template crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, const crsMat_t& A) { - using c_rowmap_t = typename crsMat_t::row_map_type; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using size_type = typename rowmap_t::non_const_value_type; - using exec_space = typename crsMat_t::execution_space; - using range_t = Kokkos::RangePolicy; - sort_crs_matrix(exec, A); + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + + rowmap_t rowmap_out; + entries_t entries_out; + values_t values_out; + + sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, + rowmap_out, entries_out, values_out); + + return crsMat_t("SortedMerged", A.numRows(), A.numCols(), + values_out.extent(0), values_out, rowmap_out, entries_out); +} + +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + return sort_and_merge_matrix(typename crsMat_t::execution_space(), A); +} + +template +void sort_and_merge_matrix(const exec_space& exec, + const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out) { + using nc_rowmap_t = typename rowmap_t::non_const_type; + using size_type = typename nc_rowmap_t::value_type; + using ordinal_t = typename entries_t::value_type; + using range_t = Kokkos::RangePolicy; + static_assert(!std::is_const_v, + "sort_and_merge_matrix: entries_t must not be const-valued"); + static_assert(!std::is_const_v, + "sort_and_merge_matrix: values_t must not be const-valued"); + + ordinal_t numRows = + 
rowmap_in.extent(0) ? ordinal_t(rowmap_in.extent(0) - 1) : ordinal_t(0); + size_type nnz = entries_in.extent(0); + + if (numRows == 0) { + rowmap_out = typename rowmap_t::non_const_type("SortedMerged rowmap", + rowmap_in.extent(0)); + entries_out = entries_t(); + values_out = values_t(); + return; + } + + sort_crs_matrix(exec, rowmap_in, entries_in, values_in); + // Count entries per row into a new rowmap, in terms of merges that can be // done - rowmap_t mergedRowmap(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, - "SortedMerged rowmap"), - A.numRows() + 1); + nc_rowmap_t nc_rowmap_out( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged rowmap"), + numRows + 1); size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(exec, 0, A.numRows()), - Impl::MergedRowmapFunctor( - mergedRowmap, A.graph.row_map, A.graph.entries), + Kokkos::parallel_reduce(range_t(exec, 0, numRows), + Impl::MergedRowmapFunctor( + nc_rowmap_out, rowmap_in, entries_in), numCompressedEntries); + if (nnz == numCompressedEntries) { + // No merges to do, so just return A. Save the time of allocating and + // filling a copy. + if constexpr (std::is_const_v) { + rowmap_out = rowmap_in; + } else { + // rowmap_t is non-const, so we can't directly assign rowmap_in to + // rowmap_out. Forced to deep copy it to maintain const-correctness. 
+ Kokkos::deep_copy(exec, nc_rowmap_out, rowmap_in); + rowmap_out = nc_rowmap_out; + } + entries_out = entries_in; + values_out = values_in; + return; + } // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - exec, A.numRows() + 1, mergedRowmap); - entries_t mergedEntries(Kokkos::view_alloc(exec, "SortedMerged entries"), + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + exec, numRows + 1, nc_rowmap_out); + rowmap_out = nc_rowmap_out; + entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged entries"), numCompressedEntries); - values_t mergedValues(Kokkos::view_alloc(exec, "SortedMerged values"), + values_out = values_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged values"), numCompressedEntries); // Compute merged entries and values Kokkos::parallel_for( - range_t(exec, 0, A.numRows()), - Impl::MatrixMergedEntriesFunctor( - A.graph.row_map, A.graph.entries, A.values, mergedRowmap, - mergedEntries, mergedValues)); - // Finally, construct the new compressed matrix - return crsMat_t("SortedMerged", A.numRows(), A.numCols(), - numCompressedEntries, mergedValues, mergedRowmap, - mergedEntries); + range_t(exec, 0, numRows), + Impl::MatrixMergedEntriesFunctor( + rowmap_in, entries_in, values_in, rowmap_out, entries_out, + values_out)); } -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - return sort_and_merge_matrix(typename crsMat_t::execution_space(), A); +template +void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out) { + sort_and_merge_matrix(exec_space(), rowmap_in, entries_in, values_in, + rowmap_out, entries_out, values_out); } template @@ -613,41 +694,61 @@ void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, rowmap_t& 
rowmap_out, entries_t& entries_out) { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using range_t = Kokkos::RangePolicy; - using const_rowmap_t = typename rowmap_t::const_type; - lno_t numRows = rowmap_in.extent(0); - if (numRows <= 1) { - // Matrix has zero rows - rowmap_out = rowmap_t(); + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::value_type; + using range_t = Kokkos::RangePolicy; + using nc_rowmap_t = typename rowmap_t::non_const_type; + static_assert(!std::is_const_v, + "sort_and_merge_graph: entries_t must not be const-valued"); + lno_t numRows = rowmap_in.extent(0) ? rowmap_in.extent(0) - 1 : 0; + if (numRows == 0) { + rowmap_out = typename rowmap_t::non_const_type("SortedMerged rowmap", + rowmap_in.extent(0)); entries_out = entries_t(); return; } - numRows--; // Sort in place - sort_crs_graph(exec, rowmap_in, - entries_in); + sort_crs_graph(exec, rowmap_in, entries_in); // Count entries per row into a new rowmap, in terms of merges that can be // done - rowmap_out = rowmap_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, - "SortedMerged rowmap"), - numRows + 1); + nc_rowmap_t nc_rowmap_out( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged rowmap"), + numRows + 1); size_type numCompressedEntries = 0; Kokkos::parallel_reduce(range_t(exec, 0, numRows), Impl::MergedRowmapFunctor( - rowmap_out, rowmap_in, entries_in), + nc_rowmap_out, rowmap_in, entries_in), numCompressedEntries); - // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - exec, numRows + 1, rowmap_out); - entries_out = entries_t(Kokkos::view_alloc(exec, "SortedMerged entries"), + if (entries_in.extent(0) == size_t(numCompressedEntries)) { + // No merges to perform, so the output rowmap is unchanged and we can just + // return the now-sorted entries_in. 
+ if constexpr (std::is_const_v) { + rowmap_out = rowmap_in; + } else { + // rowmap_t is non-const, so we can't directly assign rowmap_in to + // rowmap_out. Forced to deep copy it to maintain const-correctness. + Kokkos::deep_copy(exec, nc_rowmap_out, rowmap_in); + rowmap_out = nc_rowmap_out; + } + entries_out = entries_in; + return; + } + // Prefix sum to get rowmap. + // In the case where the output rowmap is the same as the input, we could just + // assign "rowmap_out = rowmap_in" except that would break const-correctness. + // Can skip filling the entries, however. + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + exec, numRows + 1, nc_rowmap_out); + rowmap_out = nc_rowmap_out; + entries_out = entries_t(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "SortedMerged entries"), numCompressedEntries); // Compute merged entries and values - Kokkos::parallel_for( - range_t(exec, 0, numRows), - Impl::GraphMergedEntriesFunctor( - rowmap_in, entries_in, rowmap_out, entries_out)); + Kokkos::parallel_for(range_t(exec, 0, numRows), + Impl::GraphMergedEntriesFunctor( + rowmap_in, entries_in, rowmap_out, entries_out)); } template @@ -668,9 +769,7 @@ crsGraph_t sort_and_merge_graph( "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); rowmap_t mergedRowmap; entries_t mergedEntries; - sort_and_merge_graph(exec, G.row_map, G.entries, mergedRowmap, - mergedEntries); + sort_and_merge_graph(exec, G.row_map, G.entries, mergedRowmap, mergedEntries); return crsGraph_t(mergedEntries, mergedRowmap); } diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 089fdd73c7..9a04bce302 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -186,7 +186,8 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { } template -void testSortAndMerge(bool useExecInstance) { +void testSortAndMerge(bool justGraph, bool useExecInstance, + bool 
doStructInterface, int testCase) { using size_type = default_size_type; using lno_t = default_lno_t; using scalar_t = default_scalar; @@ -194,86 +195,206 @@ void testSortAndMerge(bool useExecInstance) { using device_t = Kokkos::Device; using crsMat_t = KokkosSparse::CrsMatrix; + using graph_t = typename crsMat_t::staticcrsgraph_type; using rowmap_t = typename crsMat_t::row_map_type::non_const_type; using entries_t = typename crsMat_t::index_type; using values_t = typename crsMat_t::values_type; using Kokkos::HostSpace; using Kokkos::MemoryTraits; using Kokkos::Unmanaged; - // Create a small CRS matrix on host - std::vector inRowmap = {0, 4, 4, 5, 7, 10}; - std::vector inEntries = { - 4, 3, 5, 3, // row 0 - // row 1 has no entries - 6, // row 2 - 2, 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector inValues = { - 1.5, 4, 1, -3, // row 0 - // row 1 - 2, // row 2 - -1, -2, // row 3 - 0, 3.5, -2.25 // row 4 - }; - lno_t nrows = 5; - lno_t ncols = 7; + // Select a test case: matrices and correct ouptut are hardcoded for each + std::vector inRowmap; + std::vector inEntries; + std::vector inValues; + std::vector goldRowmap; + std::vector goldEntries; + std::vector goldValues; + lno_t nrows = 0; + lno_t ncols = 0; + switch (testCase) { + case 0: { + // Two merges take place, and one depends on sorting being done correctly + nrows = 5; + ncols = 7; + inRowmap = {0, 4, 4, 5, 7, 10}; + inEntries = { + 4, 3, 5, 3, // row 0 + // row 1 has no entries + 6, // row 2 + 2, 2, // row 3 + 0, 1, 2 // row 4 + }; + // note: choosing values that can be represented exactly by float + inValues = { + 1.5, 4, 1, -3, // row 0 + // row 1 + 2, // row 2 + -1, -2, // row 3 + 0, 3.5, -2.25 // row 4 + }; + // Expect 2 merges to have taken place + goldRowmap = {0, 3, 3, 4, 5, 8}; + goldEntries = { + 3, 4, 5, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + goldValues = { + 1, 1.5, 1, // row 0 + 
// row 1 + 2, // row 2 + -3, // row 3 + 0, 3.5, -2.25 // row 4 + }; + break; + } + case 1: { + // Same as above, but no merges take place + nrows = 5; + ncols = 7; + inRowmap = {0, 3, 3, 4, 5, 8}; + inEntries = { + 4, 5, 3, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + inValues = { + 1.5, 4, 1, // row 0 + // row 1 + 2, // row 2 + -1, // row 3 + 0, 3.5, -2.25 // row 4 + }; + // Expect 2 merges to have taken place + goldRowmap = {0, 3, 3, 4, 5, 8}; + goldEntries = { + 3, 4, 5, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + goldValues = { + 1, 1.5, 4, // row 0 + // row 1 + 2, // row 2 + -1, // row 3 + 0, 3.5, -2.25 // row 4 + }; + break; + } + case 2: { + // Nonzero dimensions but no entries + nrows = 5; + ncols = 7; + inRowmap = {0, 0, 0, 0, 0, 0}; + goldRowmap = inRowmap; + break; + } + case 3: { + // Zero rows, length-zero rowmap + break; + } + case 4: { + // Zero rows, length-one rowmap + inRowmap = {0}; + goldRowmap = {0}; + break; + } + } size_type nnz = inEntries.size(); Kokkos::View> hostInRowmap( - inRowmap.data(), nrows + 1); + inRowmap.data(), inRowmap.size()); Kokkos::View> hostInEntries( inEntries.data(), nnz); Kokkos::View> hostInValues( inValues.data(), nnz); - rowmap_t devInRowmap("", nrows + 1); - entries_t devInEntries("", nnz); - values_t devInValues("", nnz); + rowmap_t devInRowmap("in rowmap", inRowmap.size()); + entries_t devInEntries("in entries", nnz); + values_t devInValues("in values", nnz); Kokkos::deep_copy(devInRowmap, hostInRowmap); Kokkos::deep_copy(devInEntries, hostInEntries); Kokkos::deep_copy(devInValues, hostInValues); crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, devInEntries); crsMat_t output; - if (useExecInstance) { - output = KokkosSparse::sort_and_merge_matrix(exec_space(), input); + if (justGraph) { + graph_t outputGraph; + // Testing sort_and_merge_graph + if (doStructInterface) { + if (useExecInstance) { + outputGraph 
= + KokkosSparse::sort_and_merge_graph(exec_space(), input.graph); + } else { + outputGraph = KokkosSparse::sort_and_merge_graph(input.graph); + } + } else { + rowmap_t devOutRowmap; + entries_t devOutEntries; + if (useExecInstance) { + KokkosSparse::sort_and_merge_graph(exec_space(), input.graph.row_map, + input.graph.entries, devOutRowmap, + devOutEntries); + } else { + KokkosSparse::sort_and_merge_graph( + input.graph.row_map, input.graph.entries, devOutRowmap, + devOutEntries); + } + outputGraph = graph_t(devOutEntries, devOutRowmap); + } + // Construct output using the output graph, leaving values zero-initialized + output = crsMat_t("Output", outputGraph, ncols); } else { - output = KokkosSparse::sort_and_merge_matrix(input); + // Testing sort_and_merge_matrix + if (doStructInterface) { + if (useExecInstance) { + output = KokkosSparse::sort_and_merge_matrix(exec_space(), input); + } else { + output = KokkosSparse::sort_and_merge_matrix(input); + } + } else { + rowmap_t devOutRowmap; + entries_t devOutEntries; + values_t devOutValues; + if (useExecInstance) { + KokkosSparse::sort_and_merge_matrix( + exec_space(), input.graph.row_map, input.graph.entries, + input.values, devOutRowmap, devOutEntries, devOutValues); + } else { + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + } + // and then construct output from views + output = crsMat_t("Output", nrows, ncols, devOutValues.extent(0), + devOutValues, devOutRowmap, devOutEntries); + } + EXPECT_EQ(output.numRows(), nrows); + EXPECT_EQ(output.numCols(), ncols); } - exec_space().fence(); - EXPECT_EQ(output.numRows(), nrows); - EXPECT_EQ(output.numCols(), ncols); auto outRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.graph.row_map); auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.graph.entries); auto outValues = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values); - // Expect 2 merges to have taken place - std::vector goldRowmap = {0, 3, 3, 4, 5, 8}; - std::vector goldEntries = { - 3, 4, 5, // row 0 - // row 1 has no entries - 6, // row 2 - 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector goldValues = { - 1, 1.5, 1, // row 0 - // row 1 - 2, // row 2 - -3, // row 3 - 0, 3.5, -2.25 // row 4 - }; EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0)); EXPECT_EQ(goldEntries.size(), outEntries.extent(0)); - EXPECT_EQ(goldValues.size(), outValues.extent(0)); - EXPECT_EQ(goldValues.size(), output.nnz()); - for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i)); - for (size_type i = 0; i < output.nnz(); i++) { + if (!justGraph) { + EXPECT_EQ(goldValues.size(), outValues.extent(0)); + EXPECT_EQ(goldValues.size(), output.nnz()); + } + for (size_t i = 0; i < goldRowmap.size(); i++) + EXPECT_EQ(goldRowmap[i], outRowmap(i)); + for (size_t i = 0; i < goldEntries.size(); i++) { EXPECT_EQ(goldEntries[i], outEntries(i)); - EXPECT_EQ(goldValues[i], outValues(i)); + if (!justGraph) { + EXPECT_EQ(goldValues[i], outValues(i)); + } } } @@ -313,8 +434,21 @@ TEST_F(TestCategory, common_sort_crs_longrows) { } TEST_F(TestCategory, common_sort_merge_crsmatrix) { - testSortAndMerge(false); - testSortAndMerge(true); + for (int testCase = 0; testCase < 5; testCase++) { + testSortAndMerge(false, false, false, testCase); + testSortAndMerge(false, false, true, testCase); + testSortAndMerge(false, true, false, testCase); + testSortAndMerge(false, true, true, testCase); + } +} + +TEST_F(TestCategory, common_sort_merge_crsgraph) { + for (int testCase = 0; testCase < 5; testCase++) { + testSortAndMerge(true, false, false, testCase); + testSortAndMerge(true, false, true, testCase); + testSortAndMerge(true, true, false, testCase); + testSortAndMerge(true, true, true, testCase); + } } #endif // 
KOKKOSSPARSE_SORTCRSTEST_HPP From d49004f7716194091973048de4ae91370aba63d9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 6 Apr 2023 12:30:01 -0600 Subject: [PATCH 210/442] Remvoe deprecated KokkosKernels::Impl:: sort functions --- sparse/src/KokkosSparse_SortCrs.hpp | 38 ----------------------------- 1 file changed, 38 deletions(-) diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 912e7ad5ad..8c735ef301 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -857,44 +857,6 @@ template entries_out); } -// For backward compatibility: keep the public interface accessible in -// KokkosKernels::Impl:: -namespace Impl { -template -[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, - const entries_t& entries) { - KokkosKernels::sort_crs_graph(rowmap, - entries); -} - -template -[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, - const entries_t& entries, - const values_t& values) { - KokkosKernels::sort_crs_matrix(rowmap, entries, values); -} - -template -[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { - KokkosKernels::sort_crs_matrix(A); -} - -template -[[deprecated]] void sort_and_merge_graph( - const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) { - KokkosKernels::sort_and_merge_graph( - rowmap_in, entries_in, rowmap_out, entries_out); -} - -template -[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - return KokkosKernels::sort_and_merge_matrix(A); -} - -} // namespace Impl } // namespace KokkosKernels #endif // _KOKKOSSPARSE_SORTCRS_HPP From 893132ccd22551d78c953e7ace2173d29f8ef98f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 6 Apr 2023 14:24:38 -0600 Subject: [PATCH 211/442] Allowed template arg deduction for sort_, sort_and_merge --- sparse/src/KokkosSparse_SortCrs.hpp | 165 ++++++++----------- sparse/unit_test/Test_Sparse_SortCrs.hpp | 195 
+++++++++++++++-------- 2 files changed, 191 insertions(+), 169 deletions(-) diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 8c735ef301..efe39e186f 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -61,37 +61,6 @@ void sort_bsr_matrix(const typename bsrMat_t::execution_space& exec, // ascending order. Each version either takes an execution space instance as a // parameter, or uses the default instance. -template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values); - -template -void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, - const entries_t& entries, const values_t& values); - -template -void sort_crs_matrix(const crsMat_t& A); - -template -void sort_crs_matrix(const typename crsMat_t::execution_space& exec, - const crsMat_t& A); - -template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); - -template -void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, - const entries_t& entries); - -template -void sort_crs_graph(const crsGraph_t& G); - -template -void sort_crs_graph(const typename crsGraph_t::execution_space& exec, - const crsGraph_t& G); - // sort_and_merge_matrix produces a new matrix which is equivalent to A but is // sorted and has no duplicated entries: each (i, j) is unique. Values for // duplicated entries are summed. Each version either takes an execution space @@ -99,46 +68,6 @@ void sort_crs_graph(const typename crsGraph_t::execution_space& exec, // duplicated entries in A, A is sorted and returned (instead of a newly // allocated matrix). 
-template -crsMat_t sort_and_merge_matrix(const crsMat_t& A); - -template -crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, - const crsMat_t& A); - -template -void sort_and_merge_matrix(const exec_space& exec, - const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, - const values_t& values_in, rowmap_t& rowmap_out, - entries_t& entries_out, values_t& values_out); - -template -void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, - const values_t& values_in, rowmap_t& rowmap_out, - entries_t& entries_out, values_t& values_out); - -template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G); - -template -crsGraph_t sort_and_merge_graph( - const typename crsGraph_t::execution_space& exec, const crsGraph_t& G); - -template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out); - -template -void sort_and_merge_graph(const exec_space& exec, - const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out); - namespace Impl { template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { + sort_crs_matrix(typename entries_t::execution_space(), rowmap, entries, values); +} + template void sort_crs_matrix(const typename crsMat_t::execution_space& exec, const crsMat_t& A) { @@ -566,13 +501,21 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { sort_crs_graph(execution_space(), rowmap, entries); } -template -void sort_crs_graph(const typename crsGraph_t::execution_space& exec, - const crsGraph_t& G) { - static_assert( - !std::is_const::value, - "sort_crs_graph requires StaticCrsGraph entries to be non-const."); - sort_crs_graph(exec, G.row_map, G.entries); +// This overload covers 2 cases, while allowing all template args to be deduced: +// 
- sort_crs_graph(exec, G) +// - sort_crs_graph(rowmap, entries) +template +void sort_crs_graph(const Arg1& a1, const Arg2& a2) { + if constexpr(Kokkos::is_execution_space_v) { + // a1 is an exec instance, a2 is a graph + sort_crs_graph(a1, a2.row_map, a2.entries); + } + else if constexpr(Kokkos::is_view_v) { + // a1 is rowmap, a2 is entries + sort_crs_graph(typename Arg2::execution_space(), a1, a2); + } else { + static_assert(Arg1::doesnthavethisthing, "sort_crs_graph(arg1, arg2): expect either (exec, G) or (rowmap, entries)"); + } } template @@ -580,30 +523,6 @@ void sort_crs_graph(const crsGraph_t& G) { sort_crs_graph(typename crsGraph_t::execution_space(), G); } -// Sort the rows of matrix, and merge duplicate entries. -template -crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, - const crsMat_t& A) { - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; - - rowmap_t rowmap_out; - entries_t entries_out; - values_t values_out; - - sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, - rowmap_out, entries_out, values_out); - - return crsMat_t("SortedMerged", A.numRows(), A.numCols(), - values_out.extent(0), values_out, rowmap_out, entries_out); -} - -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - return sort_and_merge_matrix(typename crsMat_t::execution_space(), A); -} - template void sort_and_merge_matrix(const exec_space& exec, @@ -679,6 +598,30 @@ void sort_and_merge_matrix(const exec_space& exec, values_out)); } +// Sort the rows of matrix, and merge duplicate entries. 
+template +crsMat_t sort_and_merge_matrix(const typename crsMat_t::execution_space& exec, + const crsMat_t& A) { + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + + rowmap_t rowmap_out; + entries_t entries_out; + values_t values_out; + + sort_and_merge_matrix(exec, A.graph.row_map, A.graph.entries, A.values, + rowmap_out, entries_out, values_out); + + return crsMat_t("SortedMerged", A.numRows(), A.numCols(), + values_out.extent(0), values_out, rowmap_out, entries_out); +} + +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + return sort_and_merge_matrix(typename crsMat_t::execution_space(), A); +} + template void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, @@ -689,6 +632,16 @@ void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, rowmap_out, entries_out, values_out); } +template +void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, + const values_t& values_in, rowmap_t& rowmap_out, + entries_t& entries_out, values_t& values_out) { + sort_and_merge_matrix(typename entries_t::execution_space(), rowmap_in, entries_in, values_in, + rowmap_out, entries_out, values_out); +} + template void sort_and_merge_graph(const exec_space& exec, const typename rowmap_t::const_type& rowmap_in, @@ -759,6 +712,14 @@ void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, entries_out); } +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out) { + return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, entries_in, rowmap_out, + entries_out); +} + template crsGraph_t sort_and_merge_graph( const typename crsGraph_t::execution_space& exec, const crsGraph_t& G) { diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp 
b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 9a04bce302..796b538c60 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -32,10 +32,19 @@ #include #include +namespace SortCrsTest +{ + enum : int { + Instance, //Passing in an instance, and deducing template args + ExplicitType, //Using default instance, but specifying type with template arg + ImplicitType //Using default instance, and deducing type based on view + }; +} + template void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues, bool doStructInterface, - bool useExecInstance) { + int howExecSpecified) { using scalar_t = default_scalar; using lno_t = default_lno_t; using size_type = default_size_type; @@ -90,35 +99,50 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, // call the actual sort routine being tested if (doValues) { if (doStructInterface) { - if (useExecInstance) { - KokkosSparse::sort_crs_matrix(exec_space(), A); - } else { - KokkosSparse::sort_crs_matrix(A); + switch(howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_matrix(exec_space(), A); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case SortCrsTest::ImplicitType: + KokkosSparse::sort_crs_matrix(A); } } else { - if (useExecInstance) { - KokkosSparse::sort_crs_matrix(exec_space(), A.graph.row_map, - A.graph.entries, A.values); - } else { - KokkosSparse::sort_crs_matrix(A.graph.row_map, - A.graph.entries, A.values); + switch(howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_matrix(exec_space(), A.graph.row_map, + A.graph.entries, A.values); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_crs_matrix(A.graph.row_map, + A.graph.entries, A.values); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_crs_matrix(A.graph.row_map, A.graph.entries, A.values); } } } else { if (doStructInterface) { - if 
(useExecInstance) { - KokkosSparse::sort_crs_graph(exec_space(), A.graph); - } else { - KokkosSparse::sort_crs_graph(A.graph); + switch(howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_graph(exec_space(), A.graph); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case SortCrsTest::ImplicitType: + KokkosSparse::sort_crs_graph(A.graph); } } else { - if (useExecInstance) { - KokkosSparse::sort_crs_graph(exec_space(), A.graph.row_map, - A.graph.entries); - } else { - KokkosSparse::sort_crs_graph( - A.graph.row_map, A.graph.entries); + switch(howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_crs_graph(exec_space(), A.graph.row_map, A.graph.entries); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_crs_graph(A.graph.row_map, A.graph.entries); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_crs_graph(A.graph.row_map, A.graph.entries); } } } @@ -186,7 +210,7 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { } template -void testSortAndMerge(bool justGraph, bool useExecInstance, +void testSortAndMerge(bool justGraph, int howExecSpecified, bool doStructInterface, int testCase) { using size_type = default_size_type; using lno_t = default_lno_t; @@ -326,23 +350,34 @@ void testSortAndMerge(bool justGraph, bool useExecInstance, graph_t outputGraph; // Testing sort_and_merge_graph if (doStructInterface) { - if (useExecInstance) { - outputGraph = + switch(howExecSpecified) { + case SortCrsTest::Instance: + outputGraph = KokkosSparse::sort_and_merge_graph(exec_space(), input.graph); - } else { - outputGraph = KokkosSparse::sort_and_merge_graph(input.graph); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case SortCrsTest::ImplicitType: + outputGraph = KokkosSparse::sort_and_merge_graph(input.graph); } } else { rowmap_t devOutRowmap; entries_t devOutEntries; - if (useExecInstance) { - 
KokkosSparse::sort_and_merge_graph(exec_space(), input.graph.row_map, - input.graph.entries, devOutRowmap, - devOutEntries); - } else { - KokkosSparse::sort_and_merge_graph( - input.graph.row_map, input.graph.entries, devOutRowmap, - devOutEntries); + switch(howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_and_merge_graph(exec_space(), input.graph.row_map, + input.graph.entries, devOutRowmap, + devOutEntries); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_and_merge_graph( + input.graph.row_map, input.graph.entries, devOutRowmap, + devOutEntries); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_and_merge_graph( + input.graph.row_map, input.graph.entries, devOutRowmap, + devOutEntries); } outputGraph = graph_t(devOutEntries, devOutRowmap); } @@ -351,23 +386,34 @@ void testSortAndMerge(bool justGraph, bool useExecInstance, } else { // Testing sort_and_merge_matrix if (doStructInterface) { - if (useExecInstance) { - output = KokkosSparse::sort_and_merge_matrix(exec_space(), input); - } else { - output = KokkosSparse::sort_and_merge_matrix(input); + switch(howExecSpecified) { + case SortCrsTest::Instance: + output = KokkosSparse::sort_and_merge_matrix(exec_space(), input); + break; + case SortCrsTest::ExplicitType: + throw std::logic_error("Should not get here"); + case SortCrsTest::ImplicitType: + output = KokkosSparse::sort_and_merge_matrix(input); } } else { rowmap_t devOutRowmap; entries_t devOutEntries; values_t devOutValues; - if (useExecInstance) { - KokkosSparse::sort_and_merge_matrix( - exec_space(), input.graph.row_map, input.graph.entries, - input.values, devOutRowmap, devOutEntries, devOutValues); - } else { - KokkosSparse::sort_and_merge_matrix( - input.graph.row_map, input.graph.entries, input.values, - devOutRowmap, devOutEntries, devOutValues); + switch(howExecSpecified) { + case SortCrsTest::Instance: + KokkosSparse::sort_and_merge_matrix( + exec_space(), input.graph.row_map, input.graph.entries, + 
input.values, devOutRowmap, devOutEntries, devOutValues); + break; + case SortCrsTest::ExplicitType: + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); + break; + case SortCrsTest::ImplicitType: + KokkosSparse::sort_and_merge_matrix( + input.graph.row_map, input.graph.entries, input.values, + devOutRowmap, devOutEntries, devOutValues); } // and then construct output from views output = crsMat_t("Output", nrows, ncols, devOutValues.extent(0), @@ -400,13 +446,17 @@ void testSortAndMerge(bool justGraph, bool useExecInstance, TEST_F(TestCategory, common_sort_crsgraph) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - for (int useExecInstance = 0; useExecInstance < 2; useExecInstance++) { + for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { + // If using the struct interface (StaticCrsGraph), cannot use ExplicitType because + // the exec space type is determined from the graph. + if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; testSortCRS(10, 10, 20, false, doStructInterface, - useExecInstance); + howExecSpecified); testSortCRS(100, 100, 2000, false, doStructInterface, - useExecInstance); + howExecSpecified); testSortCRS(1000, 1000, 30000, false, doStructInterface, - useExecInstance); + howExecSpecified); } testSortCRSUnmanaged(false, doStructInterface); } @@ -414,40 +464,51 @@ TEST_F(TestCategory, common_sort_crsgraph) { TEST_F(TestCategory, common_sort_crsmatrix) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - for (int useExecInstance = 0; useExecInstance < 2; useExecInstance++) { + // howExecSpecified: Instance, ExplicitType, ImplicitType + for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { + // If using the struct interface (CrsMatrix), cannot use ExplicitType because + // the exec space type is determined from the matrix. 
+ if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; testSortCRS(10, 10, 20, true, doStructInterface, - useExecInstance); + howExecSpecified); testSortCRS(100, 100, 2000, true, doStructInterface, - useExecInstance); + howExecSpecified); testSortCRS(1000, 1000, 30000, true, doStructInterface, - useExecInstance); + howExecSpecified); } testSortCRSUnmanaged(true, doStructInterface); } } TEST_F(TestCategory, common_sort_crs_longrows) { - testSortCRS(1, 50000, 10000, false, false, false); - testSortCRS(1, 50000, 10000, true, false, false); - testSortCRS(1, 50000, 10000, false, false, true); - testSortCRS(1, 50000, 10000, true, false, true); + // Matrix/graph with one very long row + // Just test this once with graph, and once with matrix + testSortCRS(1, 50000, 10000, false, false, SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, true, false, SortCrsTest::ImplicitType); } TEST_F(TestCategory, common_sort_merge_crsmatrix) { for (int testCase = 0; testCase < 5; testCase++) { - testSortAndMerge(false, false, false, testCase); - testSortAndMerge(false, false, true, testCase); - testSortAndMerge(false, true, false, testCase); - testSortAndMerge(false, true, true, testCase); + for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { + if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; + testSortAndMerge(false, howExecSpecified, doStructInterface, testCase); + } + } } } TEST_F(TestCategory, common_sort_merge_crsgraph) { for (int testCase = 0; testCase < 5; testCase++) { - testSortAndMerge(true, false, false, testCase); - testSortAndMerge(true, false, true, testCase); - testSortAndMerge(true, true, false, testCase); - testSortAndMerge(true, true, true, testCase); + for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + for (int howExecSpecified = 0; howExecSpecified < 3; 
howExecSpecified++) { + if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + continue; + testSortAndMerge(true, howExecSpecified, doStructInterface, testCase); + } + } } } From 17b71d2b3c242148f3631fe02b64377cb1759584 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 6 Apr 2023 14:38:46 -0600 Subject: [PATCH 212/442] Add compile-time checks for SortCrs functions - Make sure exec space can access all the views - Make sure views that will be modified are nonconst --- sparse/src/KokkosSparse_SortCrs.hpp | 83 ++++++++++++++++++++++++----- 1 file changed, 71 insertions(+), 12 deletions(-) diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index efe39e186f..31b835d358 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -360,6 +360,25 @@ template void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_matrix: rowmap_t is not accessible from the given execution " + "space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_matrix: entries_t is not accessible from the given execution " + "space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_matrix: values_t is not accessible from the given execution " + "space"); + static_assert(!std::is_const_v, + "sort_crs_matrix: entries_t must not be const-valued"); + static_assert(!std::is_const_v, + "sort_crs_matrix: value_t must not be const-valued"); using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); @@ -399,7 +418,8 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, template void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { - sort_crs_matrix(typename 
entries_t::execution_space(), rowmap, entries, values); + sort_crs_matrix(typename entries_t::execution_space(), rowmap, entries, + values); } template @@ -468,6 +488,18 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_graph: rowmap_t is not accessible from the given execution " + "space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_crs_graph: entries_t is not accessible from the given execution " + "space"); + static_assert(!std::is_const_v, + "sort_crs_graph: entries_t must not be const-valued"); bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; if (numRows == 0) return; @@ -506,15 +538,16 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { // - sort_crs_graph(rowmap, entries) template void sort_crs_graph(const Arg1& a1, const Arg2& a2) { - if constexpr(Kokkos::is_execution_space_v) { + if constexpr (Kokkos::is_execution_space_v) { // a1 is an exec instance, a2 is a graph sort_crs_graph(a1, a2.row_map, a2.entries); - } - else if constexpr(Kokkos::is_view_v) { + } else if constexpr (Kokkos::is_view_v) { // a1 is rowmap, a2 is entries sort_crs_graph(typename Arg2::execution_space(), a1, a2); } else { - static_assert(Arg1::doesnthavethisthing, "sort_crs_graph(arg1, arg2): expect either (exec, G) or (rowmap, entries)"); + static_assert(Arg1::doesnthavethisthing, + "sort_crs_graph(arg1, arg2): expect either (exec, G) or " + "(rowmap, entries)"); } } @@ -534,10 +567,25 @@ void sort_and_merge_matrix(const exec_space& exec, using size_type = typename nc_rowmap_t::value_type; using ordinal_t = typename entries_t::value_type; using range_t = Kokkos::RangePolicy; + static_assert( + Kokkos::SpaceAccessibility::accessible, + 
"sort_and_merge_matrix: rowmap_t is not accessible from the given " + "execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_matrix: entries_t is not accessible from the given " + "execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_matrix: values_t is not accessible from the given " + "execution space"); static_assert(!std::is_const_v, "sort_and_merge_matrix: entries_t must not be const-valued"); static_assert(!std::is_const_v, - "sort_and_merge_matrix: values_t must not be const-valued"); + "sort_and_merge_matrix: value_t must not be const-valued"); ordinal_t numRows = rowmap_in.extent(0) ? ordinal_t(rowmap_in.extent(0) - 1) : ordinal_t(0); @@ -632,14 +680,14 @@ void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, rowmap_out, entries_out, values_out); } -template +template void sort_and_merge_matrix(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, const values_t& values_in, rowmap_t& rowmap_out, entries_t& entries_out, values_t& values_out) { - sort_and_merge_matrix(typename entries_t::execution_space(), rowmap_in, entries_in, values_in, - rowmap_out, entries_out, values_out); + sort_and_merge_matrix(typename entries_t::execution_space(), rowmap_in, + entries_in, values_in, rowmap_out, entries_out, + values_out); } template @@ -651,8 +699,19 @@ void sort_and_merge_graph(const exec_space& exec, using lno_t = typename entries_t::value_type; using range_t = Kokkos::RangePolicy; using nc_rowmap_t = typename rowmap_t::non_const_type; + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_graph: rowmap_t is not accessible from the given " + "execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sort_and_merge_graph: entries_t is not accessible from the given " + "execution space"); static_assert(!std::is_const_v, "sort_and_merge_graph: entries_t must not be const-valued"); + lno_t 
numRows = rowmap_in.extent(0) ? rowmap_in.extent(0) - 1 : 0; if (numRows == 0) { rowmap_out = typename rowmap_t::non_const_type("SortedMerged rowmap", @@ -716,8 +775,8 @@ template void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, rowmap_t& rowmap_out, entries_t& entries_out) { - return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, entries_in, rowmap_out, - entries_out); + return sort_and_merge_graph(typename entries_t::execution_space(), rowmap_in, + entries_in, rowmap_out, entries_out); } template From 0ae0d31e1b61a44ce5f57bea85a851c5ef9e866f Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 6 Apr 2023 14:45:06 -0600 Subject: [PATCH 213/442] Formatting & remove unused typedefs --- sparse/unit_test/Test_Sparse_SortCrs.hpp | 98 ++++++++++++------------ 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 796b538c60..63c977ca9a 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -32,13 +32,13 @@ #include #include -namespace SortCrsTest -{ - enum : int { - Instance, //Passing in an instance, and deducing template args - ExplicitType, //Using default instance, but specifying type with template arg - ImplicitType //Using default instance, and deducing type based on view - }; +namespace SortCrsTest { +enum : int { + Instance, // Passing in an instance, and deducing template args + ExplicitType, // Using default instance, but specifying type with template + // arg + ImplicitType // Using default instance, and deducing type based on view +}; } template @@ -52,9 +52,6 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, using device_t = Kokkos::Device; using crsMat_t = KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename 
crsMat_t::values_type; // Create a random matrix on device // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this // wouldn't test anything @@ -99,47 +96,48 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, // call the actual sort routine being tested if (doValues) { if (doStructInterface) { - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: KokkosSparse::sort_crs_matrix(exec_space(), A); break; case SortCrsTest::ExplicitType: throw std::logic_error("Should not get here"); - case SortCrsTest::ImplicitType: - KokkosSparse::sort_crs_matrix(A); + case SortCrsTest::ImplicitType: KokkosSparse::sort_crs_matrix(A); } } else { - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: KokkosSparse::sort_crs_matrix(exec_space(), A.graph.row_map, - A.graph.entries, A.values); + A.graph.entries, A.values); break; case SortCrsTest::ExplicitType: KokkosSparse::sort_crs_matrix(A.graph.row_map, - A.graph.entries, A.values); + A.graph.entries, A.values); break; case SortCrsTest::ImplicitType: - KokkosSparse::sort_crs_matrix(A.graph.row_map, A.graph.entries, A.values); + KokkosSparse::sort_crs_matrix(A.graph.row_map, A.graph.entries, + A.values); } } } else { if (doStructInterface) { - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: KokkosSparse::sort_crs_graph(exec_space(), A.graph); break; case SortCrsTest::ExplicitType: throw std::logic_error("Should not get here"); - case SortCrsTest::ImplicitType: - KokkosSparse::sort_crs_graph(A.graph); + case SortCrsTest::ImplicitType: KokkosSparse::sort_crs_graph(A.graph); } } else { - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: - KokkosSparse::sort_crs_graph(exec_space(), A.graph.row_map, A.graph.entries); + KokkosSparse::sort_crs_graph(exec_space(), A.graph.row_map, + A.graph.entries); break; case SortCrsTest::ExplicitType: - 
KokkosSparse::sort_crs_graph(A.graph.row_map, A.graph.entries); + KokkosSparse::sort_crs_graph(A.graph.row_map, + A.graph.entries); break; case SortCrsTest::ImplicitType: KokkosSparse::sort_crs_graph(A.graph.row_map, A.graph.entries); @@ -350,10 +348,10 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, graph_t outputGraph; // Testing sort_and_merge_graph if (doStructInterface) { - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: outputGraph = - KokkosSparse::sort_and_merge_graph(exec_space(), input.graph); + KokkosSparse::sort_and_merge_graph(exec_space(), input.graph); break; case SortCrsTest::ExplicitType: throw std::logic_error("Should not get here"); @@ -363,11 +361,11 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, } else { rowmap_t devOutRowmap; entries_t devOutEntries; - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: KokkosSparse::sort_and_merge_graph(exec_space(), input.graph.row_map, - input.graph.entries, devOutRowmap, - devOutEntries); + input.graph.entries, devOutRowmap, + devOutEntries); break; case SortCrsTest::ExplicitType: KokkosSparse::sort_and_merge_graph( @@ -375,9 +373,9 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, devOutEntries); break; case SortCrsTest::ImplicitType: - KokkosSparse::sort_and_merge_graph( - input.graph.row_map, input.graph.entries, devOutRowmap, - devOutEntries); + KokkosSparse::sort_and_merge_graph(input.graph.row_map, + input.graph.entries, devOutRowmap, + devOutEntries); } outputGraph = graph_t(devOutEntries, devOutRowmap); } @@ -386,7 +384,7 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, } else { // Testing sort_and_merge_matrix if (doStructInterface) { - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: output = KokkosSparse::sort_and_merge_matrix(exec_space(), input); break; @@ -399,7 +397,7 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, 
rowmap_t devOutRowmap; entries_t devOutEntries; values_t devOutValues; - switch(howExecSpecified) { + switch (howExecSpecified) { case SortCrsTest::Instance: KokkosSparse::sort_and_merge_matrix( exec_space(), input.graph.row_map, input.graph.entries, @@ -447,9 +445,9 @@ void testSortAndMerge(bool justGraph, int howExecSpecified, TEST_F(TestCategory, common_sort_crsgraph) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - // If using the struct interface (StaticCrsGraph), cannot use ExplicitType because - // the exec space type is determined from the graph. - if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + // If using the struct interface (StaticCrsGraph), cannot use ExplicitType + // because the exec space type is determined from the graph. + if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; testSortCRS(10, 10, 20, false, doStructInterface, howExecSpecified); @@ -466,9 +464,9 @@ TEST_F(TestCategory, common_sort_crsmatrix) { for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { // howExecSpecified: Instance, ExplicitType, ImplicitType for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - // If using the struct interface (CrsMatrix), cannot use ExplicitType because - // the exec space type is determined from the matrix. - if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + // If using the struct interface (CrsMatrix), cannot use ExplicitType + // because the exec space type is determined from the matrix. 
+ if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; testSortCRS(10, 10, 20, true, doStructInterface, howExecSpecified); @@ -484,17 +482,21 @@ TEST_F(TestCategory, common_sort_crsmatrix) { TEST_F(TestCategory, common_sort_crs_longrows) { // Matrix/graph with one very long row // Just test this once with graph, and once with matrix - testSortCRS(1, 50000, 10000, false, false, SortCrsTest::ImplicitType); - testSortCRS(1, 50000, 10000, true, false, SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, false, false, + SortCrsTest::ImplicitType); + testSortCRS(1, 50000, 10000, true, false, + SortCrsTest::ImplicitType); } TEST_F(TestCategory, common_sort_merge_crsmatrix) { for (int testCase = 0; testCase < 5; testCase++) { - for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + for (int doStructInterface = 0; doStructInterface < 2; + doStructInterface++) { for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; - testSortAndMerge(false, howExecSpecified, doStructInterface, testCase); + testSortAndMerge(false, howExecSpecified, + doStructInterface, testCase); } } } @@ -502,11 +504,13 @@ TEST_F(TestCategory, common_sort_merge_crsmatrix) { TEST_F(TestCategory, common_sort_merge_crsgraph) { for (int testCase = 0; testCase < 5; testCase++) { - for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + for (int doStructInterface = 0; doStructInterface < 2; + doStructInterface++) { for (int howExecSpecified = 0; howExecSpecified < 3; howExecSpecified++) { - if(doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) + if (doStructInterface && howExecSpecified == SortCrsTest::ExplicitType) continue; - testSortAndMerge(true, howExecSpecified, doStructInterface, testCase); + testSortAndMerge(true, howExecSpecified, 
+ doStructInterface, testCase); } } } From dc6f763f3e4dcaaa62e8fdd3cb4e83d323b00183 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Mon, 10 Apr 2023 08:20:03 -0600 Subject: [PATCH 214/442] Remove the printf inside the team kernels. --- .../KokkosBatched_Spmv_TeamVector_Impl.hpp | 6 +- .../impl/KokkosBatched_Spmv_Team_Impl.hpp | 6 +- sparse/impl/KokkosSparse_spmv_team_impl.hpp | 14 +++-- sparse/impl/KokkosSparse_spmv_team_spec.hpp | 12 ++-- sparse/src/KokkosSparse_spmv_team.hpp | 56 +++++++++---------- 5 files changed, 46 insertions(+), 48 deletions(-) diff --git a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 3b6dbf9769..2b62be1e5a 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -388,11 +388,10 @@ struct TeamVectorSpmv { } #endif if (values.extent(0) == 1) { - KokkosBlas::Experimental::team_vector_spmv( + return KokkosSparse::Experimental::team_vector_spmv( member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); - return 0; } return TeamVectorSpmvInternal::template invoke< @@ -472,11 +471,10 @@ struct TeamVectorSpmv { } #endif if (values.extent(0) == 1) { - KokkosBlas::Experimental::team_vector_spmv( + return KokkosSparse::Experimental::team_vector_spmv( member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); - return 0; } return TeamVectorSpmvInternal::template invoke< diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index a508c14cce..c46ef7edc7 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -239,11 +239,10 @@ 
struct TeamSpmv { } #endif if (values.extent(0) == 1) { - KokkosBlas::Experimental::team_spmv( + return KokkosSparse::Experimental::team_spmv( member, alpha.data()[0], Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta.data()[0], Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); - return 0; } return TeamSpmvInternal::template invoke< @@ -323,11 +322,10 @@ struct TeamSpmv { } #endif if (values.extent(0) == 1) { - KokkosBlas::Experimental::team_spmv( + return KokkosSparse::Experimental::team_spmv( member, alpha, Kokkos::subview(values, 0, Kokkos::ALL), row_ptr, colIndices, Kokkos::subview(X, 0, Kokkos::ALL), beta, Kokkos::subview(Y, 0, Kokkos::ALL), dobeta); - return 0; } return TeamSpmvInternal::template invoke< diff --git a/sparse/impl/KokkosSparse_spmv_team_impl.hpp b/sparse/impl/KokkosSparse_spmv_team_impl.hpp index 1c6efd14f0..622dd4997c 100644 --- a/sparse/impl/KokkosSparse_spmv_team_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_team_impl.hpp @@ -22,13 +22,13 @@ #include #include -namespace KokkosBlas { +namespace KokkosSparse { namespace Impl { struct TeamSpmvInternal { template - KOKKOS_INLINE_FUNCTION static void invoke( + KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const OrdinalType numRows, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, @@ -42,7 +42,7 @@ struct TeamSpmvInternal { struct TeamVectorSpmvInternal { template - KOKKOS_INLINE_FUNCTION static void invoke( + KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const OrdinalType numRows, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, @@ -55,7 +55,7 @@ struct TeamVectorSpmvInternal { template -KOKKOS_INLINE_FUNCTION void TeamSpmvInternal::invoke( +KOKKOS_INLINE_FUNCTION int TeamSpmvInternal::invoke( const MemberType& member, const 
OrdinalType numRows, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, @@ -87,11 +87,12 @@ KOKKOS_INLINE_FUNCTION void TeamSpmvInternal::invoke( y[iRow * ys0] = beta * y[iRow * ys0] + sum; } }); + return 0; } template -KOKKOS_INLINE_FUNCTION void TeamVectorSpmvInternal::invoke( +KOKKOS_INLINE_FUNCTION int TeamVectorSpmvInternal::invoke( const MemberType& member, const OrdinalType numRows, const ScalarType alpha, const ValueType* KOKKOS_RESTRICT values, const OrdinalType valuess0, const OrdinalType* KOKKOS_RESTRICT row_ptr, const OrdinalType row_ptrs0, @@ -124,9 +125,10 @@ KOKKOS_INLINE_FUNCTION void TeamVectorSpmvInternal::invoke( y[iRow * ys0] = beta * y[iRow * ys0] + sum; } }); + return 0; } } // namespace Impl -} // namespace KokkosBlas +} // namespace KokkosSparse #endif diff --git a/sparse/impl/KokkosSparse_spmv_team_spec.hpp b/sparse/impl/KokkosSparse_spmv_team_spec.hpp index a148833a4a..156123b113 100644 --- a/sparse/impl/KokkosSparse_spmv_team_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_team_spec.hpp @@ -23,18 +23,18 @@ #include #include -namespace KokkosBlas { +namespace KokkosSparse { template struct TeamSpmv { template - KOKKOS_INLINE_FUNCTION static void invoke( + KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const ScalarType alpha, const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& x, const ScalarType beta, const yViewType& y) { - Impl::TeamSpmvInternal::invoke< + return Impl::TeamSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( member, x.extent(0), alpha, values.data(), values.stride_0(), @@ -48,12 +48,12 @@ template struct TeamVectorSpmv { template - KOKKOS_INLINE_FUNCTION static void invoke( + KOKKOS_INLINE_FUNCTION static int invoke( const MemberType& member, const 
ScalarType alpha, const ValuesViewType& values, const IntView& row_ptr, const IntView& colIndices, const xViewType& x, const ScalarType beta, const yViewType& y) { - Impl::TeamVectorSpmvInternal::invoke< + return Impl::TeamVectorSpmvInternal::invoke< MemberType, ScalarType, typename ValuesViewType::non_const_value_type, typename IntView::non_const_value_type, dobeta>( member, x.extent(0), alpha, values.data(), values.stride_0(), @@ -63,6 +63,6 @@ struct TeamVectorSpmv { } }; -} // namespace KokkosBlas +} // namespace KokkosSparse #endif diff --git a/sparse/src/KokkosSparse_spmv_team.hpp b/sparse/src/KokkosSparse_spmv_team.hpp index e62f2807ad..fb55a65420 100644 --- a/sparse/src/KokkosSparse_spmv_team.hpp +++ b/sparse/src/KokkosSparse_spmv_team.hpp @@ -24,14 +24,14 @@ #include // requires C++11, but so does Kokkos #include -namespace KokkosBlas { +namespace KokkosSparse { namespace Experimental { /// \brief Sparse matrix-vector multiply: y = beta*y + alpha*A*x. /// template -void KOKKOS_INLINE_FUNCTION team_spmv( +int KOKKOS_INLINE_FUNCTION team_spmv( const TeamType &team, const ScalarType &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, const ScalarType &beta, const yViewType &y, const int dobeta) { @@ -55,28 +55,28 @@ void KOKKOS_INLINE_FUNCTION team_spmv( // Check compatibility of dimensions at run time. 
if (values.extent(0) != colIndices.extent(0)) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions of values and colIndices do not match: " - << "values: " << values.extent(0) - << ", colIndices: " << colIndices.extent(0); - KokkosKernels::Impl::throw_runtime_exception(os.str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); + return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions of x, y, and row_ptr do not match: " - << "x: " << x.extent(0) << ", y: " << y.extent(0) - << ", row_ptr: " << row_ptr.extent(0); - KokkosKernels::Impl::throw_runtime_exception(os.str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); + return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL if (dobeta == 1) - KokkosBlas::TeamSpmv::template invoke< + return KokkosSparse::TeamSpmv::template invoke< ScalarType, ValuesViewType, IntView, xViewType, yViewType, 1>( team, alpha, values, row_ptr, colIndices, x, beta, y); else - KokkosBlas::TeamSpmv::template invoke< + return KokkosSparse::TeamSpmv::template invoke< ScalarType, ValuesViewType, IntView, xViewType, yViewType, 0>( team, alpha, values, row_ptr, colIndices, x, beta, y); } @@ -85,7 +85,7 @@ void KOKKOS_INLINE_FUNCTION team_spmv( /// template -void KOKKOS_INLINE_FUNCTION team_vector_spmv( +int KOKKOS_INLINE_FUNCTION team_vector_spmv( const TeamType &team, const ScalarType &alpha, const ValuesViewType &values, const IntView &row_ptr, const IntView &colIndices, const xViewType &x, const ScalarType &beta, const yViewType &y, const int dobeta) { @@ -109,33 +109,33 @@ void KOKKOS_INLINE_FUNCTION team_vector_spmv( // Check compatibility of dimensions at 
run time. if (values.extent(0) != colIndices.extent(0)) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions of values and colIndices do not match: " - << "values: " << values.extent(0) - << ", colIndices: " << colIndices.extent(0); - KokkosKernels::Impl::throw_runtime_exception(os.str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of values and colIndices do not match: " + "values: %d, colIndices: %d", + (int)values.extent(0), (int)colIndices.extent(0)); + return 1; } if (x.extent(0) != y.extent(0) || (x.extent(0) + 1) != row_ptr.extent(0)) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions of x, y, and row_ptr do not match: " - << "x: " << x.extent(0) << ", y: " << y.extent(0) - << ", row_ptr: " << row_ptr.extent(0); - KokkosKernels::Impl::throw_runtime_exception(os.str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosSparse::spmv: Dimensions of x, y, and row_ptr do not match: " + "x: %d, y: %d, row_ptr: %d", + (int)x.extent(0), (int)y.extent(0), (int)row_ptr.extent(0)); + return 1; } #endif // KOKKOSKERNELS_DEBUG_LEVEL if (dobeta == 1) - KokkosBlas::TeamVectorSpmv::template invoke< + return KokkosSparse::TeamVectorSpmv::template invoke< ScalarType, ValuesViewType, IntView, xViewType, yViewType, 1>( team, alpha, values, row_ptr, colIndices, x, beta, y); else - KokkosBlas::TeamVectorSpmv::template invoke< + return KokkosSparse::TeamVectorSpmv::template invoke< ScalarType, ValuesViewType, IntView, xViewType, yViewType, 0>( team, alpha, values, row_ptr, colIndices, x, beta, y); } } // namespace Experimental -} // namespace KokkosBlas +} // namespace KokkosSparse #endif // KOKKOS_BLAS2_MV_HPP_ From 6dc2a6a533e6f8fce4732eff99620739e5a9d912 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 10 Apr 2023 16:54:58 -0600 Subject: [PATCH 215/442] Re-enable and clean up triangle counting perf test (#1752) * Re-enable triangle counting perf test (#1684, #1745) But disable non-working modes: - gpu backends - AI, IA, IAUNION 
algorithms * Remove unused params --- graph/src/KokkosGraph_Triangle.hpp | 17 +- .../KokkosKernels_perf_test_utilities.hpp | 2 + perf_test/graph/CMakeLists.txt | 10 +- .../graph/KokkosGraph_multimem_triangle.hpp | 205 ----------- perf_test/graph/KokkosGraph_run_triangle.hpp | 290 --------------- perf_test/graph/KokkosGraph_triangle.cpp | 340 +++++++++++------- .../KokkosSparse_spgemm_impl_triangle.hpp | 5 +- sparse/src/KokkosSparse_IOUtils.hpp | 29 +- 8 files changed, 233 insertions(+), 665 deletions(-) delete mode 100644 perf_test/graph/KokkosGraph_multimem_triangle.hpp delete mode 100644 perf_test/graph/KokkosGraph_run_triangle.hpp diff --git a/graph/src/KokkosGraph_Triangle.hpp b/graph/src/KokkosGraph_Triangle.hpp index 5c7360a88a..0a878891ce 100644 --- a/graph/src/KokkosGraph_Triangle.hpp +++ b/graph/src/KokkosGraph_Triangle.hpp @@ -232,11 +232,8 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, // if 2, we do an interleaved sort. } { - if (sh->get_sort_option() != -1) { - sort_decreasing_order = sh->get_sort_option(); - } - KokkosKernels::Impl::kk_sort_by_row_size( + KokkosSparse::Impl::kk_sort_by_row_size( m, row_mapA.data(), new_indices.data(), sort_decreasing_order, ExecutionSpace().concurrency()); } @@ -264,7 +261,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); - KokkosKernels::Impl::kk_get_lower_triangle< + KokkosSparse::Impl::kk_get_lower_triangle< alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( @@ -292,7 +289,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); - KokkosKernels::Impl::kk_get_lower_triangle< + KokkosSparse::Impl::kk_get_lower_triangle< 
alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( @@ -334,7 +331,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, nnz_lno_persistent_work_view_t new_indices = sh->get_lower_triangular_permutation(); - KokkosKernels::Impl::kk_get_lower_triangle< + KokkosSparse::Impl::kk_get_lower_triangle< alno_row_view_t_, alno_nnz_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, alno_nnz_view_t_, nnz_lno_persistent_work_view_t, ExecutionSpace>( @@ -342,7 +339,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, lower_triangular_matrix_entries, null_values, new_indices, handle->is_dynamic_scheduling()); } - KokkosKernels::Impl:: + KokkosSparse::Impl:: kk_create_incidence_tranpose_matrix_from_lower_triangle< row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, @@ -357,7 +354,7 @@ void triangle_generic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, case SPGEMM_KK_TRIANGLE_AI: { // these are the algorithms that requires the incidence matrix. 
- KokkosKernels::Impl::kk_create_incidence_matrix_from_original_matrix< + KokkosSparse::Impl::kk_create_incidence_matrix_from_original_matrix< alno_row_view_t_, alno_nnz_view_t_, row_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, nnz_lno_persistent_work_view_t, ExecutionSpace>(m, row_mapA, entriesA, incidence_rowmap, diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index d7c47150df..0df96f4494 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -20,6 +20,8 @@ #ifndef KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP #define KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP +#include "KokkosKernels_TestUtils.hpp" // for string_compare_no_case + // Namepsace that defines common utilities // for performance tests namespace perf_test { diff --git a/perf_test/graph/CMakeLists.txt b/perf_test/graph/CMakeLists.txt index 134a7acc2e..26eab42ed4 100644 --- a/perf_test/graph/CMakeLists.txt +++ b/perf_test/graph/CMakeLists.txt @@ -16,10 +16,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosGraph_mis_d2.cpp ) - -#Below will probably fail on GPUs. -#KOKKOSKERNELS_ADD_EXECUTABLE( -# graph_triangle -# SOURCES KokkosGraph_triangle.cpp -# ) +KOKKOSKERNELS_ADD_EXECUTABLE( + graph_triangle + SOURCES KokkosGraph_triangle.cpp + ) diff --git a/perf_test/graph/KokkosGraph_multimem_triangle.hpp b/perf_test/graph/KokkosGraph_multimem_triangle.hpp deleted file mode 100644 index f7875fed0e..0000000000 --- a/perf_test/graph/KokkosGraph_multimem_triangle.hpp +++ /dev/null @@ -1,205 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosGraph_run_triangle.hpp" -#include "KokkosSparse_CrsMatrix.hpp" - -namespace KokkosKernels { - -namespace Experiment { - -template -void run_multi_mem_triangle(Parameters params) { - typedef exec_space myExecSpace; - typedef Kokkos::Device myFastDevice; - typedef Kokkos::Device mySlowExecSpace; - - typedef typename KokkosSparse::CrsMatrix - fast_crstmat_t; - typedef typename fast_crstmat_t::StaticCrsGraphType fast_graph_t; - - typedef typename KokkosSparse::CrsMatrix - slow_crstmat_t; - typedef typename slow_crstmat_t::StaticCrsGraphType slow_graph_t; - - char *a_mat_file = params.a_mtx_bin_file; - // char *b_mat_file = params.b_mtx_bin_file; - // char *c_mat_file = params.c_mtx_bin_file; - - slow_graph_t a_slow_crsgraph, /*b_slow_crsgraph,*/ c_slow_crsgraph; - fast_graph_t a_fast_crsgraph, /*b_fast_crsgraph,*/ c_fast_crsgraph; - - // read a and b matrices and store them on slow or fast memory. 
- if (params.a_mem_space == 1) { - fast_crstmat_t a_fast_crsmat; - a_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); - a_fast_crsgraph = a_fast_crsmat.graph; - a_fast_crsgraph.num_cols = a_fast_crsmat.numCols(); - - } else { - slow_crstmat_t a_slow_crsmat; - a_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); - a_slow_crsgraph = a_slow_crsmat.graph; - a_slow_crsgraph.num_cols = a_slow_crsmat.numCols(); - } - - if (params.a_mem_space == 1) { - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, fast_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_fast_crsgraph,*/ params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_graph_t, slow_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsgraph, - /*b_slow_crsgraph,*/ params); - } - } - } - } else { - // A is in slow memory - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } - - } else { - // C is in slow memory. 
- if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, fast_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_fast_crsgraph,*/ params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, fast_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /* c_fast_crsgraph = */ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, fast_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, slow_graph_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } else { - /*c_slow_crsgraph =*/ - KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_graph_t, slow_graph_t, slow_graph_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsgraph, - /*b_slow_crsgraph,*/ params); - } - } - } - } -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp deleted file mode 100644 index 2bdea59bea..0000000000 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ /dev/null @@ -1,290 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosGraph_Triangle.hpp" -#include "KokkosKernels_TestParameters.hpp" - -#define TRANPOSEFIRST false -#define TRANPOSESECOND false - -namespace KokkosKernels { - -namespace Experiment { -template -bool is_same_graph(crsGraph_t output_mat1, crsGraph_t output_mat2) { - // typedef typename crsGraph_t::StaticCrsGraphType crsGraph_t; - typedef typename crsGraph_t::row_map_type::non_const_type lno_view_t; - typedef typename crsGraph_t::entries_type::non_const_type lno_nnz_view_t; - // typedef typename crsGraph_t::values_type::non_const_type scalar_view_t; - - size_t nrows1 = output_mat1.row_map.extent(0); - size_t nentries1 = output_mat1.entries.extent(0); - - size_t nrows2 = output_mat2.row_map.extent(0); - size_t nentries2 = output_mat2.entries.extent(0); - // size_t nvals2 = output_mat2.values.extent(0); - - KokkosKernels::sort_crs_graph( - output_mat1.graph.row_map, output_mat1.entries); - - if (nrows1 != nrows2) return false; - if (nentries1 != nentries2) return false; - - KokkosKernels::sort_crs_graph( - output_mat2.graph.row_map, output_mat2.entries); - - bool is_identical = true; - is_identical = KokkosKernels::Impl::kk_is_identical_view< - typename crsGraph_t::row_map_type, typename crsGraph_t::row_map_type, - typename lno_view_t::value_type, typename device::execution_space>( - output_mat1.row_map, output_mat2.row_map, 0); - if (!is_identical) return false; - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, - typename 
device::execution_space>(output_mat1.entries, - output_mat2.entries, 0); - if (!is_identical) return false; - - if (!is_identical) { - std::cout << "Incorrect values" << std::endl; - } - return true; -} - -template -struct Flush { - typedef double value_type; - - // flush a large host buffer - Kokkos::View _buf; - Flush(int flush_option) : _buf("Flush::buf", BufSize) { - Kokkos::deep_copy(_buf, 1); - Kokkos::fence(); - if (flush_option == 2) { - for (size_t i = 0; i < BufSize; ++i) { - _buf(i) = rand(); - } - } - } - - KOKKOS_INLINE_FUNCTION - void init(value_type &update) { update = 0; } - - KOKKOS_INLINE_FUNCTION - void join(value_type &update, const value_type &input) { update += input; } - - KOKKOS_INLINE_FUNCTION - void operator()(const int i, value_type &update) const { update += _buf[i]; } - - void run() { - double sum = 0; - Kokkos::parallel_reduce( - "KokkosGraph::PerfTest::Flush", - Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, - sum); - SpaceType().fence(); - std::cout << "Flush sum:" << sum << std::endl; - FILE *fp = fopen("/dev/null", "w"); - fprintf(fp, "%f\n", sum); - fclose(fp); - - /* - #pragma omp parallel - { - const size_t cache_line = 64; - const char *cp = (const char *) _buf.data(); - size_t i = 0; - - - for (i = 0; i < BufSize; i += cache_line) { - asm volatile("clflush (%0)\n\t" - : - : "r"(&cp[i]) - : "memory"); - } - - asm volatile("sfence\n\t" - : - : - : "memory"); - } - */ - } -}; - -template -void run_experiment(crsGraph_t crsGraph, Parameters params) { - // using namespace KokkosSparse; - using namespace KokkosSparse; - using namespace KokkosGraph::Experimental; - // using namespace KokkosSparse::Experimental; - - int algorithm = params.algorithm; - int repeat = params.repeat; - int chunk_size = params.chunk_size; - - int shmemsize = params.shmemsize; - int team_size = params.team_size; - int use_dynamic_scheduling = params.use_dynamic_scheduling; - int verbose = params.verbose; - - int accumulator = params.accumulator; - 
// char spgemm_step = params.spgemm_step; - int vector_size = params.vector_size; - - // spgemm_step++; - - typedef typename crsGraph_t3::row_map_type::non_const_type lno_view_t; - typedef typename crsGraph_t3::entries_type::non_const_type lno_nnz_view_t; - - Kokkos::View row_mapC; - lno_nnz_view_t entriesC; - lno_nnz_view_t valuesC; - - typedef typename lno_nnz_view_t::value_type lno_t; - typedef typename lno_view_t::value_type size_type; - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, lno_t, ExecSpace, TempMemSpace, PersistentMemSpace> - KernelHandle; - - KernelHandle kh; - kh.set_team_work_size(chunk_size); - kh.set_shmem_size(shmemsize); - kh.set_suggested_team_size(team_size); - kh.set_suggested_vector_size(vector_size); - - if (use_dynamic_scheduling) { - kh.set_dynamic_scheduling(true); - } - if (verbose) { - kh.set_verbose(true); - } - const lno_t m = crsGraph.numRows(); - ; - - for (int i = 0; i < repeat; ++i) { - size_type rowmap_size = crsGraph.entries.extent(0); - switch (algorithm) { - case 16: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_AI); - rowmap_size = m; - break; - case 17: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); - std::cout << "IA" << std::endl; - break; - case 18: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA_UNION); break; - case 19: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LL); - rowmap_size = m; - break; - case 20: - kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LU); - rowmap_size = m; - break; - default: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); break; - } - - kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); - - kh.get_spgemm_handle()->set_sort_lower_triangular(params.right_sort); - kh.get_spgemm_handle()->set_create_lower_triangular( - params.right_lower_triangle); - kh.get_spgemm_handle()->set_compression(params.apply_compression); - kh.get_spgemm_handle()->set_sort_option(params.sort_option); - 
kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); - - switch (accumulator) { - case 0: - default: - kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DEFAULT); - break; - case 1: - kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DENSE); - break; - case 2: - kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_SPARSE); - break; - } - - constexpr size_t LLC_CAPACITY = 256 * 4 * 1024 * 1024; - if (params.cache_flush) { - std::cout << "Flushing cache with option:" << params.cache_flush - << std::endl; - Flush flush(params.cache_flush); - flush.run(); - } - if (i == 0) { - kh.get_spgemm_handle()->set_read_write_cost_calc( - params.calculate_read_write_cost); - } - - Kokkos::Timer timer1; - - row_mapC = - Kokkos::View("non_const_lnow_row", rowmap_size); - entriesC = lno_nnz_view_t(""); - valuesC = lno_nnz_view_t(""); - - double symbolic_time = 0; - if (params.triangle_options == 0) { - if (params.apply_compression) { - triangle_generic( - &kh, m, crsGraph.row_map, crsGraph.entries, - KOKKOS_LAMBDA(const lno_t &row, const lno_t &col_set_index, - const lno_t &col_set, const lno_t &thread_id) { - // row_mapC(row) += KokkosKernels::Impl::set_bit_count(col_set); - row_mapC(row) += KokkosKernels::Impl::pop_count(col_set); - }); - } else { - triangle_generic( - &kh, m, crsGraph.row_map, crsGraph.entries, - KOKKOS_LAMBDA(const lno_t &row, const lno_t &col_set_index, - const lno_t &col_set, const lno_t &thread_id) { - row_mapC(row) += 1; - // row_mapC(row) += KokkosKernels::Impl::set_bit_count(col_set); row_mapC(row) += - // KokkosKernels::Impl::pop_count(col_set); - }); - } - - size_t num_triangles = 0; - KokkosKernels::Impl::kk_reduce_view, - ExecSpace>(rowmap_size, row_mapC, - num_triangles); - ExecSpace().fence(); - - symbolic_time = timer1.seconds(); - std::cout << "num_triangles:" << num_triangles << std::endl; - } - kh.destroy_spgemm_handle(); - std::cout << "mm_time:" << symbolic_time << std::endl; - // only do this once - // 
kh.get_spgemm_handle()->set_read_write_cost_calc(false); - } -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 153382f111..a250fe15a7 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -14,15 +14,56 @@ // //@HEADER #include - -#include "KokkosGraph_multimem_triangle.hpp" #include "KokkosKernels_IOUtils.hpp" +#include "KokkosGraph_Triangle.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" //for read_kokkos_crst_graph +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestParameters.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +template +struct Flush { + typedef double value_type; + + // flush a large host buffer + Kokkos::View _buf; + Flush(int flush_option) : _buf("Flush::buf", BufSize) { + Kokkos::deep_copy(_buf, 1); + Kokkos::fence(); + if (flush_option == 2) { + for (size_t i = 0; i < BufSize; ++i) { + _buf(i) = rand(); + } + } + } + + KOKKOS_INLINE_FUNCTION + void init(value_type &update) { update = 0; } + + KOKKOS_INLINE_FUNCTION + void join(value_type &update, const value_type &input) { update += input; } + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, value_type &update) const { update += _buf[i]; } + + void run() { + double sum = 0; + Kokkos::parallel_reduce( + "KokkosGraph::PerfTest::Flush", + Kokkos::RangePolicy(0, BufSize / sizeof(double)), *this, + sum); + SpaceType().fence(); + std::cout << "Flush sum:" << sum << std::endl; + FILE *fp = fopen("/dev/null", "w"); + fprintf(fp, "%f\n", sum); + fclose(fp); + } +}; void print_options() { std::cerr << "Options\n" << std::endl; - std::cerr << "Choose BackEnd : --openmp [numthreads] | " - "--cuda | --hip" - << std::endl; + std::cerr << perf_test::list_common_options(); std::cerr << "Input Matrix : --amtx [path_to_input_matrix]" << std::endl; @@ -32,13 +73,14 @@ void 
print_options() { std::cerr << "\t\t.bin: it will read binary crs matrix format." << std::endl; std::cerr << "\t\t.crs: it will read text crs matrix format." << std::endl; std::cerr << "--algorithm :" << std::endl; - std::cerr << "\tTRIANGLEAI: for Adj x Incidence" << std::endl; - std::cerr << "\tTRIANGLEIA: for Incidence x Adj -- implementing set " - "intersection (2D) -- 3rd fastest" - << std::endl; - std::cerr - << "\tTRIANGLEIAUNION: for Incidence x Adj -- implementing set union " - << std::endl; + // BMK 3-28-23: these algorithms do not give correct triangle counts + // std::cerr << "\tTRIANGLEAI: for Adj x Incidence" << std::endl; + // std::cerr << "\tTRIANGLEIA: for Incidence x Adj -- implementing set " + // "intersection (2D) -- 3rd fastest" + // << std::endl; + // std::cerr + // << "\tTRIANGLEIAUNION: for Incidence x Adj -- implementing set union " + // << std::endl; std::cerr << "\tTRIANGLELL: Lower x Lower -- usually fastest " << std::endl; std::cerr << "\tTRIANGLELU: Lower x Upper -- usually 2nd fastest " << std::endl; @@ -87,24 +129,17 @@ void print_options() { std::cerr << "Suggested use of LU: executable --amtx path_to_file.bin " "--algorithm TRIANGLELU --repeat 6 --verbose --chunksize [4|16]" << std::endl; - std::cerr - << "Suggested use of AI: executable --amtx path_to_file.bin --algorithm " - "TRIANGLEIA --repeat 6 --verbose --chunksize [4|16] rlt" - << std::endl; + // std::cerr + // << "Suggested use of AI: executable --amtx path_to_file.bin --algorithm + // " + // "TRIANGLEIA --repeat 6 --verbose --chunksize [4|16] rlt" + // << std::endl; } int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, char **argv) { for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - 
params.use_cuda = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { + if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { params.repeat = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--triangle_operation")) { @@ -117,44 +152,6 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, params.vector_size = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--compression")) { params.apply_compression = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--sort_option")) { - params.sort_option = atoi(argv[++i]); - } else if (0 == Test::string_compare_no_case(argv[i], "--memspaces")) { - int memspaces = atoi(argv[++i]); - int memspaceinfo = memspaces; - std::cout << "memspaceinfo:" << memspaceinfo << std::endl; - if (memspaceinfo & 1) { - params.a_mem_space = 1; - std::cout << "Using HBM for A" << std::endl; - } else { - params.a_mem_space = 0; - std::cout << "Using DDR4 for A" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.b_mem_space = 1; - std::cout << "Using HBM for B" << std::endl; - } else { - params.b_mem_space = 0; - std::cout << "Using DDR4 for B" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.c_mem_space = 1; - std::cout << "Using HBM for C" << std::endl; - } else { - params.c_mem_space = 0; - std::cout << "Using DDR4 for C" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.work_mem_space = 1; - std::cout << "Using HBM for work memory space" << std::endl; - } else { - params.work_mem_space = 0; - std::cout << "Using DDR4 for work memory space" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; } else if (0 == Test::string_compare_no_case(argv[i], "--flop")) { params.calculate_read_write_cost = 1; } else if (0 == 
Test::string_compare_no_case(argv[i], "--CIF")) { @@ -178,16 +175,7 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, params.check_output = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { params.a_mtx_bin_file = argv[++i]; - } - /* - else if ( 0 == Test::string_compare_no_case( argv[i] , "cmtx" ) ) { - params.c_mtx_bin_file = argv[++i]; - } - else if ( 0 == Test::string_compare_no_case( argv[i] , "bmtx" ) ) { - params.b_mtx_bin_file = argv[++i]; - } - */ - else if (0 == Test::string_compare_no_case(argv[i], "--dynamic")) { + } else if (0 == Test::string_compare_no_case(argv[i], "--dynamic")) { params.use_dynamic_scheduling = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--cache_flush")) { params.cache_flush = atoi(argv[++i]); @@ -221,11 +209,20 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, ++i; if (0 == Test::string_compare_no_case(argv[i], "TRIANGLEAI")) { params.algorithm = 16; + std::cerr << "\nAlgorithm TRIANGLEAI is disabled (produces incorrect " + "triangle count)\n"; + return 1; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLEIA")) { params.algorithm = 17; + std::cerr << "\nAlgorithm TRIANGLEIA is disabled (produces incorrect " + "triangle count)\n"; + return 1; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLEIAUNION")) { params.algorithm = 18; + std::cerr << "\nAlgorithm TRIANGLEIAUNION is disabled (produces " + "incorrect triangle count)\n"; + return 1; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLELL")) { params.algorithm = 19; } else if (0 == Test::string_compare_no_case(argv[i], "TRIANGLELU")) { @@ -246,75 +243,162 @@ int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, return 0; } -int main(int argc, char **argv) { - typedef unsigned size_type; - typedef int idx; +template +void run_experiment(int argc, char **argv, perf_test::CommonInputParams) { + using namespace KokkosSparse; + using mem_space = typename 
exec_space::memory_space; + using device_t = Kokkos::Device; + using lno_t = default_lno_t; + using size_type = default_size_type; + using graph_t = + Kokkos::StaticCrsGraph; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, lno_t, exec_space, mem_space, mem_space>; + + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + std::cerr + << "** Triangle counting is currently not supported on GPU backends.\n"; + return; + } KokkosKernels::Experiment::Parameters params; if (parse_inputs(params, argc, argv)) { - return 1; + return; } - if (params.a_mtx_bin_file == NULL) { - std::cerr << "Provide a matrix file" << std::endl; + if (params.a_mtx_bin_file == "") { + std::cerr << "Provide a graph file" << std::endl; print_options(); - return 0; + return; } - std::cout << "Sizeof(idx):" << sizeof(idx) + std::cout << "Sizeof(idx):" << sizeof(lno_t) << " sizeof(size_type):" << sizeof(size_type) << std::endl; - const int num_threads = - params.use_openmp; // Assumption is that use_openmp variable is provided - // as number of threads - const int device_id = 0; - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - -#if defined(KOKKOS_ENABLE_OPENMP) - - if (params.use_openmp) { - Kokkos::OpenMP().print_configuration(std::cout); -#ifdef KOKKOSKERNELS_MULTI_MEM - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, - Kokkos::HostSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, - Kokkos::OpenMP::memory_space>(params); -#endif - } + // read graph + graph_t crsGraph = KokkosSparse::Impl::read_kokkos_crst_graph( + params.a_mtx_bin_file.c_str()); -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if (params.use_cuda) { - Kokkos::Cuda().print_configuration(std::cout); -#ifdef KOKKOSKERNELS_MULTI_MEM - 
KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::CudaHostPinnedSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::Cuda::memory_space>(params); -#endif - } + int algorithm = params.algorithm; + int repeat = params.repeat; + int chunk_size = params.chunk_size; + + int shmemsize = params.shmemsize; + int team_size = params.team_size; + int use_dynamic_scheduling = params.use_dynamic_scheduling; + int verbose = params.verbose; -#endif + int accumulator = params.accumulator; + int vector_size = params.vector_size; -#if defined(KOKKOS_ENABLE_HIP) - if (params.use_hip) { - Kokkos::Experimental::HIP().print_configuration(std::cout); - KokkosKernels::Experiment::run_multi_mem_triangle< - size_type, idx, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + Kokkos::View row_mapC; + + KernelHandle kh; + kh.set_team_work_size(chunk_size); + kh.set_shmem_size(shmemsize); + kh.set_suggested_team_size(team_size); + kh.set_suggested_vector_size(vector_size); + + if (use_dynamic_scheduling) { + kh.set_dynamic_scheduling(true); } -#endif + if (verbose) { + kh.set_verbose(true); + } + const lno_t m = crsGraph.numRows(); - Kokkos::finalize(); + for (int i = 0; i < repeat; ++i) { + size_type rowmap_size = crsGraph.entries.extent(0); + switch (algorithm) { + case 16: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_AI); + rowmap_size = m; + break; + case 17: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); + std::cout << "IA" << std::endl; + break; + case 18: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA_UNION); break; + case 19: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LL); + rowmap_size = m; + break; + case 20: + kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_LU); + rowmap_size = m; + break; + default: kh.create_spgemm_handle(SPGEMM_KK_TRIANGLE_IA); break; + } - return 0; + 
kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); + + kh.get_spgemm_handle()->set_sort_lower_triangular(params.right_sort); + kh.get_spgemm_handle()->set_create_lower_triangular( + params.right_lower_triangle); + kh.get_spgemm_handle()->set_compression(params.apply_compression); + kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); + + switch (accumulator) { + case 0: + default: + kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DEFAULT); + break; + case 1: + kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_DENSE); + break; + case 2: + kh.get_spgemm_handle()->set_accumulator_type(SPGEMM_ACC_SPARSE); + break; + } + + constexpr size_t LLC_CAPACITY = 128 * 1024 * 1024; + if (params.cache_flush) { + std::cout << "Flushing cache with option:" << params.cache_flush + << std::endl; + Flush flush(params.cache_flush); + flush.run(); + } + if (i == 0) { + kh.get_spgemm_handle()->set_read_write_cost_calc( + params.calculate_read_write_cost); + } + + Kokkos::Timer timer1; + + row_mapC = + Kokkos::View("non_const_lnow_row", rowmap_size); + + double symbolic_time = 0; + if (params.triangle_options == 0) { + if (params.apply_compression) { + KokkosGraph::Experimental::triangle_generic( + &kh, m, crsGraph.row_map, crsGraph.entries, + KOKKOS_LAMBDA(const lno_t &row, const lno_t & /* col_set_index */, + const lno_t &col_set, const lno_t & /* thread_id */) { + row_mapC(row) += KokkosKernels::Impl::pop_count(col_set); + }); + } else { + KokkosGraph::Experimental::triangle_generic( + &kh, m, crsGraph.row_map, crsGraph.entries, + KOKKOS_LAMBDA(const lno_t &row, const lno_t & /*col_set_index*/, + const lno_t & /*col_set*/, + const lno_t & /*thread_id*/) { row_mapC(row)++; }); + } + + size_t num_triangles = 0; + KokkosKernels::Impl::kk_reduce_view, + exec_space>(rowmap_size, row_mapC, + num_triangles); + symbolic_time = timer1.seconds(); + std::cout << "num_triangles:" << num_triangles << std::endl; + } + kh.destroy_spgemm_handle(); + 
std::cout << "mm_time:" << symbolic_time << std::endl; + } } + +#define KOKKOSKERNELS_PERF_TEST_NAME run_experiment +#include "KokkosKernels_perf_test_instantiation.hpp" +int main(int argc, char **argv) { + return main_instantiation(argc, argv); +} // main diff --git a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index ef03e0b786..dd1a7cd9b5 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1086,8 +1086,6 @@ struct KokkosSPGEMMhandle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = - this->handle->get_spgemm_handle()->get_max_result_nnz(); + this->handle->get_spgemm_handle()->get_max_result_nnz( + Kokkos::View(rowmapC, m + 1)); typedef KokkosKernels::Impl::UniformMemoryPool pool_memory_space; diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index c5f024f4f6..4704a8724c 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -1179,33 +1179,16 @@ crsGraph_t read_kokkos_crst_graph(const char *filename_) { row_map_view_t rowmap_view("rowmap_view", nv + 1); cols_view_t columns_view("colsmap_view", nnzA); - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - - for (lno_t i = 0; i <= nv; ++i) { - hr(i) = xadj[i]; - } + typename row_map_view_t::HostMirror hr(xadj, nv + 1); + typename cols_view_t::HostMirror hc(adj, nnzA); + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); - for (size_type i = 0; i < nnzA; ++i) { - hc(i) = adj[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - } - - lno_t ncols = 0; - KokkosKernels::Impl::kk_view_reduce_max( - nnzA, columns_view, ncols); - ncols += 1; - - crsGraph_t static_graph(columns_view, rowmap_view, ncols); delete[] xadj; 
delete[] adj; delete[] values; + + crsGraph_t static_graph(columns_view, rowmap_view); return static_graph; } From 4ca54ed157d11de521cacc3d04cdf34142dbef46 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Tue, 11 Apr 2023 10:04:53 -0400 Subject: [PATCH 216/442] Use KOKKOS_IMPL_DO_NOT_USE_PRINTF in Test_Common_UpperBound.hpp --- common/unit_test/Test_Common_UpperBound.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index b99ffbb0a6..9a440b376a 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -43,8 +43,9 @@ struct ThreadUpperBoundFunctor { if (0 == i) { hv_size_type idx = KokkosKernels::upper_bound_thread(haystack_, needle_); if (idx != expected_) { - printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, - int(i), int(expected_), int(idx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(i), + int(expected_), int(idx)); ++lerrCount; } } @@ -99,8 +100,9 @@ struct TeamUpperBoundFunctor { hv_size_type idx = KokkosKernels::upper_bound_team(handle, haystack_, needle_); if (idx != expected_) { - printf("%s:%d thread %d expected %d got %d\n", __FILE__, __LINE__, - int(handle.team_rank()), int(expected_), int(idx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%s:%d thread %d expected %d got %d\n", + __FILE__, __LINE__, int(handle.team_rank()), + int(expected_), int(idx)); ++lerrCount; } } @@ -242,4 +244,4 @@ EXECUTE_TEST(float, TestExecSpace) EXECUTE_TEST(double, TestExecSpace) #endif -#undef EXECUTE_TEST \ No newline at end of file +#undef EXECUTE_TEST From 31ef8f6bfbc0a39d9059346721e050123b12fed3 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 12 Apr 2023 01:33:31 -0700 Subject: [PATCH 217/442] Intial stream interface --- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 137 +++++++++++++++ .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 84 
+++++++-- sparse/src/KokkosSparse_sptrsv.hpp | 161 ++++++++++++++++++ ...kkosSparse_sptrsv_solve_tpl_spec_avail.hpp | 4 +- 4 files changed, 367 insertions(+), 19 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 4cff646325..3ca003f713 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -4019,6 +4019,143 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, } // end tri_solve_chain + +// -------------------------------- +// Stream interfaces +// -------------------------------- + +template +void lower_tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { + using size_type = typename TriSolveHandle::size_type; + using NGBLType = typename TriSolveHandle::nnz_lno_view_t; + using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; + for (int i = 0; i < nstreams; i++) { + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(node_count_v[i], node_count_v[i] + lvl_nodes), LowerTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + int team_size = thandle_v[0]->get_team_size(); +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], true, node_count_v[i]); +#else + LowerTriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); +#endif + if (team_size == -1) + Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + + // 2. 
Wait for all streams finished + for (int i = 0; i < nstreams; i++) { + execspace_v[i].fence(); + } // end for streams + } // end for lvl +} // end lower_tri_solve_streams + +template +void upper_tri_solve_streams(const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v) { + using size_type = typename TriSolveHandle::size_type; + using NGBLType = typename TriSolveHandle::nnz_lno_view_t; + using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector hnodes_per_level_v(nstreams); + std::vector nodes_grouped_by_level_v(nstreams); + std::vector node_count_v(nstreams); + + // Retrieve data from handles and find max. number of levels among streams + size_type nlevels_max = 0; + for (int i = 0; i < nstreams; i++) { + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); + node_count_v[i] = 0; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } + + // Main loop must be performed sequential + for (size_type lvl = 0; lvl < nlevels_max; lvl++) { + // 1. 
Launch work on all streams + for (int i = 0; i < nstreams; i++) { + // Only if stream i-th still has this level + if (lvl < nlevels_v[i]) { + size_type lvl_nodes = hnodes_per_level_v[i](lvl); + if (lvl_nodes != 0) { + if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(node_count_v[i], node_count_v[i] + lvl_nodes), UpperTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + int team_size = thandle_v[0]->get_team_size(); +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], false, node_count_v[i]); +#else + UpperTriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); +#endif + if (team_size == -1) + Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, Kokkos::AUTO), tstf); + else + Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, team_size), tstf); + } + node_count_v[i] += lvl_nodes; + } // end if (lvl_nodes != 0) + } // end if (lvl < nlevels_v[i]) + } // end for streams + + // 2. 
Wait for all streams finished + for (int i = 0; i < nstreams; i++) { + execspace_v[i].fence(); + } // end for streams + } // end for lvl +} // end upper_tri_solve_streams + } // namespace Experimental } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index fce10e3acd..628d72add7 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -31,8 +31,8 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sptrsv_solve_eti_spec_avail { enum : bool { value = false }; }; @@ -45,6 +45,7 @@ struct sptrsv_solve_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct sptrsv_solve_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -83,29 +84,40 @@ namespace Impl { #endif // Unification layer -/// \brief Implementation of KokkosSparse::sptrsv_solve - -template ::value, - bool eti_spec_avail = - sptrsv_solve_eti_spec_avail::value> +/// \brief Implementations of KokkosSparse::sptrsv_solve and +/// \brief KokkosSparse::sptrsv_solve_streams + +template ::value, + bool eti_spec_avail = sptrsv_solve_eti_spec_avail< + ExecutionSpace, KernelHandle, RowMapType, EntriesType, + ValuesType, BType, XType>::value> struct SPTRSV_SOLVE { static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, const EntriesType entries, const ValuesType values, BType b, XType x); + + static void sptrsv_solve_streams( + const std::vector &execspace_v, + std::vector &handle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &b_v, + std::vector &x_v); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY 
-//! Full specialization of sptrsv_solve +//! Full specialization of sptrsv_solve and sptrsv_solve_streams // Unification layer -template -struct SPTRSV_SOLVE { +template +struct SPTRSV_SOLVE { static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, const EntriesType entries, const ValuesType values, BType b, XType x) { @@ -155,6 +167,42 @@ struct SPTRSV_SOLVE &execspace_v, + std::vector &handle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &b_v, + std::vector &x_v) { + // Call specific algorithm type + // NOTE: Only support SEQLVLSCHD_TP1 for now + // Assume streams have the same either lower or upper matrix type + std::vector sptrsv_handle_v(execspace_v.size()); + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); + } + Kokkos::Profiling::pushRegion(sptrsv_handle_v[0]->is_lower_tri() + ? "KokkosSparse_sptrsv[lower]" + : "KokkosSparse_sptrsv[upper]"); + if (sptrsv_handle_v[0]->is_lower_tri()) { + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { + Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); + } + } + Experimental::lower_tri_solve_streams(execspace_v, sptrsv_handle_v, row_map_v, entries_v, values_v, b_v, x_v); + } else { + for (int i = 0; i < static_cast(execspace_v.size()); i++) { + if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { + Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); + } + } + Experimental::upper_tri_solve_streams(execspace_v, sptrsv_handle_v, row_map_v, entries_v, values_v, b_v, x_v); + } + Kokkos::Profiling::popRegion(); + } }; #endif @@ -172,6 +220,7 @@ struct SPTRSV_SOLVE, \ @@ -200,6 +249,7 @@ struct SPTRSV_SOLVE, \ diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 04cb0f5285..08241cc653 100644 --- 
a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -312,6 +312,7 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, } else { KokkosSparse::Impl::SPTRSV_SOLVE< + typename scalar_nnz_view_t_::execution_space, const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, entries_i, values_i, b_i, @@ -369,6 +370,166 @@ void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, } #endif +template +void sptrsv_solve_streams(const std::vector& execspace_v, + const std::vector& handle_v, + const std::vector& rowmap_v, + const std::vector& entries_v, + const std::vector& values_v, + const std::vector& b_v, + std::vector& x_v) { + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using scalar_type = typename KernelHandle::nnz_scalar_t; + + static_assert(Kokkos::is_execution_space::value, "ExecutionSpace is not valid"); + static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in lno_row_view_t_"); + static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in lno_nnz_view_t_"); + static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in scalar_nnz_view_t_"); + static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in BType"); + static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in XType"); + + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_row_view_t_::non_const_value_type, size_type), "sptrsv_solve_streams: A size_type must match KernelHandle size_type (const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_nnz_view_t_::non_const_value_type, ordinal_type), 
"sptrsv_solve_streams: A entry type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename scalar_nnz_view_t_::value_type, scalar_type), "sptrsv_solve_streams: A scalar type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert(Kokkos::is_view::value, "sptrsv_solve_streams: b is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, "sptrsv_solve_streams: x is not a Kokkos::View."); + static_assert((int)BType::rank == (int)XType::rank, "sptrsv_solve_streams: The ranks of b and x do not match."); + static_assert(BType::rank == 1, "sptrsv_solve_streams: b and x must both either have rank 1."); + static_assert(std::is_same::value, "sptrsv_solve_streams: The output x must be nonconst."); + static_assert(std::is_same::value, "sptrsv_solve_streams: Views BType and XType have different device_types."); + static_assert(std::is_same::value, "sptrsv_solve_streams: KernelHandle's execution space is different from ExecutionSpace."); + static_assert(std::is_same::value, "sptrsv_solve_streams: KernelHandle and Views have different execution spaces."); + static_assert(std::is_same::value, "sptrsv_solve_streams: rowmap and entries have different device types."); + static_assert(std::is_same::value, "sptrsv_solve_streams: rowmap and values have different device types."); + + // Check sizes of vectors + if (execspace_v.size() != handle_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. handle_v.size() " << handle_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != rowmap_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
rowmap_v.size() " << rowmap_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != entries_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. entries_v.size() " << entries_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != values_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. values_v.size() " << values_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != b_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. b_v.size() " << b_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (execspace_v.size() != x_v.size()) { + std::ostringstream os; + os << "KokkosSparse::Experimental::sptrsv_solve_streams: vector sizes " + "must match -- execspace_v.size() " + << execspace_v.size() << " vs. 
x_v.size() " << x_v.size(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + using c_size_t = typename KernelHandle::const_size_type c_size_t; + using c_lno_t = typename KernelHandle::const_nnz_lno_t; + using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; + using c_exec_t = typename KernelHandle::HandleExecSpace; + using c_temp_t = typename KernelHandle::HandleTempMemorySpace; + using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; + + using const_handle_type = typename KokkosKernels::Experimental::KokkosKernelsHandle; + const_handle_type tmp_handle(*handle); + + using RowMap_Internal = Kokkos::View< + typename lno_row_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename lno_row_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Entries_Internal = Kokkos::View< + typename lno_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename lno_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Values_Internal = Kokkos::View< + typename scalar_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename scalar_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + using BType_Internal = Kokkos::View< + typename BType::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename BType::device_type, + Kokkos::MemoryTraits >; + + using XType_Internal = Kokkos::View< + typename XType::non_const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XType::device_type, Kokkos::MemoryTraits >; + + std::vector handle_i_v (execspace_v.size()); + std::vector rowmap_i_v (execspace_v.size()); + std::vector entries_i_v(execspace_v.size()); + std::vector values_i_v (execspace_v.size()); + std::vector b_i_v(execspace_v.size()); + std::vector x_i_v(execspace_v.size()); + + for (int i = 0; i < 
static_cast(execspace_v.size()); i++) { + handle_i_v[i] = const_handle_type(*(handle_v[i])); + rowmap_i_v[i] = rowmap_v[i]; + entries_i_v[i] = entries_v[i]; + values_i_v[i] = values_v[i]; + b_i_v[i] = b_v[i]; + x_i_v[i] = x_v[i]; + } + + //auto sptrsv_handle = handle->get_sptrsv_handle(); + //if (sptrsv_handle->get_algorithm() == + // KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + // typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; + // sptrsvHandleType *sh = handle->get_sptrsv_handle(); + // auto nrows = sh->get_nrows(); + // + // KokkosSparse::Impl::sptrsvcuSPARSE_solve( + // sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + // + //} else { + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, + rowmap_i_v, entries_i_v, values_i_v, + b_i_v, x_i_v); + //} + +} // sptrsv_solve_streams + } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp index c12e8bb335..e83611026e 100644 --- a/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp @@ -20,8 +20,8 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sptrsv_solve_tpl_spec_avail { enum : bool { value = false }; }; From 9f12713ad3674068340d6572d23c63565d5c55dc Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Wed, 12 Apr 2023 09:25:23 -0600 Subject: [PATCH 218/442] Add --enable-docs option to cm_generate_makefile (#1785) --- cm_generate_makefile.bash | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index d6c125899f..21d3176cec 100755 --- 
a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -362,13 +362,13 @@ display_help_text() { echo "--gcc-toolchain=/Path/To/GccRoot: Set the gcc toolchain to use with clang (e.g. /usr)" echo "--kokkos-make-j=[NUM]: Set -j parallel level for kokkos install" echo " Default: j == 4" - echo "--enable-tests: build Kokkos Kernels unit tests" - echo "--disable-tests: Do not build Kokkos Kernels unit tests" - echo "--disable-perftests: Do not build Kokkos Kernels performance tests" - echo "--enable-perftests: build Kokkos Kernels performance tests (default)" + echo "--enable-tests: build Kokkos Kernels unit tests" + echo "--disable-tests: Do not build Kokkos Kernels unit tests" + echo "--disable-perftests: Do not build Kokkos Kernels performance tests" + echo "--enable-perftests: build Kokkos Kernels performance tests (default)" echo "--deprecated-code Enable deprecated code (disabled by default)" - echo "--export-compile-commands: export cmake compile_commands.json file" - + echo "--export-compile-commands: export cmake compile_commands.json file" + echo "--enable-docs: build the Kokkos Kernels developer documentation (requires sphinx, doxygen)" } @@ -380,6 +380,7 @@ KOKKOSKERNELS_DO_TESTS=ON KOKKOSKERNELS_DO_PERFTESTS=ON KOKKOSKERNELS_DO_PERFSUITE=OFF KOKKOSKERNELS_DO_EXAMPLES=ON +KOKKOSKERNELS_DO_DOCS=OFF CMAKE_EXPORT_COMPILE_COMMANDS=OFF @@ -573,6 +574,9 @@ do --deprecated-code) KOKKOS_DEPRECATED_CODE=ON ;; + --enable-docs) + KOKKOSKERNELS_DO_DOCS=ON + ;; --compiler*) COMPILER="${key#*=}" CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l) @@ -820,6 +824,6 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} 
-DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS=\"${KOKKOS_CXXFLAGS}\" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS=\"${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED}\" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} 
-DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_PERFTESTS=${KOKKOSKERNELS_DO_PERFTESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=${CMAKE_EXPORT_COMPILE_COMMANDS} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} -DCMAKE_EXE_LINKER_FLAGS="${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_PARSED//\"}" ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_COMPONENTS_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KERNELS_DEFAULT_TEST_ETI_ONLY_OPTION} -DKokkosKernels_ENABLE_DOCS=${KOKKOSKERNELS_DO_DOCS} ${KOKKOSKERNELS_PATH} From bfc68039d5230fb4cc8897443db3486f63470d00 Mon Sep 17 00:00:00 2001 From: meriadeg perrinel Date: Thu, 5 Jan 2023 15:16:14 +0100 Subject: [PATCH 219/442] #5: Create blas2 gemv benchmark test 
--- perf_test/CMakeLists.txt | 1 + .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 122 ++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index d46b85b4d7..fc2ddc5d62 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -61,5 +61,6 @@ if(KokkosKernels_ENABLE_BENCHMARK) blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp + blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp ) endif() diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp new file mode 100644 index 0000000000..a0fbff639f --- /dev/null +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -0,0 +1,122 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosBlas2_gemv.hpp" +#include + +template +static void run(benchmark::State& state) { + const auto m = state.range(0); + const auto n = state.range(1); + const auto repeat = state.range(2); + // Declare type aliases + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + + std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" + << ExecSpace::name() << ")\n"; + + std::cout << "Each test input vector has a length of " << m << std::endl; + + std::cout << "Running GEMV experiment (" << ExecSpace::name() << ")\n"; + + // Create a View containing a 2D matrix; allocate KokkosView with template + // args of Scalar**, a layout, and + Kokkos::View A( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), m, n); + // Create Views containing 1D matrix; allocate (without) matrix "x" 
of size n + Kokkos::View x( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), n); + // Create Views containing 1D matrix; allocate (without) matrix "y" of size m + Kokkos::View y( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "y"), m); + + // Declaring variable pool w/ a number seed; + // a parallel random number generator, so you + // won't get the same number with a given seed each time + Kokkos::Random_XorShift64_Pool pool(123); + + // Fill 2D Matrix "A" and 1D matrix (i.e., a vector) "x" with random values; + // Here, 10 is the max value of the random generator between 1 and 10 + // (uniform ) + Kokkos::fill_random(A, pool, 10.0); + Kokkos::fill_random(x, pool, 10.0); + + for (auto _ : state) { + // Do a warm-up run + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + + // Start timing + Kokkos::fence(); + Kokkos::Timer timer; + for (int i = 0; i < repeat; i++) { + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + ExecSpace().fence(); + } + + // Kokkos Timer set up + double total = timer.seconds(); + double avg = total / repeat; + // Flops calculation + size_t flopsPerRun = (size_t)2 * m * n; + printf("Avg GEMV time: %f s.\n", avg); + printf("Avg GEMV FLOP/s: %.3e\n", flopsPerRun / avg); + state.SetIterationTime(timer.seconds()); + + state.counters["Avg GEMV time (s):"] = + benchmark::Counter(avg, benchmark::Counter::kDefaults); + state.counters["Avg GEMV FLOP/s:"] = + benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + } +} + +BENCHMARK(run) + ->Name("KokkosBlas2_gemv") + ->ArgNames({"m", "n", "repeat"}) + ->Args({5000, 5000, 1}) + ->UseManualTime(); From 3b8c2da3d0e174aadc38a63d6e1335d9d9841abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:14:35 +0100 Subject: [PATCH 220/442] Remove redundant output - remove redundant print statements - use meaningful benchmark name and pass configuration via arguments --- .../KokkosBlas_dot_mv_perf_test_benchmark.cpp | 2 -- 
.../KokkosBlas_dot_perf_test_benchmark.cpp | 2 -- ...okkosBlas_team_dot_perf_test_benchmark.cpp | 2 -- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 21 ++++++------------- 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index 1e537ceadc..c0a01eaff5 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -124,8 +124,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m * n; - printf("Avg DOT time: %f s.\n", avg); - printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index 14957994d1..fd4513d7d2 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -122,8 +122,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m; - printf("Avg DOT time: %f s.\n", avg); - printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp index 165f7fe6db..2764da9556 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -128,8 +128,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops 
calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m; - printf("Avg DOT time: %f s.\n", avg); - printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index a0fbff639f..69796d7132 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -48,23 +48,17 @@ #include "KokkosBlas2_gemv.hpp" #include -template -static void run(benchmark::State& state) { +template +static void KokkosBlas2_gemv(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); const auto repeat = state.range(2); // Declare type aliases + using ExecSpace = Kokkos::DefaultExecutionSpace; using Scalar = double; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" - << ExecSpace::name() << ")\n"; - - std::cout << "Each test input vector has a length of " << m << std::endl; - - std::cout << "Running GEMV experiment (" << ExecSpace::name() << ")\n"; - // Create a View containing a 2D matrix; allocate KokkosView with template // args of Scalar**, a layout, and Kokkos::View A( @@ -104,8 +98,6 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation size_t flopsPerRun = (size_t)2 * m * n; - printf("Avg GEMV time: %f s.\n", avg); - printf("Avg GEMV FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg GEMV time (s):"] = @@ -115,8 +107,7 @@ static void run(benchmark::State& state) { } } -BENCHMARK(run) - ->Name("KokkosBlas2_gemv") - ->ArgNames({"m", "n", "repeat"}) - ->Args({5000, 5000, 1}) +BENCHMARK(KokkosBlas2_gemv) + ->ArgNames({"m", "n", "repeat", 
Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1, 1}) ->UseManualTime(); From e87d532c2a48dd0f8315469567342316df076739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:29:24 +0100 Subject: [PATCH 221/442] Let benchmark decide the number of repetitions --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 69796d7132..596a774073 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -50,14 +50,14 @@ template static void KokkosBlas2_gemv(benchmark::State& state) { - const auto m = state.range(0); - const auto n = state.range(1); - const auto repeat = state.range(2); + const auto m = state.range(0); + const auto n = state.range(1); + // Declare type aliases using ExecSpace = Kokkos::DefaultExecutionSpace; - using Scalar = double; - using MemSpace = typename ExecSpace::memory_space; - using Device = Kokkos::Device; + using Scalar = double; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; // Create a View containing a 2D matrix; allocate KokkosView with template // args of Scalar**, a layout, and @@ -88,26 +88,23 @@ static void KokkosBlas2_gemv(benchmark::State& state) { // Start timing Kokkos::fence(); Kokkos::Timer timer; - for (int i = 0; i < repeat; i++) { - KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); - ExecSpace().fence(); - } + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + ExecSpace().fence(); // Kokkos Timer set up - double total = timer.seconds(); - double avg = total / repeat; + double time = timer.seconds(); // Flops calculation size_t flopsPerRun = (size_t)2 * m * n; state.SetIterationTime(timer.seconds()); state.counters["Avg GEMV time (s):"] = - 
benchmark::Counter(avg, benchmark::Counter::kDefaults); + benchmark::Counter(time, benchmark::Counter::kDefaults); state.counters["Avg GEMV FLOP/s:"] = - benchmark::Counter(flopsPerRun / avg, benchmark::Counter::kDefaults); + benchmark::Counter(flopsPerRun / time, benchmark::Counter::kDefaults); } } BENCHMARK(KokkosBlas2_gemv) - ->ArgNames({"m", "n", "repeat", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1, 1}) + ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1}) ->UseManualTime(); From 0678b55b19ca3a5d022c712e962edb7368132b7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:46:05 +0100 Subject: [PATCH 222/442] Include scalar type in the output --- .../blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 596a774073..adf1e51c59 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -48,14 +48,13 @@ #include "KokkosBlas2_gemv.hpp" #include -template +template static void KokkosBlas2_gemv(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); // Declare type aliases using ExecSpace = Kokkos::DefaultExecutionSpace; - using Scalar = double; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; @@ -104,7 +103,7 @@ static void KokkosBlas2_gemv(benchmark::State& state) { } } -BENCHMARK(KokkosBlas2_gemv) +BENCHMARK(KokkosBlas2_gemv) ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) ->Args({5000, 5000, 1}) ->UseManualTime(); From 6d027010ab47f2dc41d79122d56700658b99543b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 13:50:51 +0100 Subject: [PATCH 223/442] Let benchmark 
calculate FLOP/s --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index adf1e51c59..3cb85dcadf 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -98,8 +98,8 @@ static void KokkosBlas2_gemv(benchmark::State& state) { state.counters["Avg GEMV time (s):"] = benchmark::Counter(time, benchmark::Counter::kDefaults); - state.counters["Avg GEMV FLOP/s:"] = - benchmark::Counter(flopsPerRun / time, benchmark::Counter::kDefaults); + state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } } From 7336d9c2fd4647c63daf90cc4c4cb4bab5e3c012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Mon, 20 Mar 2023 14:07:11 +0100 Subject: [PATCH 224/442] Add a benchmark for LayoutRight --- .../blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 3cb85dcadf..550115ce94 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -107,3 +107,8 @@ BENCHMARK(KokkosBlas2_gemv) ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) ->Args({5000, 5000, 1}) ->UseManualTime(); + +BENCHMARK(KokkosBlas2_gemv) + ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1}) + ->UseManualTime(); From b3da125585d21b8b9bfeb053814b699a1f7b2343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 19:31:11 +0100 Subject: [PATCH 225/442] Use correct header --- 
.../KokkosBlas2_gemv_perf_test_benchmark.cpp | 38 +++---------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 550115ce94..fa583e624d 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -1,46 +1,18 @@ -/* //@HEADER // ************************************************************************ // -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering // Solutions of Sandia, LLC (NTESS). // // Under the terms of Contract DE-NA0003525 with NTESS, // the U.S. Government retains certain rights in this software. // -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ //@HEADER -*/ #include #include From 278d18fac954fefc1dce51300c04034cfa896087 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 20:15:18 +0100 Subject: [PATCH 226/442] Use stored time value --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index fa583e624d..6e62fe09fc 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -66,7 +66,7 @@ static void KokkosBlas2_gemv(benchmark::State& state) { double time = timer.seconds(); // Flops calculation size_t flopsPerRun = (size_t)2 * m * n; - state.SetIterationTime(timer.seconds()); + state.SetIterationTime(time); state.counters["Avg GEMV time (s):"] = benchmark::Counter(time, benchmark::Counter::kDefaults); From 6c21c4df2585b0f6f3ac29b5acd466ee27ec0e22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 28 Mar 2023 17:26:36 +0200 Subject: [PATCH 227/442] Revert changes to blas1 benchmark --- perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp | 2 ++ 
perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp | 2 ++ .../blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp | 2 ++ 3 files changed, 6 insertions(+) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index c0a01eaff5..1e537ceadc 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -124,6 +124,8 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m * n; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index fd4513d7d2..14957994d1 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -122,6 +122,8 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp index 2764da9556..165f7fe6db 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -128,6 +128,8 @@ static void run(benchmark::State& state) { double avg = total / repeat; // Flops calculation for a 1D matrix dot product per test run; 
size_t flopsPerRun = (size_t)2 * m; + printf("Avg DOT time: %f s.\n", avg); + printf("Avg DOT FLOP/s: %.3e\n", flopsPerRun / avg); state.SetIterationTime(timer.seconds()); state.counters["Avg DOT time (s):"] = From 24923b79e40d2069f08890c1e645eb9ad03630d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 17:54:44 +0200 Subject: [PATCH 228/442] Use separate executable --- cmake/kokkoskernels_benchmarks.cmake | 4 +--- perf_test/CMakeLists.txt | 2 +- perf_test/blas/blas2/CMakeLists.txt | 7 +++++++ .../blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 14 ++++++++++++++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/cmake/kokkoskernels_benchmarks.cmake b/cmake/kokkoskernels_benchmarks.cmake index 7bb262247d..3a38feee88 100644 --- a/cmake/kokkoskernels_benchmarks.cmake +++ b/cmake/kokkoskernels_benchmarks.cmake @@ -30,8 +30,6 @@ ELSE() TARGET_COMPILE_OPTIONS(benchmark_main PRIVATE -w) ENDIF() -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) CMAKE_PARSE_ARGUMENTS( BENCHMARK @@ -53,7 +51,7 @@ FUNCTION(KOKKOSKERNELS_ADD_BENCHMARK NAME) ADD_EXECUTABLE( ${BENCHMARK_NAME} - ${CMAKE_SOURCE_DIR}/perf_test/BenchmarkMain.cpp ${BENCHMARK_SOURCES} + ${BENCHMARK_SOURCES} ) TARGET_LINK_LIBRARIES( ${BENCHMARK_NAME} diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index fc2ddc5d62..cf1905d6d4 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -61,6 +61,6 @@ if(KokkosKernels_ENABLE_BENCHMARK) blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp - blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp + BenchmarkMain.cpp ) endif() diff --git a/perf_test/blas/blas2/CMakeLists.txt b/perf_test/blas/blas2/CMakeLists.txt index f69c576cd3..9c2aa424d1 100644 --- a/perf_test/blas/blas2/CMakeLists.txt +++ b/perf_test/blas/blas2/CMakeLists.txt @@ 
-5,3 +5,10 @@ KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas2_gemv_perf_test SOURCES KokkosBlas2_gemv_perf_test.cpp ) + +IF(KokkosKernels_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + Blas2_Benchmark + SOURCES KokkosBlas2_gemv_perf_test_benchmark.cpp + ) +ENDIF() diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 6e62fe09fc..d116e3fdd2 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -18,6 +18,7 @@ #include #include "KokkosBlas2_gemv.hpp" +#include #include template @@ -84,3 +85,16 @@ BENCHMARK(KokkosBlas2_gemv) ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) ->Args({5000, 5000, 1}) ->UseManualTime(); + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + return 0; +} From 10dc298b515404dda135d05bdf044e434b66a1a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:00:44 +0200 Subject: [PATCH 229/442] Move warm-up out of benchmarking loop --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index d116e3fdd2..652d7ae806 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -53,24 +53,24 @@ static void KokkosBlas2_gemv(benchmark::State& state) { Kokkos::fill_random(A, pool, 10.0); Kokkos::fill_random(x, pool, 10.0); - for (auto _ : state) { - // Do a warm-up run - KokkosBlas::gemv("N", 
1.0, A, x, 0.0, y); + // Do a warm-up run + KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); + Kokkos::fence(); + double total_time = 0.0; + for (auto _ : state) { // Start timing - Kokkos::fence(); Kokkos::Timer timer; KokkosBlas::gemv("N", 1.0, A, x, 0.0, y); ExecSpace().fence(); - // Kokkos Timer set up double time = timer.seconds(); - // Flops calculation + total_time += time; size_t flopsPerRun = (size_t)2 * m * n; - state.SetIterationTime(time); + state.SetIterationTime(time); state.counters["Avg GEMV time (s):"] = - benchmark::Counter(time, benchmark::Counter::kDefaults); + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } From 1d70e7aebe5e1c13aa80a514c35ab91eaff63b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:20:52 +0200 Subject: [PATCH 230/442] Parse common parameters --- .../blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 652d7ae806..0a5c18a7f1 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -18,6 +18,10 @@ #include #include "KokkosBlas2_gemv.hpp" + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + #include #include @@ -92,6 +96,15 @@ int main(int argc, char** argv) { benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + + benchmark::RegisterBenchmark("KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) + ->Args({5000, 5000, 1}) + 
->UseManualTime(); + benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); From 03728a8b8ddd2f301c0a4b1a839dbc6d4d973903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:31:17 +0200 Subject: [PATCH 231/442] Use CMake helper for ODE_RK benchmark --- perf_test/ode/CMakeLists.txt | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/perf_test/ode/CMakeLists.txt b/perf_test/ode/CMakeLists.txt index 67d0c421fb..b4aa86889f 100644 --- a/perf_test/ode/CMakeLists.txt +++ b/perf_test/ode/CMakeLists.txt @@ -2,22 +2,7 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) if(KOKKOSKERNELS_ENABLE_BENCHMARK) - SET(BENCHMARK_NAME ${PACKAGE_NAME}_ode_runge_kutta) - - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - KokkosODE_RK.cpp - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkoskernels - ) - TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include - ) - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} + KOKKOSKERNELS_ADD_BENCHMARK( + ode_runge_kutta SOURCES KokkosODE_RK.cpp ) endif() From f38b56ab13db8c6da8d757c11640b9ca5c87efde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 18:35:17 +0200 Subject: [PATCH 232/442] Let benchmark decide number of iterations Let benchmark decide how many iterations will be run when --repeat is not provided. 
--- perf_test/ode/KokkosODE_RK.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp index 4f6e53e143..e9dc3f2f8e 100644 --- a/perf_test/ode/KokkosODE_RK.cpp +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -353,8 +353,7 @@ int main(int argc, char** argv) { run_benchmark_wrapper, argc, argv) ->UseRealTime() ->ArgNames({"n", "model"}) - ->Args({1000, 1}) - ->Iterations(common_params.repeat); + ->Args({1000, 1}); } benchmark::RunSpecifiedBenchmarks(); From 34a228689178971f210e0a585981b82cdfb30df0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 19:35:26 +0200 Subject: [PATCH 233/442] Parse blas2 custom command line parameters --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 112 ++++++++++++++---- 1 file changed, 87 insertions(+), 25 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 0a5c18a7f1..14e67a803d 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -25,13 +25,47 @@ #include #include -template +struct blas2_gemv_params : public perf_test::CommonInputParams { + int m = 5000; + int n = 5000; + // bool layoutLeft = true; +}; + +void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; +} + +blas2_gemv_params parse_blas2_gemv_options(int& argc, char** argv) { + blas2_gemv_params params; + perf_test::parse_common_options(argc, argv, params); + + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else { + 
std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return params; + } + } + return params; +} + +template static void KokkosBlas2_gemv(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); // Declare type aliases - using ExecSpace = Kokkos::DefaultExecutionSpace; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; @@ -70,25 +104,16 @@ static void KokkosBlas2_gemv(benchmark::State& state) { double time = timer.seconds(); total_time += time; - size_t flopsPerRun = (size_t)2 * m * n; - state.SetIterationTime(time); - state.counters["Avg GEMV time (s):"] = - benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); - state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( - flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } -} -BENCHMARK(KokkosBlas2_gemv) - ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1}) - ->UseManualTime(); - -BENCHMARK(KokkosBlas2_gemv) - ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1}) - ->UseManualTime(); + state.counters[ExecSpace::name()] = 1; + state.counters["Avg GEMV time (s):"] = + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); + size_t flopsPerRun = (size_t)2 * m * n; + state.counters["Avg GEMV FLOP/s:"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); +} int main(int argc, char** argv) { Kokkos::initialize(argc, argv); @@ -96,14 +121,51 @@ int main(int argc, char** argv) { benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); - perf_test::CommonInputParams common_params; - perf_test::parse_common_options(argc, argv, common_params); + const auto params = parse_blas2_gemv_options(argc, argv); + const auto arg_names = std::vector{"m", "n"}; + const auto args = std::vector{params.m, params.n}; + + if 
(params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + benchmark::RegisterBenchmark( + "KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + benchmark::RegisterBenchmark( + "KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } - benchmark::RegisterBenchmark("KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames({"m", "n", Kokkos::DefaultExecutionSpace::name()}) - ->Args({5000, 5000, 1}) - ->UseManualTime(); + if (true) { // serial +#if defined(KOKKOS_ENABLE_SERIAL) + benchmark::RegisterBenchmark( + "KokkosBlas2_gemv", + KokkosBlas2_gemv) + ->ArgNames({"m", "n"}) + ->Args({params.m, params.n}) + ->UseManualTime(); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } benchmark::RunSpecifiedBenchmarks(); From 3324854864331e09d554026f43f3ec39e33a5a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Fri, 7 Apr 2023 20:45:48 +0200 Subject: [PATCH 234/442] Add registration wrapper --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 45 +++++++++++-------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 14e67a803d..93349fa061 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -115,24 +115,39 @@ static void KokkosBlas2_gemv(benchmark::State& state) { flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } +void register_benchmark(const char* name, void (*func)(benchmark::State&), + std::vector arg_names, + std::vector 
args, int repeat) { + if (repeat > 0) { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime() + ->Iterations(repeat); + } else { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); + } +} + int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); + const auto name = "KokkosBlas2_gemv"; const auto params = parse_blas2_gemv_options(argc, argv); const auto arg_names = std::vector{"m", "n"}; const auto args = std::vector{params.m, params.n}; if (params.use_openmp) { #if defined(KOKKOS_ENABLE_OPENMP) - benchmark::RegisterBenchmark( - "KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime(); + register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; @@ -141,12 +156,9 @@ int main(int argc, char** argv) { if (params.use_cuda) { #if defined(KOKKOS_ENABLE_CUDA) - benchmark::RegisterBenchmark( - "KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime(); + register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; @@ -155,12 +167,9 @@ int main(int argc, char** argv) { if (true) { // serial #if defined(KOKKOS_ENABLE_SERIAL) - benchmark::RegisterBenchmark( - "KokkosBlas2_gemv", - KokkosBlas2_gemv) - ->ArgNames({"m", "n"}) - ->Args({params.m, params.n}) - ->UseManualTime(); + register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: Serial device requested, but not available.\n"; return 1; From 35ee9ee7ed8fda903db10c5479483cff93362b8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: 
Tue, 11 Apr 2023 12:30:35 +0200 Subject: [PATCH 235/442] Fix formatting --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 93349fa061..44fcd20e16 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -66,8 +66,8 @@ static void KokkosBlas2_gemv(benchmark::State& state) { const auto n = state.range(1); // Declare type aliases - using MemSpace = typename ExecSpace::memory_space; - using Device = Kokkos::Device; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; // Create a View containing a 2D matrix; allocate KokkosView with template // args of Scalar**, a layout, and From 5d237f8b6e0c8e9c0f5fc62fefafceb36f091587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 11 Apr 2023 16:30:42 +0200 Subject: [PATCH 236/442] Support all command line parameters --- perf_test/Benchmark_Context.hpp | 18 ++ .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 191 ++++++++++++------ 2 files changed, 150 insertions(+), 59 deletions(-) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 16a7d4c4e8..e81b158d93 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -95,6 +95,24 @@ inline void add_benchmark_context(bool verbose = false) { add_version_info(); } +inline void register_benchmark(const char* name, + void (*func)(benchmark::State&), + std::vector arg_names, + std::vector args, int repeat) { + if (repeat > 0) { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime() + ->Iterations(repeat); + } else { + benchmark::RegisterBenchmark(name, func) + ->ArgNames(arg_names) + ->Args(args) + ->UseManualTime(); + } +} + } 
// namespace KokkosKernelsBenchmark #endif diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index 44fcd20e16..f37ddf3dc1 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -26,39 +26,55 @@ #include struct blas2_gemv_params : public perf_test::CommonInputParams { - int m = 5000; - int n = 5000; - // bool layoutLeft = true; -}; - -void print_options() { - std::cerr << "Options\n" << std::endl; - std::cerr << perf_test::list_common_options(); + int m = 5000; + int n = 5000; + bool layoutLeft = true; - std::cerr << "\t[Optional] --m :: number of rows to generate" - << std::endl; - std::cerr << "\t[Optional] --n :: number of cols to generate" - << std::endl; -} + static blas2_gemv_params get_params(int& argc, char** argv) { + blas2_gemv_params params; + perf_test::parse_common_options(argc, argv, params); -blas2_gemv_params parse_blas2_gemv_options(int& argc, char** argv) { - blas2_gemv_params params; - perf_test::parse_common_options(argc, argv, params); - - for (int i = 1; i < argc; ++i) { - if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { - ++i; - } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { - ++i; - } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; - print_options(); - return params; + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (std::string layout; + perf_test::check_arg_str(i, argc, argv, "--layout", layout)) { + if (0 == Test::string_compare_no_case(layout, "left")) + params.layoutLeft = true; + else if (0 == Test::string_compare_no_case(layout, "right")) + params.layoutLeft = false; + else { + std::cerr << "Invalid layout: must be 'left' 
or 'right'.\n"; + exit(1); + } + ++i; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + exit(1); + } } + return params; } - return params; -} + + static void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --m :: number of rows to generate (default 5000)" + << std::endl; + std::cerr + << "\t[Optional] --n :: number of cols to generate (default 5000)" + << std::endl; + std::cerr << "\t[Optional] --layout :: matrix layout ('left' or 'right', " + "default 'left')" + << std::endl; + } +}; template static void KokkosBlas2_gemv(benchmark::State& state) { @@ -115,23 +131,6 @@ static void KokkosBlas2_gemv(benchmark::State& state) { flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } -void register_benchmark(const char* name, void (*func)(benchmark::State&), - std::vector arg_names, - std::vector args, int repeat) { - if (repeat > 0) { - benchmark::RegisterBenchmark(name, func) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime() - ->Iterations(repeat); - } else { - benchmark::RegisterBenchmark(name, func) - ->ArgNames(arg_names) - ->Args(args) - ->UseManualTime(); - } -} - int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); @@ -139,15 +138,37 @@ int main(int argc, char** argv) { KokkosKernelsBenchmark::add_benchmark_context(true); const auto name = "KokkosBlas2_gemv"; - const auto params = parse_blas2_gemv_options(argc, argv); - const auto arg_names = std::vector{"m", "n"}; - const auto args = std::vector{params.m, params.n}; + const auto params = blas2_gemv_params::get_params(argc, argv); + const auto arg_names = std::vector{ + "m", "n", params.layoutLeft ? 
"LayoutLeft" : "LayoutRight"}; + const auto args = std::vector{params.m, params.n, 1}; + + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); +#else + std::cout << "ERROR: PThreads requested, but not available.\n"; + return 1; +#endif + } if (params.use_openmp) { #if defined(KOKKOS_ENABLE_OPENMP) - register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; @@ -156,20 +177,72 @@ int main(int argc, char** argv) { if (params.use_cuda) { #if defined(KOKKOS_ENABLE_CUDA) - register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; #endif } - if (true) { // serial + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + if (params.layoutLeft) + 
KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, + KokkosBlas2_gemv, + arg_names, args, params.repeat); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + + // use serial if no backend is specified + if (!params.use_cuda and !params.use_hip and !params.use_openmp and + !params.use_sycl and !params.use_threads) { #if defined(KOKKOS_ENABLE_SERIAL) - register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + if (params.layoutLeft) + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); + else + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_gemv, + arg_names, args, params.repeat); #else std::cout << "ERROR: Serial device requested, but not available.\n"; return 1; From 15d61698300b4fcd400846b7ff04a4e12f018d55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 11 Apr 2023 18:32:33 +0200 Subject: [PATCH 237/442] Reduce duplication --- .../KokkosBlas2_gemv_perf_test_benchmark.cpp | 90 ++++++------------- 1 file changed, 28 insertions(+), 62 deletions(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index f37ddf3dc1..962328eb95 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -77,7 +77,7 @@ struct blas2_gemv_params : public perf_test::CommonInputParams { }; template -static void KokkosBlas2_gemv(benchmark::State& state) { +static void KokkosBlas2_GEMV(benchmark::State& state) { const auto m = state.range(0); const auto n = state.range(1); @@ -131,28 +131,37 @@ static void KokkosBlas2_gemv(benchmark::State& state) { flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); } +template +void run(const 
blas2_gemv_params& params) { + using Scalar = double; + + const auto name = "KokkosBlas2_GEMV"; + const auto arg_names = std::vector{ + "m", "n", params.layoutLeft ? "LayoutLeft" : "LayoutRight"}; + const auto args = std::vector{params.m, params.n, 1}; + + if (params.layoutLeft) { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GEMV, + arg_names, args, params.repeat); + } else { + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas2_GEMV, + arg_names, args, params.repeat); + } +} + int main(int argc, char** argv) { Kokkos::initialize(argc, argv); benchmark::Initialize(&argc, argv); benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); - const auto name = "KokkosBlas2_gemv"; - const auto params = blas2_gemv_params::get_params(argc, argv); - const auto arg_names = std::vector{ - "m", "n", params.layoutLeft ? "LayoutLeft" : "LayoutRight"}; - const auto args = std::vector{params.m, params.n, 1}; + const auto params = blas2_gemv_params::get_params(argc, argv); if (params.use_threads) { #if defined(KOKKOS_ENABLE_THREADS) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: PThreads requested, but not available.\n"; return 1; @@ -161,14 +170,7 @@ int main(int argc, char** argv) { if (params.use_openmp) { #if defined(KOKKOS_ENABLE_OPENMP) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; @@ -177,14 +179,7 @@ int main(int argc, char** argv) { if (params.use_cuda) { #if 
defined(KOKKOS_ENABLE_CUDA) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; @@ -193,18 +188,7 @@ int main(int argc, char** argv) { if (params.use_hip) { #if defined(KOKKOS_ENABLE_HIP) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; @@ -213,18 +197,7 @@ int main(int argc, char** argv) { if (params.use_sycl) { #if defined(KOKKOS_ENABLE_SYCL) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, - KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: SYCL requested, but not available.\n"; return 1; @@ -235,14 +208,7 @@ int main(int argc, char** argv) { if (!params.use_cuda and !params.use_hip and !params.use_openmp and !params.use_sycl and !params.use_threads) { #if defined(KOKKOS_ENABLE_SERIAL) - if (params.layoutLeft) - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); - else - KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas2_gemv, - arg_names, args, params.repeat); + run(params); #else std::cout << "ERROR: Serial device requested, but not available.\n"; return 1; From 20ad98ac6f90efc075a60f3c4aa5eeca3af751d9 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 12 Apr 2023 12:44:24 -0700 Subject: [PATCH 238/442] Add execution space to policies --- 
.../impl/KokkosSparse_sptrsv_solve_impl.hpp | 26 ++++++++++--------- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 3ca003f713..583b3e8ab9 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -4033,6 +4033,7 @@ void lower_tri_solve_streams(const std::vector &execspace_v, const std::vector &values_v, const std::vector &rhs_v, std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using size_type = typename TriSolveHandle::size_type; using NGBLType = typename TriSolveHandle::nnz_lno_view_t; using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; @@ -4063,20 +4064,20 @@ void lower_tri_solve_streams(const std::vector &execspace_v, if (lvl < nlevels_v[i]) { size_type lvl_nodes = hnodes_per_level_v[i](lvl); if (lvl_nodes != 0) { - if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(node_count_v[i], node_count_v[i] + lvl_nodes), LowerTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { + if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), LowerTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; - int team_size = 
thandle_v[0]->get_team_size(); + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], true, node_count_v[i]); #else LowerTriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); else - Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, team_size), tstf); + Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, team_size), tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) @@ -4099,6 +4100,7 @@ void upper_tri_solve_streams(const std::vector &execspace_v, const std::vector &values_v, const std::vector &rhs_v, std::vector &lhs_v) { + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using size_type = typename TriSolveHandle::size_type; using NGBLType = typename TriSolveHandle::nnz_lno_view_t; using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; @@ -4129,20 +4131,20 @@ void upper_tri_solve_streams(const std::vector &execspace_v, if (lvl < nlevels_v[i]) { size_type lvl_nodes = hnodes_per_level_v[i](lvl); if (lvl_nodes != 0) { - if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(node_count_v[i], node_count_v[i] + lvl_nodes), UpperTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { + if (thandle_v[i]->get_algorithm() == 
KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), UpperTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[0]->get_team_size(); + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], false, node_count_v[i]); #else UpperTriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); else - Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nodes, team_size), tstf); + Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, team_size), tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 628d72add7..0004c565ce 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -177,7 +177,7 @@ struct SPTRSV_SOLVE &b_v, std::vector &x_v) { // Call specific algorithm type - // NOTE: Only support SEQLVLSCHD_TP1 for now + // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment // Assume streams have the same either lower or upper matrix type std::vector sptrsv_handle_v(execspace_v.size()); for (int i = 0; i < static_cast(execspace_v.size()); i++) 
{ From 97187c3af919b321f7bfffd190f8e54773618460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 12 Apr 2023 22:50:56 +0200 Subject: [PATCH 239/442] Allow passing additional arguments --- perf_test/Benchmark_Context.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index e81b158d93..3cfefbc057 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -95,18 +95,21 @@ inline void add_benchmark_context(bool verbose = false) { add_version_info(); } -inline void register_benchmark(const char* name, - void (*func)(benchmark::State&), +template +inline void register_benchmark(const char* name, FuncType func, std::vector arg_names, - std::vector args, int repeat) { + std::vector args, int repeat, + ArgsToCallOp&&... func_args) { if (repeat > 0) { - benchmark::RegisterBenchmark(name, func) + benchmark::RegisterBenchmark(name, func, + std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime() ->Iterations(repeat); } else { - benchmark::RegisterBenchmark(name, func) + benchmark::RegisterBenchmark(name, func, + std::forward(func_args)...) 
->ArgNames(arg_names) ->Args(args) ->UseManualTime(); From b1185f3a91446e99f5c7b99833ad5f0cdae5fae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 12 Apr 2023 23:29:50 +0200 Subject: [PATCH 240/442] Include OpenMP environment variables in benchmark context --- perf_test/Benchmark_Context.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 16a7d4c4e8..1dfb7803e0 100644 --- a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -21,6 +21,7 @@ #include "KokkosKernels_PrintConfiguration.hpp" +#include #include #include @@ -89,10 +90,30 @@ inline void add_version_info() { } } +inline void add_env_info() { + auto num_threads = std::getenv("OMP_NUM_THREADS"); + if (num_threads) { + benchmark::AddCustomContext("OMP_NUM_THREADS", num_threads); + } + auto dynamic = std::getenv("OMP_DYNAMIC"); + if (dynamic) { + benchmark::AddCustomContext("OMP_DYNAMIC", dynamic); + } + auto proc_bind = std::getenv("OMP_PROC_BIND"); + if (proc_bind) { + benchmark::AddCustomContext("OMP_PROC_BIND", proc_bind); + } + auto places = std::getenv("OMP_PLACES"); + if (places) { + benchmark::AddCustomContext("OMP_PLACES", places); + } +} + /// \brief Gather all context information and add it to benchmark context inline void add_benchmark_context(bool verbose = false) { add_kokkos_configuration(verbose); add_version_info(); + add_env_info(); } } // namespace KokkosKernelsBenchmark From 0fd4f287823c5f9cb506b983e18147e691bfa10e Mon Sep 17 00:00:00 2001 From: kliegeois Date: Wed, 12 Apr 2023 16:56:49 -0600 Subject: [PATCH 241/442] Fix unused variable warnings --- sparse/src/KokkosSparse_spgemm_handle.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 7cdba6030c..1106d300c8 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ 
b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -810,6 +810,11 @@ class SPGEMMHandle { KokkosKernels::Impl::hashView(b_entriesIn))) return false; } +#else + (void)a_rowptrsIn; + (void)a_entriesIn; + (void)b_rowptrsIn; + (void)b_entriesIn; #endif return true; } @@ -827,6 +832,11 @@ class SPGEMMHandle { if (b_graph_hash != (KokkosKernels::Impl::hashView(b_rowptrsIn) ^ KokkosKernels::Impl::hashView(b_entriesIn))) return false; +#else + (void)a_rowptrsIn; + (void)a_entriesIn; + (void)b_rowptrsIn; + (void)b_entriesIn; #endif return true; } From 7c798ae97a47f33b5370ce523467818597361534 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 12 Apr 2023 16:38:53 -0700 Subject: [PATCH 242/442] cuSPARSE trisolve with streams --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 165 ++++++++++++++++++ .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 4 +- sparse/src/KokkosSparse_sptrsv.hpp | 29 ++- 3 files changed, 178 insertions(+), 20 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 17611c3f2c..b3d8753526 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -436,6 +436,171 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, #endif } +// -------------------------------- +// Stream interface +// -------------------------------- + +template +void sptrsvcuSPARSE_solve_streams( + const std::vector &execspace_v, + const std::vector &handle_v, + const std::vector &row_map_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &rhs_v, + std::vector &lhs_v, bool /*trans*/ +) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + using idx_type = typename KernelHandle::nnz_lno_t; + using size_type = typename KernelHandle::size_type; + using scalar_type = typename KernelHandle::nnz_scalar_t; + using memory_space = typename KernelHandle::HandlePersistentMemorySpace; + using sptrsvHandleType = typename 
KernelHandle::SPTRSVHandleType; + usinf sptrsvCuSparseHandleType = typename sptrsvHandleType::SPTRSVcuSparseHandleType; + + int nstreams = execspace_v.size(); +#if (CUDA_VERSION >= 11030) + (void)row_map_v; + (void)entries_v; + (void)values_v; + + const bool is_cuda_space = + std::is_same::value || + std::is_same::value || + std::is_same::value; + + const bool is_idx_type_supported = std::is_same::value || + std::is_same::value; + + if (!is_cuda_space) { + throw std::runtime_error( + "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE\n"); + } else if (!is_idx_type_supported) { + throw std::runtime_error( + "CUSPARSE requires local ordinals to be integer (32 bits or 64 bits).\n"); + } else { + const scalar_type alpha = scalar_type(1.0); + + cudaDataType cudaValueType = cuda_data_type_from(); + + std::vector h_v(nstreams); + + for (int i = 0; i < nstreams; i++) { + sptrsvHandleType *sptrsv_handle = handle_v[i].get_sptrsv_handle(); + h_v[i] = sptrsv_handle->get_cuSparseHandle(); + + // Bind cuspare handle to a stream + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); + + int64_t nrows = static_cast(sptrsv_handle->get_nrows()); + + // Create dense vector B (RHS) + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&(h_v[i]->vecBDescr), nrows, (void*)rhs_v[i].data(), cudaValueType)); + + // Create dense vector X (LHS) + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&(h_v[i]->vecXDescr), nrows, (void*)lhs_v[i].data(), cudaValueType)); + } + + // Solve + for (int i = 0; i < nstreams; i++) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve(h_v[i]->handle, h_v[i]->transpose, &alpha, h_v[i]->matDescr, h_v[i]->vecBDescr, h_v[i]->vecXDescr, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT, h_v[i]->spsvDescr)); + } + + // Destroy dense vector descriptors + for (int i = 0; i < nstreams; i++) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h_v[i]->vecBDescr)); + 
KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(h_v[i]->vecXDescr)); + } + } +#else // CUDA_VERSION < 11030 +// if (std::is_same::value) { +// cusparseStatus_t status; +// +// typename KernelHandle::SPTRSVcuSparseHandleType* h = +// sptrsv_handle->get_cuSparseHandle(); +// +// int nnz = entries.extent_int(0); +// +// const int* rm = !std::is_same::value +// ? sptrsv_handle->get_int_rowmap_ptr() +// : (const int*)row_map.data(); +// const int* ent = (const int*)entries.data(); +// const scalar_type* vals = values.data(); +// const scalar_type* bv = rhs.data(); +// scalar_type* xv = lhs.data(); +// +// if (std::is_same::value) { +// if (h->pBuffer == nullptr) { +// std::cout << " pBuffer invalid" << std::endl; +// } +// const double alpha = double(1); +// +// status = cusparseDcsrsv2_solve(h->handle, h->transpose, nrows, nnz, +// &alpha, h->descr, (double*)vals, (int*)rm, +// (int*)ent, h->info, (double*)bv, +// (double*)xv, h->policy, h->pBuffer); +// +// if (CUSPARSE_STATUS_SUCCESS != status) +// std::cout << "solve status error name " << (status) << std::endl; +// } else if (std::is_same::value) { +// if (h->pBuffer == nullptr) { +// std::cout << " pBuffer invalid" << std::endl; +// } +// const float alpha = float(1); +// +// status = cusparseScsrsv2_solve(h->handle, h->transpose, nrows, nnz, +// &alpha, h->descr, (float*)vals, (int*)rm, +// (int*)ent, h->info, (float*)bv, (float*)xv, +// h->policy, h->pBuffer); +// +// if (CUSPARSE_STATUS_SUCCESS != status) +// std::cout << "solve status error name " << (status) << std::endl; +// } else if (std::is_same >::value) { +// cuDoubleComplex cualpha; +// cualpha.x = 1.0; +// cualpha.y = 0.0; +// status = cusparseZcsrsv2_solve( +// h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, +// (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, +// (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h->policy, h->pBuffer); +// +// if (CUSPARSE_STATUS_SUCCESS != status) +// std::cout << "solve status error name " << (status) << 
std::endl; +// } else if (std::is_same >::value) { +// cuComplex cualpha; +// cualpha.x = 1.0; +// cualpha.y = 0.0; +// status = cusparseCcsrsv2_solve( +// h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, +// (cuComplex*)vals, (int*)rm, (int*)ent, h->info, (cuComplex*)bv, +// (cuComplex*)xv, h->policy, h->pBuffer); +// +// if (CUSPARSE_STATUS_SUCCESS != status) +// std::cout << "solve status error name " << (status) << std::endl; +// } else { +// throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); +// } +// +// } else { +// throw std::runtime_error( +// "CUSPARSE requires local ordinals to be integer.\n"); +// } +#endif +#else + (void)execspace_v; + (void)handle_v; + (void)row_map_v; + (void)entries_v; + (void)values_v; + (void)rhs_v; + (void)lhs_v; + throw std::runtime_error("CUSPARSE IS NOT DEFINED\n"); +#endif +} + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 0004c565ce..52e3442896 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -102,7 +102,7 @@ struct SPTRSV_SOLVE { static void sptrsv_solve_streams( const std::vector &execspace_v, - std::vector &handle_v, + const std::vector &handle_v, const std::vector &row_map_v, const std::vector &entries_v, const std::vector &values_v, @@ -170,7 +170,7 @@ struct SPTRSV_SOLVE &execspace_v, - std::vector &handle_v, + const std::vector &handle_v, const std::vector &row_map_v, const std::vector &entries_v, const std::vector &values_v, diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 08241cc653..94636a9fd7 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -461,7 +461,6 @@ void sptrsv_solve_streams(const std::vector& execspace_v, using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; using const_handle_type = typename 
KokkosKernels::Experimental::KokkosKernelsHandle; - const_handle_type tmp_handle(*handle); using RowMap_Internal = Kokkos::View< typename lno_row_view_t_::const_value_type *, @@ -508,25 +507,19 @@ void sptrsv_solve_streams(const std::vector& execspace_v, x_i_v[i] = x_v[i]; } - //auto sptrsv_handle = handle->get_sptrsv_handle(); - //if (sptrsv_handle->get_algorithm() == - // KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { - // typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; - // sptrsvHandleType *sh = handle->get_sptrsv_handle(); - // auto nrows = sh->get_nrows(); - // - // KokkosSparse::Impl::sptrsvcuSPARSE_solve( - // sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); - // - //} else { + auto sptrsv_handle = handle_v[0]->get_sptrsv_handle(); + if (handle_v[0]->get_sptrsv_handle()->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + // NOTE: assume all streams use the same SPTRSV_CUSPARSE algo. + KokkosSparse::Impl::sptrsvcuSPARSE_solve_streams< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>(execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, x_i_v, false); + + } else { KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, - Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, - rowmap_i_v, entries_i_v, values_i_v, - b_i_v, x_i_v); - //} + Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, x_i_v); + } } // sptrsv_solve_streams From c025caacd6dd2eaa30739e0ba6f9ce9f06be6913 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Wed, 22 Mar 2023 19:42:08 +0100 Subject: [PATCH 243/442] Port blas3 gemm test --- perf_test/blas/blas3/CMakeLists.txt | 6 + ...s3_gemm_standalone_perf_test_benchmark.cpp | 205 
++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index 90097b86f8..80c9d25c1c 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -12,3 +12,9 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosBlas3_gemm_standalone_perf_test.cpp ) +IF(KokkosKernels_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + Blas3_gemm_benchmark + SOURCES KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp + ) +ENDIF() diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp new file mode 100644 index 0000000000..012edcb042 --- /dev/null +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -0,0 +1,205 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosBlas3_gemm.hpp" +#include +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" +#include "Benchmark_Context.hpp" +#include + +struct blas3_gemm_params : public perf_test::CommonInputParams { + int m = 1000; + int n = 1000; + int k = 1000; + + static blas3_gemm_params get_params(int& argc, char** argv) { + blas3_gemm_params params; + perf_test::parse_common_options(argc, argv, params); + + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.m)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.n)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--k", params.k)) { + ++i; + } else if (std::string(argv[i]).find("--benchmark") == 0) { + continue; // ignore benchmark arguments + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + exit(1); + } + } + return params; + } + + static void print_options() { + std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --m :: Rows in A (default 1000)" + << std::endl; + std::cerr + << "\t[Optional] --n :: Columns in A / Rows in B (default 1000)" + << std::endl; + std::cerr << "\t[Optional] --k :: Columns in B (default 1000)" + << std::endl; + } +}; + +template +static void KokkosBlas3_GEMM(benchmark::State& state) { + const auto m = state.range(0); + const auto n = state.range(1); + const auto k = state.range(2); + + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + Kokkos::View A( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A"), m, n); + Kokkos::View B( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), n, k); + Kokkos::View C( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C"), m, k); + Kokkos::Random_XorShift64_Pool pool(123); + 
Kokkos::fill_random(A, pool, 10.0); + Kokkos::fill_random(B, pool, 10.0); + + // Do a warm-up run + KokkosBlas::gemm("N", "N", 1.0, A, B, 0.0, C); + Kokkos::fence(); + double total_time = 0.0; + + for (auto _ : state) { + Kokkos::Timer timer; + KokkosBlas::gemm("N", "N", 1.0, A, B, 0.0, C); + ExecSpace().fence(); + + double time = timer.seconds(); + total_time += time; + state.SetIterationTime(time); + } + + state.counters[ExecSpace::name()] = 1; + state.counters["Avg GEMM time (s):"] = + benchmark::Counter(total_time, benchmark::Counter::kAvgIterations); + size_t flopsPerRun = (size_t)2 * m * n * k; + state.counters["Avg GEMM (FLOP/s):"] = benchmark::Counter( + flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); +} + +template +void run(const blas3_gemm_params& params) { + using LL = Kokkos::LayoutLeft; + using LR = Kokkos::LayoutRight; + using Scalar = double; + + const auto name = "KokkosBlas3_GEMM"; + const auto arg_names = std::vector{"m", "n", "k"}; + const auto args = std::vector{params.m, params.n, params.k}; + + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); + KokkosKernelsBenchmark::register_benchmark( + name, KokkosBlas3_GEMM, arg_names, args, + params.repeat); +} + +int main(int argc, char** argv) { + const auto params = blas3_gemm_params::get_params(argc, argv); + const int num_threads = params.use_openmp; + const int device_id = params.use_cuda - 1; + + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + if (params.use_threads) { +#if defined(KOKKOS_ENABLE_THREADS) + 
run(params); +#else + std::cout << "ERROR: PThreads requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_openmp) { +#if defined(KOKKOS_ENABLE_OPENMP) + run(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_cuda) { +#if defined(KOKKOS_ENABLE_CUDA) + run(params); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_hip) { +#if defined(KOKKOS_ENABLE_HIP) + run(params); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + + if (params.use_sycl) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + + // use serial if no backend is specified + if (!params.use_cuda and !params.use_hip and !params.use_openmp and + !params.use_sycl and !params.use_threads) { +#if defined(KOKKOS_ENABLE_SERIAL) + run(params); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + return 0; +} From 1a6f22b1c5211496b02650646697ebbf427a58dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Tue, 11 Apr 2023 21:30:08 +0200 Subject: [PATCH 244/442] Report layouts used --- ...KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index 012edcb042..778e1e478d 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -103,6 +103,16 @@ static void KokkosBlas3_GEMM(benchmark::State& state) { size_t flopsPerRun = (size_t)2 * m * n * k; 
state.counters["Avg GEMM (FLOP/s):"] = benchmark::Counter( flopsPerRun, benchmark::Counter::kIsIterationInvariantRate); + if constexpr (std::is_same_v) { + state.counters["Memory Layout in A: LayoutLeft"] = 1; + } else { + state.counters["Memory Layout in A: LayoutRight"] = 1; + } + if constexpr (std::is_same_v) { + state.counters["Memory Layout in B: LayoutLeft"] = 1; + } else { + state.counters["Memory Layout in B: LayoutRight"] = 1; + } } template From 507c29f685920b2278c77dd45b3a9fe3f7260771 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 14 Apr 2023 12:39:54 -0600 Subject: [PATCH 245/442] par_ilut: make Ut_values view atomic in compute_l_u_factors (#1781) * par_ilut: make Ut_values view atomic in compute_l_u_factors ... to fix the race issues when async updates are on. * With Ut atomic, no need to avoid async updates on GPU * Remove unnecessary header * Update comments * Fixes for complex scalars * Adjust async update views; default it to off * Fix UtValuesSafeType * Update sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp * Remove UtViewType in favor of std::conditional --- .../KokkosSparse_par_ilut_numeric_impl.hpp | 65 ++++++++++++++----- sparse/src/KokkosKernels_Handle.hpp | 2 +- sparse/src/KokkosSparse_par_ilut_handle.hpp | 2 - 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 89dcd12c5b..c482aff429 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -420,7 +420,8 @@ struct IlutWrap { const auto l_col = L_entries(l_row_nnz); const auto u_row = Ut_entries(ut_row_nnz); if (l_col == u_row && l_col < last_entry) { - sum += L_values(l_row_nnz) * Ut_values(ut_row_nnz); + const scalar_t ut_val = Ut_values(ut_row_nnz); + sum += L_values(l_row_nnz) * ut_val; } if (static_cast(u_row) == row_idx) { ut_nnz = ut_row_nnz; @@ -440,17 +441,32 @@ struct IlutWrap { * make 
this function determistic, but that could cause par_ilut * to take longer (more iterations) to converge. */ - template - static void compute_l_u_factors( + template + static void compute_l_u_factors_impl( IlutHandle& ih, const ARowMapType& A_row_map, const AEntriesType& A_entries, const AValuesType& A_values, LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_row_map, UEntriesType& U_entries, UValuesType& U_values, UtRowMapType& Ut_row_map, UtEntriesType& Ut_entries, - UtValuesType& Ut_values, const bool async_update) { + UtValuesType& Ut_values_arg) { + // UtValues needs to be Atomic if async updates are on. Otherwise, + // non-atomic is fine. + using UtValuesSafeType = std::conditional_t< + async_update, + Kokkos::View< + typename UtValuesType::non_const_value_type*, + typename UtValuesType::array_layout, + typename UtValuesType::device_type, + Kokkos::MemoryTraits >, + UtValuesType>; + + UtValuesSafeType Ut_values = Ut_values_arg; + const size_type nrows = ih.get_nrows(); Kokkos::parallel_for( "compute_l_u_factors", range_policy(0, nrows), @@ -460,8 +476,8 @@ struct IlutWrap { L_row_map(row_idx + 1) - 1; // skip diagonal for L for (auto l_nnz = l_row_nnz_begin; l_nnz < l_row_nnz_end; ++l_nnz) { - const auto col_idx = L_entries(l_nnz); - const auto u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); + const auto col_idx = L_entries(l_nnz); + const scalar_t u_diag = Ut_values(Ut_row_map(col_idx + 1) - 1); if (u_diag != 0.0) { const auto new_val = compute_sum(row_idx, col_idx, A_row_map, A_entries, A_values, @@ -487,8 +503,7 @@ struct IlutWrap { // ut_nnz is not guarateed to fail into range used exclusively // by this thread. Updating it here opens up potential race - // conditions that cause problems on GPU but usually causes - // faster convergence. + // conditions but usually causes faster convergence. 
if (async_update) { Ut_values(ut_nnz) = new_val; } @@ -496,6 +511,28 @@ struct IlutWrap { }); } + template + static void compute_l_u_factors( + IlutHandle& ih, const ARowMapType& A_row_map, + const AEntriesType& A_entries, const AValuesType& A_values, + LRowMapType& L_row_map, LEntriesType& L_entries, LValuesType& L_values, + URowMapType& U_row_map, UEntriesType& U_entries, UValuesType& U_values, + UtRowMapType& Ut_row_map, UtEntriesType& Ut_entries, + UtValuesType& Ut_values, const bool async_update) { + if (async_update) { + compute_l_u_factors_impl( + ih, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, Ut_row_map, Ut_entries, Ut_values); + } else { + compute_l_u_factors_impl( + ih, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, Ut_row_map, Ut_entries, Ut_values); + } + } + /** * Select threshold based on filter rank. Do all this on host */ @@ -794,10 +831,8 @@ struct IlutWrap { thandle.get_residual_norm_delta_stop(); const size_type max_iter = thandle.get_max_iter(); - const auto verbose = thandle.get_verbose(); - constexpr bool on_gpu = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - const auto async_update = !on_gpu && thandle.get_async_update(); + const auto verbose = thandle.get_verbose(); + const auto async_update = false; // thandle.get_async_update(); if (verbose) { std::cout << "Starting PARILUT with..." 
<< std::endl; diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 1f080b7bce..dc76ee23d7 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -872,7 +872,7 @@ class KokkosKernelsHandle { const typename PAR_ILUTHandleType::float_t residual_norm_delta_stop = 1e-2, const typename PAR_ILUTHandleType::float_t fill_in_limit = 0.75, - const bool async_update = true, const bool verbose = false) { + const bool async_update = false, const bool verbose = false) { this->destroy_par_ilut_handle(); this->is_owner_of_the_par_ilut_handle = true; this->par_ilutHandle = diff --git a/sparse/src/KokkosSparse_par_ilut_handle.hpp b/sparse/src/KokkosSparse_par_ilut_handle.hpp index 7ae11d297c..3ffe44ffca 100644 --- a/sparse/src/KokkosSparse_par_ilut_handle.hpp +++ b/sparse/src/KokkosSparse_par_ilut_handle.hpp @@ -82,8 +82,6 @@ class PAR_ILUTHandle { bool async_update; /// Whether compute LU factors should do asychronous /// updates. When ON, the algorithm will usually converge /// faster but it makes the algorithm non-deterministic. - /// This will always be OFF for GPU since it doesn't work - /// there. 
bool verbose; /// Print information while executing par_ilut // Stored by parent KokkosKernelsHandle From e3a42e418d7de87f813f20398b1c45e23b41218e Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Sun, 16 Apr 2023 06:09:43 -0700 Subject: [PATCH 246/442] Fix compile errors --- sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp | 4 ++-- sparse/src/KokkosSparse_sptrsv.hpp | 2 +- sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 52e3442896..0004c565ce 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -102,7 +102,7 @@ struct SPTRSV_SOLVE { static void sptrsv_solve_streams( const std::vector &execspace_v, - const std::vector &handle_v, + std::vector &handle_v, const std::vector &row_map_v, const std::vector &entries_v, const std::vector &values_v, @@ -170,7 +170,7 @@ struct SPTRSV_SOLVE &execspace_v, - const std::vector &handle_v, + std::vector &handle_v, const std::vector &row_map_v, const std::vector &entries_v, const std::vector &values_v, diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 94636a9fd7..6bc2c04678 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -453,7 +453,7 @@ void sptrsv_solve_streams(const std::vector& execspace_v, KokkosKernels::Impl::throw_runtime_exception(os.str()); } - using c_size_t = typename KernelHandle::const_size_type c_size_t; + using c_size_t = typename KernelHandle::const_size_type; using c_lno_t = typename KernelHandle::const_nnz_lno_t; using c_scalar_t = typename KernelHandle::const_nnz_scalar_t; using c_exec_t = typename KernelHandle::HandleExecSpace; diff --git a/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp index e83611026e..1a22146d01 100644 --- 
a/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_sptrsv_solve_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template struct sptrsv_solve_tpl_spec_avail { enum : bool { value = false }; From 03f48fae63a8b7c688475477057d7b116cd6e898 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 17 Apr 2023 09:04:21 -0600 Subject: [PATCH 247/442] BLAS: fixes and testing for LayoutStride (#1794) * Re-enable LayoutStride blas1 tests * Better utility for testing LayoutStride (called view_stride_adapter). * BLAS tests: finish refactoring, enabling LayoutStride * Update view_stride_adapter comments * BLAS tests: remove INST_LAYOUTSTRIDE as #if condition There is no cmake option to instantiate for LayoutStride, so don't check for it in the tests. Just test on LayoutStride if test-eti-only is off. --- blas/src/KokkosBlas1_iamax.hpp | 7 +- blas/src/KokkosBlas1_nrm2.hpp | 7 +- blas/src/KokkosBlas1_nrm2w.hpp | 7 +- blas/src/KokkosBlas1_nrm2w_squared.hpp | 6 +- blas/src/KokkosBlas1_nrminf.hpp | 7 +- blas/src/KokkosBlas1_sum.hpp | 7 +- blas/unit_test/Test_Blas1_abs.hpp | 109 +++----- blas/unit_test/Test_Blas1_asum.hpp | 31 +-- blas/unit_test/Test_Blas1_axpby.hpp | 130 +++------- blas/unit_test/Test_Blas1_axpy.hpp | 122 +++------ blas/unit_test/Test_Blas1_dot.hpp | 143 +++++------ blas/unit_test/Test_Blas1_iamax.hpp | 107 ++++---- blas/unit_test/Test_Blas1_mult.hpp | 220 +++++++--------- blas/unit_test/Test_Blas1_nrm1.hpp | 65 ++--- blas/unit_test/Test_Blas1_nrm2.hpp | 79 +++--- blas/unit_test/Test_Blas1_nrm2_squared.hpp | 57 ++--- blas/unit_test/Test_Blas1_nrm2w.hpp | 90 +++---- blas/unit_test/Test_Blas1_nrm2w_squared.hpp | 91 +++---- blas/unit_test/Test_Blas1_nrminf.hpp | 80 +++--- blas/unit_test/Test_Blas1_reciprocal.hpp | 193 +++++--------- blas/unit_test/Test_Blas1_scal.hpp | 174 +++++-------- blas/unit_test/Test_Blas1_sum.hpp | 79 +++--- 
blas/unit_test/Test_Blas1_team_abs.hpp | 130 ++++------ blas/unit_test/Test_Blas1_team_axpby.hpp | 144 ++++------- blas/unit_test/Test_Blas1_team_axpy.hpp | 142 ++++------- blas/unit_test/Test_Blas1_team_dot.hpp | 153 +++++------- blas/unit_test/Test_Blas1_team_mult.hpp | 227 +++++++---------- blas/unit_test/Test_Blas1_team_nrm2.hpp | 27 +- blas/unit_test/Test_Blas1_team_scal.hpp | 152 ++++-------- blas/unit_test/Test_Blas1_team_update.hpp | 216 +++++++--------- blas/unit_test/Test_Blas1_update.hpp | 262 ++++++++------------ blas/unit_test/Test_Blas2_gemv.hpp | 134 +++++----- test_common/KokkosKernels_TestUtils.hpp | 111 ++++++--- 33 files changed, 1330 insertions(+), 2179 deletions(-) diff --git a/blas/src/KokkosBlas1_iamax.hpp b/blas/src/KokkosBlas1_iamax.hpp index 22411a70bb..11ae267855 100644 --- a/blas/src/KokkosBlas1_iamax.hpp +++ b/blas/src/KokkosBlas1_iamax.hpp @@ -49,13 +49,14 @@ typename XVector::size_type iamax(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; index_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; Impl::Iamax::iamax(R, X); diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index a8e56d95cf..b8e12c4f5f 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -49,13 +49,14 @@ nrm2(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; Impl::Nrm2::nrm2(R, X, true); diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index bf952c77a5..6a78a49bd2 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -49,13 +49,14 @@ nrm2w(const 
XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index a65dad9b0f..0a5f29011d 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -50,12 +50,14 @@ nrm2w_squared(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; diff --git a/blas/src/KokkosBlas1_nrminf.hpp b/blas/src/KokkosBlas1_nrminf.hpp index bd4bf080a9..d0f4d25eab 100644 --- a/blas/src/KokkosBlas1_nrminf.hpp +++ b/blas/src/KokkosBlas1_nrminf.hpp @@ -48,13 +48,14 @@ nrminf(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; mag_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; Impl::NrmInf::nrminf(R, X); diff --git a/blas/src/KokkosBlas1_sum.hpp b/blas/src/KokkosBlas1_sum.hpp index 0214feaf15..6db51d7f54 100644 --- a/blas/src/KokkosBlas1_sum.hpp +++ b/blas/src/KokkosBlas1_sum.hpp @@ -44,14 +44,15 @@ typename XVector::non_const_value_type sum(const XVector& x) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; typename XVector::non_const_value_type result; - RVector_Internal R = RVector_Internal(&result); + RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = 
x; Impl::Sum::sum(R, X); diff --git a/blas/unit_test/Test_Blas1_abs.hpp b/blas/unit_test/Test_Blas1_abs.hpp index 6ed2f9dbb3..8a2c7e3374 100644 --- a/blas/unit_test/Test_Blas1_abs.hpp +++ b/blas/unit_test/Test_Blas1_abs.hpp @@ -26,36 +26,11 @@ void impl_test_abs(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits AT; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - typename AT::mag_type eps = AT::epsilon() * 10; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Org_Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -63,33 +38,34 @@ void impl_test_abs(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::deep_copy(org_y.h_base, y.d_base); - Kokkos::deep_copy(h_b_x, b_x); - 
Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); // Run with nonconst input - KokkosBlas::abs(y, x); + KokkosBlas::abs(y.d_view, x.d_view); // Copy result to host (h_y is subview of h_b_y) - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(h_y(i), AT::abs(h_x(i)), eps * AT::abs(h_x(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), + eps * AT::abs(x.h_view(i))); } // Run with const input // Reset output - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::abs(y, c_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::abs(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(h_y(i), AT::abs(h_x(i)), eps * AT::abs(h_x(i))); + EXPECT_NEAR_KK(y.h_view(i), AT::abs(x.h_view(i)), + eps * AT::abs(x.h_view(i))); } } @@ -99,24 +75,9 @@ void impl_test_abs_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -124,38 +85,38 @@ void impl_test_abs_mv(int N, int K) { { ScalarA randStart, randEnd; 
Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); + Kokkos::deep_copy(org_y.h_base, y.d_base); - typename ViewTypeA::const_type c_x = x; + Kokkos::deep_copy(x.h_base, x.d_base); typename AT::mag_type eps = AT::epsilon() * 10; // Test and verify non-const input - KokkosBlas::abs(y, x); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::abs(y.d_view, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(h_y(i, j), AT::abs(h_x(i, j)), eps * AT::abs(h_x(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), + eps * AT::abs(x.h_view(i, j))); } } // Test and verify const input // Reset y - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::abs(y, c_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::abs(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(h_y(i, j), AT::abs(h_x(i, j)), eps * AT::abs(h_x(i, j))); + EXPECT_NEAR_KK(y.h_view(i, j), AT::abs(x.h_view(i, j)), + eps * AT::abs(x.h_view(i, j))); } } } @@ -185,8 +146,7 @@ int test_abs() { // Test::impl_test_abs(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -229,8 +189,7 @@ int test_abs_mv() { // Test::impl_test_abs_mv(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_asum.hpp b/blas/unit_test/Test_Blas1_asum.hpp index b1d617061b..e914c9a19a 100644 --- a/blas/unit_test/Test_Blas1_asum.hpp +++ b/blas/unit_test/Test_Blas1_asum.hpp @@ -26,32 +26,17 @@ void impl_test_asum(int N) { typedef Kokkos::ArithTraits AT; typedef Kokkos::ArithTraits MAT; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - - BaseTypeA b_a("A", N); - - ViewTypeA a = Kokkos::subview(b_a, Kokkos::ALL(), 0); - - typename BaseTypeA::HostMirror h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = Kokkos::subview(h_b_a, Kokkos::ALL(), 0); + view_stride_adapter a("A", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; typename AT::mag_type expected_result = 0; @@ -61,13 +46,14 @@ void impl_test_asum(int N) { // parts. // // This is safe; ArithTraits::imag is 0 if T is real. 
- expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + expected_result += + MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } - typename AT::mag_type nonconst_result = KokkosBlas::asum(a); + typename AT::mag_type nonconst_result = KokkosBlas::asum(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::asum(c_a); + typename AT::mag_type const_result = KokkosBlas::asum(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -95,8 +81,7 @@ int test_asum() { // Test::impl_test_asum(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_asum(0); diff --git a/blas/unit_test/Test_Blas1_axpby.hpp b/blas/unit_test/Test_Blas1_axpby.hpp index 79a244fc6e..0d34464a84 100644 --- a/blas/unit_test/Test_Blas1_axpby.hpp +++ b/blas/unit_test/Test_Blas1_axpby.hpp @@ -27,19 +27,6 @@ void impl_test_axpby(int N) { using ScalarB = typename ViewTypeB::value_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - using BaseTypeA = Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - using BaseTypeB = Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - ScalarA a = 3; ScalarB b = 5; // eps should probably be based on ScalarB since that is the type @@ -51,22 +38,9 @@ void impl_test_axpby(int N) { Kokkos::ArithTraits::abs(b)) * max_val * eps; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - auto h_b_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = 
Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Org_Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -74,34 +48,28 @@ void impl_test_axpby(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); - } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - Kokkos::deep_copy(h_b_org_y, b_org_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - Kokkos::deep_copy(h_b_x, b_x); - - // Run with non-const input (x) and verify - KokkosBlas::axpby(a, x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + // Run with non-const input and verify + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), - h_y(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), + y.h_view(i), 2 * max_error); } - Kokkos::deep_copy(b_y, b_org_y); - // Run again with const input (c_x) - KokkosBlas::axpby(a, c_x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + // Re-randomize y + Kokkos::deep_copy(y.d_base, org_y.h_base); + // Run again with const input + KokkosBlas::axpby(a, x.d_view_const, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for 
(int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) + b * h_b_org_y(i, 0)), - h_y(i), 2 * max_error); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * org_y.h_view(i)), + y.h_view(i), 2 * max_error); } } @@ -111,24 +79,9 @@ void impl_test_axpby_mv(int N, int K) { using ScalarB = typename ViewTypeB::value_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); ScalarA a = 3; ScalarB b = 5; @@ -145,44 +98,39 @@ void impl_test_axpby_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - ViewTypeB org_y = vfB_type::view(b_org_y); - auto h_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - - typename ViewTypeA::const_type c_x = x; + 
Kokkos::deep_copy(org_y.h_base, y.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); Kokkos::View r("Dot::Result", K); - KokkosBlas::axpby(a, x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), + y.h_view(i, j), 2 * max_error); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::axpby(a, c_x, b, y); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::axpby(a, x.d_view_const, b, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * org_y.h_view(i, j)), + y.h_view(i, j), 2 * max_error); } } } @@ -212,8 +160,7 @@ int test_axpby() { Test::impl_test_axpby(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -256,8 +203,7 @@ int test_axpby_mv() { Test::impl_test_axpby_mv(132231, 5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_axpy.hpp b/blas/unit_test/Test_Blas1_axpy.hpp index 890e116584..8b21ff6dc5 100644 --- a/blas/unit_test/Test_Blas1_axpy.hpp +++ b/blas/unit_test/Test_Blas1_axpy.hpp @@ -27,19 +27,6 @@ void impl_test_axpy(int N) { using ScalarB = 
typename ViewTypeB::value_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - using BaseTypeA = Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - using BaseTypeB = Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device>; - ScalarA a = 3; const MagnitudeB max_val = 10; const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); @@ -48,20 +35,9 @@ void impl_test_axpy(int N) { max_val) * eps; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Org_Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -69,35 +45,32 @@ void impl_test_axpy(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - auto h_b_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - 
Kokkos::deep_copy(h_b_x, b_x); - - KokkosBlas::axpy(a, x, y); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::axpy(a, x.d_view, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - ScalarB expected = a * h_x(i) + h_b_org_y(i, 0); - EXPECT_NEAR_KK(expected, h_y(i), 2 * max_error); + ScalarB expected = a * x.h_view(i) + org_y.h_view(i); + EXPECT_NEAR_KK(expected, y.h_view(i), 2 * max_error); } // reset y to orig, and run again with const-valued x - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::axpy(a, c_x, y); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::axpy(a, x.d_view_const, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - ScalarB expected = a * h_x(i) + h_b_org_y(i, 0); - EXPECT_NEAR_KK(expected, h_y(i), 2 * max_error); + ScalarB expected = a * x.h_view(i) + org_y.h_view(i); + EXPECT_NEAR_KK(expected, y.h_view(i), 2 * max_error); } } @@ -107,24 +80,9 @@ void impl_test_axpy_mv(int N, int K) { using ScalarB = typename ViewTypeB::value_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); ScalarA a = 3; const MagnitudeB eps = 
Kokkos::ArithTraits::epsilon(); @@ -140,40 +98,36 @@ void impl_test_axpy_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_y, b_y); - ViewTypeB org_y = vfB_type::view(b_org_y); - auto h_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); + Kokkos::deep_copy(org_y.h_base, y.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - - typename ViewTypeA::const_type c_x = x; - - KokkosBlas::axpy(a, x, y); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::axpy(a, x.d_view, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), + y.h_view(i, j), 2 * max_error); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::axpy(a, c_x, y); - Kokkos::deep_copy(h_b_y, b_y); + // reset y to orig, and run again with const-valued x + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::axpy(a, x.d_view, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + h_org_y(i, j)), - h_y(i, j), 2 * max_error); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + org_y.h_view(i, j)), + y.h_view(i, j), 2 * max_error); } } } @@ -203,8 +157,7 @@ int test_axpy() { // Test::impl_test_axpy(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -247,8 +200,7 @@ int test_axpy_mv() { // Test::impl_test_axpy_mv(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_dot.hpp b/blas/unit_test/Test_Blas1_dot.hpp index b2dfc1bd41..d978cbafaa 100644 --- a/blas/unit_test/Test_Blas1_dot.hpp +++ b/blas/unit_test/Test_Blas1_dot.hpp @@ -27,11 +27,8 @@ void impl_test_dot(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits ats; - ViewTypeA a("a", N); - ViewTypeB b("b", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); - typename ViewTypeB::HostMirror h_b = Kokkos::create_mirror_view(b); + view_stride_adapter a("a", N); + view_stride_adapter b("b", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -39,34 +36,33 @@ void impl_test_dot(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b, rand_pool, randStart, randEnd); + Kokkos::fill_random(b.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(h_a, a); - Kokkos::deep_copy(h_b, b); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) expected_result += ats::conj(h_a(i)) * h_b(i); + for (int i = 0; i < N; i++) + expected_result += ats::conj(a.h_view(i)) * b.h_view(i); - ScalarA nonconst_nonconst_result = KokkosBlas::dot(a, b); + ScalarA 
nonconst_nonconst_result = KokkosBlas::dot(a.d_view, b.d_view); double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; - ScalarA const_const_result = KokkosBlas::dot(c_a, c_b); + ScalarA const_const_result = KokkosBlas::dot(a.d_view_const, b.d_view_const); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); - ScalarA nonconst_const_result = KokkosBlas::dot(a, c_b); + ScalarA nonconst_const_result = KokkosBlas::dot(a.d_view, b.d_view_const); EXPECT_NEAR_KK(nonconst_const_result, expected_result, eps * expected_result); - ScalarA const_nonconst_result = KokkosBlas::dot(c_a, b); + ScalarA const_nonconst_result = KokkosBlas::dot(a.d_view_const, b.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -76,23 +72,8 @@ void impl_test_dot_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits ats; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfB_type::BaseType b_b("B", N, K); - - ViewTypeA a = vfA_type::view(b_a); - ViewTypeB b = vfB_type::view(b_b); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfB_type::BaseType h_b_b = Kokkos::create_mirror_view(b_b); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeB::HostMirror h_b = h_vfB_type::view(h_b_b); + view_stride_adapter a("A", N, K); + view_stride_adapter b("B", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -100,32 +81,29 @@ void impl_test_dot_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, 
rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_b, rand_pool, randStart, randEnd); + Kokkos::fill_random(b.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_b, b_b); - - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA* expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ats::conj(h_a(i, j)) * h_b(i, j); + expected_result[j] += ats::conj(a.h_view(i, j)) * b.h_view(i, j); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; Kokkos::View r("Dot::Result", K); - KokkosBlas::dot(r, a, b); + KokkosBlas::dot(r, a.d_view, b.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); @@ -133,7 +111,7 @@ void impl_test_dot_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::dot(r, c_a, c_b); + KokkosBlas::dot(r, a.d_view_const, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); @@ -141,7 +119,7 @@ void impl_test_dot_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::dot(r, a, c_b); + KokkosBlas::dot(r, a.d_view, b.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); @@ -149,7 +127,7 @@ void impl_test_dot_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::dot(r, c_a, b); + KokkosBlas::dot(r, a.d_view_const, b.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); @@ -185,26 +163,21 @@ int test_dot() { // Test::impl_test_dot(132231); #endif - // Removing the layout stride test as ViewTypeA a("a", N); - // is invalid since the view constructor needs a stride 
object! - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_dot(0); - Test::impl_test_dot(13); - Test::impl_test_dot(1024); - // Test::impl_test_dot(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_dot(1024); - Test::impl_test_dot(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_dot(0); + Test::impl_test_dot(13); + Test::impl_test_dot(1024); + // Test::impl_test_dot(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_dot(1024); + Test::impl_test_dot(1024); +#endif return 1; } @@ -235,28 +208,24 @@ int test_dot_mv() { // Test::impl_test_dot_mv(132231,5); #endif - // Removing the layout stride test as ViewTypeA a("a", N); - // is invalid since the view constructor needs a stride object! - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); Test::impl_test_dot_mv(789, 1); - // Test::impl_test_dot_mv(132231,5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(1024, 5); - #endif - */ +// Removing the layout stride test as ViewTypeA a("a", N); +// is invalid since the view constructor needs a stride object! 
+#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_dot_mv(0, 5); + Test::impl_test_dot_mv(13, 5); + Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); + // Test::impl_test_dot_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_iamax.hpp b/blas/unit_test/Test_Blas1_iamax.hpp index 2b9885e30f..fcd896e22a 100644 --- a/blas/unit_test/Test_Blas1_iamax.hpp +++ b/blas/unit_test/Test_Blas1_iamax.hpp @@ -27,25 +27,21 @@ void impl_test_iamax(int N) { typedef typename AT::mag_type mag_type; using size_type = typename ViewTypeA::size_type; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("X", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_a, a); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); mag_type expected_result = Kokkos::ArithTraits::min(); size_type expected_max_loc = 0; for (int i = 0; i < N; i++) { - mag_type val = AT::abs(h_a(i)); + mag_type val = AT::abs(a.h_view(i)); if (val > expected_result) { expected_result = val; expected_max_loc = i + 1; @@ -60,10 +56,10 @@ void impl_test_iamax(int N) { { // printf("impl_test_iamax -- return result as a scalar on host -- N %d\n", // N); - size_type nonconst_max_loc = KokkosBlas::iamax(a); + size_type nonconst_max_loc = KokkosBlas::iamax(a.d_view); ASSERT_EQ(nonconst_max_loc, expected_max_loc); - size_type const_max_loc = KokkosBlas::iamax(c_a); + size_type 
const_max_loc = KokkosBlas::iamax(a.d_view_const); ASSERT_EQ(const_max_loc, expected_max_loc); } @@ -73,14 +69,15 @@ void impl_test_iamax(int N) { typedef Kokkos::View ViewType0D; - ViewType0D r("Iamax::Result 0-D View on host"); + ViewType0D r("Iamax::Result 0-D View on host", + typename ViewTypeA::array_layout()); - KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); size_type nonconst_max_loc = r(); ASSERT_EQ(nonconst_max_loc, expected_max_loc); - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); size_type const_max_loc = r(); ASSERT_EQ(const_max_loc, expected_max_loc); } @@ -90,19 +87,20 @@ void impl_test_iamax(int N) { // %d\n", N); typedef Kokkos::View ViewType0D; - ViewType0D r("Iamax::Result 0-D View on device"); + ViewType0D r("Iamax::Result 0-D View on device", + typename ViewTypeA::array_layout()); typename ViewType0D::HostMirror h_r = Kokkos::create_mirror_view(r); size_type nonconst_max_loc, const_max_loc; - KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::deep_copy(h_r, r); nonconst_max_loc = h_r(); ASSERT_EQ(nonconst_max_loc, expected_max_loc); - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); Kokkos::deep_copy(h_r, r); const_max_loc = h_r(); @@ -118,28 +116,16 @@ void impl_test_iamax_mv(int N, int K) { typedef typename AT::mag_type mag_type; typedef typename ViewTypeA::size_type size_type; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_b_a, b_a); + 
Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); mag_type* expected_result = new mag_type[K]; size_type* expected_max_loc = new size_type[K]; @@ -147,7 +133,7 @@ void impl_test_iamax_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { - mag_type val = AT::abs(h_a(i, j)); + mag_type val = AT::abs(a.h_view(i, j)); if (val > expected_result[j]) { expected_result[j] = val; expected_max_loc[j] = i + 1; @@ -162,11 +148,13 @@ void impl_test_iamax_mv(int N, int K) { { // printf("impl_test_iamax_mv -- return results as a 1-D View on host -- N // %d\n", N); + Kokkos::View rcontig( + "Iamax::Result View on host", K); Kokkos::View - r("Iamax::Result View on host", K); + r = rcontig; - KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { @@ -175,7 +163,7 @@ void impl_test_iamax_mv(int N, int K) { ASSERT_EQ(nonconst_result, exp_result); } - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { @@ -188,13 +176,14 @@ void impl_test_iamax_mv(int N, int K) { { // printf("impl_test_iamax_mv -- return results as a 1-D View on device -- N // %d\n", N); - Kokkos::View r( - "Iamax::Result View on device", K); + Kokkos::View rcontig("Iamax::Result View on host", K); + Kokkos::View r = + rcontig; typename Kokkos::View::HostMirror h_r = - Kokkos::create_mirror_view(r); + Kokkos::create_mirror_view(rcontig); - KokkosBlas::iamax(r, a); + KokkosBlas::iamax(r, a.d_view); Kokkos::deep_copy(h_r, r); for (int k = 0; k < K; k++) { @@ -203,7 +192,7 @@ void impl_test_iamax_mv(int N, int K) { ASSERT_EQ(nonconst_result, exp_result); } - KokkosBlas::iamax(r, c_a); + KokkosBlas::iamax(r, a.d_view_const); Kokkos::deep_copy(h_r, r); for (int k = 0; k < K; k++) { @@ -240,17 +229,14 @@ int test_iamax() { // 
Test::impl_test_iamax(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_iamax(0); - Test::impl_test_iamax(13); - Test::impl_test_iamax(1024); - // Test::impl_test_iamax(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_iamax(0); + Test::impl_test_iamax(13); + Test::impl_test_iamax(1024); + // Test::impl_test_iamax(132231); +#endif return 1; } @@ -277,17 +263,14 @@ int test_iamax_mv() { // Test::impl_test_iamax_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_iamax_mv(0, 5); - Test::impl_test_iamax_mv(13, 5); - Test::impl_test_iamax_mv(1024, 5); - // Test::impl_test_iamax_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_iamax_mv(0, 5); + Test::impl_test_iamax_mv(13, 5); + Test::impl_test_iamax_mv(1024, 5); + // Test::impl_test_iamax_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_mult.hpp b/blas/unit_test/Test_Blas1_mult.hpp index 3c027f26e7..0888c7a6b2 100644 --- a/blas/unit_test/Test_Blas1_mult.hpp +++ b/blas/unit_test/Test_Blas1_mult.hpp @@ -31,17 +31,10 @@ void impl_test_mult(int N) { ScalarB b = 5; double eps = std::is_same::value ? 
1e-4 : 1e-7; - ViewTypeA x("X", N); - ViewTypeB y("Y", N); - ViewTypeC z("Y", N); - ViewTypeC b_org_z("Org_Z", N); - - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename ViewTypeA::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeB::HostMirror h_y = Kokkos::create_mirror_view(y); - typename ViewTypeC::HostMirror h_z = Kokkos::create_mirror_view(z); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -49,49 +42,48 @@ void impl_test_mult(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, z); - auto h_b_org_z = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); - - Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(org_z.h_base, z.d_base); - // expected_result = ScalarC(b*h_z(i) + a*h_x(i)*h_y(i)) + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - KokkosBlas::mult(b, z, a, x, y); - Kokkos::deep_copy(h_z, z); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), - h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + + b * org_z.h_view(i)), + 
z.h_view(i), eps); } - Kokkos::deep_copy(z, b_org_z); - KokkosBlas::mult(b, z, a, x, c_y); - Kokkos::deep_copy(h_z, z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view_const); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), - h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + + b * org_z.h_view(i)), + z.h_view(i), eps); } - Kokkos::deep_copy(z, b_org_z); - KokkosBlas::mult(b, z, a, c_x, c_y); - Kokkos::deep_copy(h_z, z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::mult(b, z.d_view, a, x.d_view_const, y.d_view_const); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i) * h_y(i) + b * h_b_org_z(i)), - h_z(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i) + + b * org_z.h_view(i)), + z.h_view(i), eps); } } @@ -101,26 +93,11 @@ void impl_test_mult_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - ViewTypeA x("X", N); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType b_org_z("Z", N, K); - - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); + // x is rank-1, all others are rank-2 + view_stride_adapter x("X", N); + 
view_stride_adapter y("Y", N, K); + view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -128,52 +105,46 @@ void impl_test_mult_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, b_z); - auto h_b_org_z = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); - - Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; double eps = std::is_same::value ? 
1e-4 : 1e-7; - KokkosBlas::mult(b, z, a, x, y); - Kokkos::deep_copy(h_b_z, b_z); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + + b * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::mult(b, z, a, x, c_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::mult(b, z.d_view, a, x.d_view, y.d_view_const); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) * h_y(i, j) + b * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) * y.h_view(i, j) + + b * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } } @@ -213,27 +184,28 @@ int test_mult() { // Device>(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_mult( 0); Test::impl_test_mult( 13); Test::impl_test_mult( 1024); - // Test::impl_test_mult(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult( 1024); Test::impl_test_mult( 1024); #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_mult( + 0); + Test::impl_test_mult( + 13); + Test::impl_test_mult( + 1024); + // Test::impl_test_mult(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && 
\ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult( + 1024); + Test::impl_test_mult( + 1024); +#endif return 1; } @@ -272,30 +244,28 @@ int test_mult_mv() { // Device>(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View - view_type_b_ls; typedef Kokkos::View - view_type_c_ls; Test::impl_test_mult_mv(0, 5); Test::impl_test_mult_mv(13, 5); - Test::impl_test_mult_mv(1024, 5); - // Test::impl_test_mult_mv(132231,5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_mult_mv(1024, 5); - Test::impl_test_mult_mv(1024, 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_mult_mv(0, 5); + Test::impl_test_mult_mv(13, 5); + Test::impl_test_mult_mv(1024, 5); + // Test::impl_test_mult_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_mult_mv(1024, 5); + Test::impl_test_mult_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm1.hpp b/blas/unit_test/Test_Blas1_nrm1.hpp index e17f8b988a..5c99895a49 100644 --- a/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/blas/unit_test/Test_Blas1_nrm1.hpp @@ -27,20 +27,17 @@ void impl_test_nrm1(int N) { typedef typename AT::mag_type mag_type; typedef Kokkos::ArithTraits MAT; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("a", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + 
Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = (std::is_same::mag_type, float>::value ? 1e-4 @@ -53,13 +50,14 @@ void impl_test_nrm1(int N) { // parts. See netlib, MKL, and CUBLAS documentation. // // This is safe; ArithTraits::imag is 0 if T is real. - expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + expected_result += + MAT::abs(AT::real(a.h_view(i))) + MAT::abs(AT::imag(a.h_view(i))); } - mag_type nonconst_result = KokkosBlas::nrm1(a); + mag_type nonconst_result = KokkosBlas::nrm1(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - mag_type const_result = KokkosBlas::nrm1(c_a); + mag_type const_result = KokkosBlas::nrm1(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -70,28 +68,16 @@ void impl_test_nrm1_mv(int N, int K) { typedef typename AT::mag_type mag_type; typedef Kokkos::ArithTraits MAT; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); double eps = (std::is_same::mag_type, float>::value @@ -103,20 +89,19 @@ void impl_test_nrm1_mv(int N, int K) { for (int k = 0; k < K; k++) { expected_result(k) = MAT::zero(); for (int i = 0; i < N; i++) { - 
expected_result(k) += - MAT::abs(AT::real(h_a(i, k))) + MAT::abs(AT::imag(h_a(i, k))); + expected_result(k) += MAT::abs(AT::real(a.h_view(i, k))) + + MAT::abs(AT::imag(a.h_view(i, k))); } } Kokkos::View r("Nrm1::Result", K); Kokkos::View c_r("Nrm1::ConstResult", K); - KokkosBlas::nrm1(r, a); - KokkosBlas::nrm1(c_r, a); + KokkosBlas::nrm1(r, a.d_view); + KokkosBlas::nrm1(c_r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { EXPECT_NEAR_KK(r(k), expected_result(k), eps * expected_result(k)); - EXPECT_NEAR_KK(c_r(k), expected_result(k), eps * expected_result(k)); } } } // namespace Test @@ -143,17 +128,14 @@ int test_nrm1() { Test::impl_test_nrm1(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm1(0); - Test::impl_test_nrm1(13); - Test::impl_test_nrm1(1024); - Test::impl_test_nrm1(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm1(0); + Test::impl_test_nrm1(13); + Test::impl_test_nrm1(1024); + Test::impl_test_nrm1(132231); +#endif return 1; } @@ -182,8 +164,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(132231, 5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm1_mv(0, 5); diff --git a/blas/unit_test/Test_Blas1_nrm2.hpp b/blas/unit_test/Test_Blas1_nrm2.hpp index b7444b76df..1264cfecf2 100644 --- a/blas/unit_test/Test_Blas1_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_nrm2.hpp @@ -25,33 +25,30 @@ void impl_test_nrm2(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::ArithTraits AT; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = 
Kokkos::create_mirror_view(a); + view_stride_adapter a("a", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { - expected_result += AT::abs(h_a(i)) * AT::abs(h_a(i)); + expected_result += AT::abs(a.h_view(i)) * AT::abs(a.h_view(i)); } expected_result = Kokkos::ArithTraits::sqrt(expected_result); - typename AT::mag_type nonconst_result = KokkosBlas::nrm2(a); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::nrm2(c_a); + typename AT::mag_type const_result = KokkosBlas::nrm2(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -60,34 +57,22 @@ void impl_test_nrm2_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); - - typename ViewTypeA::const_type c_a = a; + 
Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); + expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); } expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); @@ -97,7 +82,7 @@ void impl_test_nrm2_mv(int N, int K) { Kokkos::View r("Dot::Result", K); - KokkosBlas::nrm2(r, a); + KokkosBlas::nrm2(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); @@ -105,7 +90,7 @@ void impl_test_nrm2_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::nrm2(r, c_a); + KokkosBlas::nrm2(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); @@ -138,17 +123,14 @@ int test_nrm2() { // Test::impl_test_nrm2(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2(0); - Test::impl_test_nrm2(13); - Test::impl_test_nrm2(1024); - // Test::impl_test_nrm2(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2(0); + Test::impl_test_nrm2(13); + Test::impl_test_nrm2(1024); + // Test::impl_test_nrm2(132231); +#endif return 1; } @@ -177,18 +159,15 @@ int test_nrm2_mv() { // Test::impl_test_nrm2_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrm2_mv(0, 5); - Test::impl_test_nrm2_mv(13, 5); - Test::impl_test_nrm2_mv(1024, 5); - Test::impl_test_nrm2_mv(789, 1); - // 
Test::impl_test_nrm2_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2_mv(0, 5); + Test::impl_test_nrm2_mv(13, 5); + Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); + // Test::impl_test_nrm2_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2_squared.hpp b/blas/unit_test/Test_Blas1_nrm2_squared.hpp index 7bfb46446f..c218a12d39 100644 --- a/blas/unit_test/Test_Blas1_nrm2_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2_squared.hpp @@ -25,43 +25,28 @@ void impl_test_nrm2_squared(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::ArithTraits AT; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - - BaseTypeA b_a("A", N); - - ViewTypeA a = Kokkos::subview(b_a, Kokkos::ALL(), 0); - - typename BaseTypeA::HostMirror h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = Kokkos::subview(h_b_a, Kokkos::ALL(), 0); + view_stride_adapter a("a", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; typename AT::mag_type expected_result(0); for (int i = 0; i < N; i++) { - expected_result += AT::abs(h_a(i)) * AT::abs(h_a(i)); + expected_result += AT::abs(a.h_view(i)) * AT::abs(a.h_view(i)); } - typename AT::mag_type nonconst_result = KokkosBlas::nrm2_squared(a); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2_squared(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::nrm2_squared(c_a); + typename AT::mag_type const_result = KokkosBlas::nrm2_squared(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -70,34 +55,22 @@ void impl_test_nrm2_squared_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); + expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); } } @@ -107,7 +80,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::View r("Dot::Result", K); - KokkosBlas::nrm2_squared(r, a); + 
KokkosBlas::nrm2_squared(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); @@ -118,7 +91,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { EXPECT_NEAR_KK(diff, zero, eps); } - KokkosBlas::nrm2_squared(r, c_a); + KokkosBlas::nrm2_squared(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); @@ -155,8 +128,7 @@ int test_nrm2_squared() { // Test::impl_test_nrm2_squared(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared(0); @@ -192,8 +164,7 @@ int test_nrm2_squared_mv() { // Test::impl_test_nrm2_squared_mv(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; Test::impl_test_nrm2_squared_mv(0, 5); diff --git a/blas/unit_test/Test_Blas1_nrm2w.hpp b/blas/unit_test/Test_Blas1_nrm2w.hpp index 8a3675cc5e..89c1bdad45 100644 --- a/blas/unit_test/Test_Blas1_nrm2w.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w.hpp @@ -26,11 +26,8 @@ void impl_test_nrm2w(int N) { using AT = Kokkos::ArithTraits; using MagnitudeA = typename AT::mag_type; - ViewTypeA a("A", N); - ViewTypeA w("W", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); - typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + view_stride_adapter a("A", N); + view_stride_adapter w("W", N); constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); @@ -42,21 +39,22 @@ void impl_test_nrm2w(int N) { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); - Kokkos::fill_random(w, rand_pool, 
AT::one(), randEnd); // Avoid divide by 0 + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(w.d_view, rand_pool, AT::one(), + randEnd); // Avoid divide by 0 - Kokkos::deep_copy(h_a, a); - Kokkos::deep_copy(h_w, w); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + typename AT::mag_type term = AT::abs(a.h_view(i)) / AT::abs(w.h_view(i)); expected_result += term * term; } expected_result = Kokkos::ArithTraits::sqrt(expected_result); - typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a, w); + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a.d_view, w.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } @@ -66,21 +64,8 @@ void impl_test_nrm2w_mv(int N, int K) { using AT = Kokkos::ArithTraits; using MagnitudeA = typename AT::mag_type; - using vfA_type = multivector_layout_adapter; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfA_type::BaseType b_w("W", N, K); - - ViewTypeA a = vfA_type::view(b_a); - ViewTypeA w = vfA_type::view(b_w); - - using h_vfA_type = multivector_layout_adapter; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + view_stride_adapter a("A", N, K); + view_stride_adapter w("W", N, K); constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); @@ -92,18 +77,19 @@ void impl_test_nrm2w_mv(int N, int K) { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - Kokkos::fill_random(b_w, rand_pool, AT::one(), + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + 
Kokkos::fill_random(w.d_view, rand_pool, AT::one(), randEnd); // Avoid dividing by 0 - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_w, b_w); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + typename AT::mag_type term = + AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } expected_result[j] = @@ -111,7 +97,7 @@ void impl_test_nrm2w_mv(int N, int K) { } Kokkos::View r("Dot::Result", K); - KokkosBlas::nrm2w(r, a, w); + KokkosBlas::nrm2w(r, a.d_view, w.d_view); auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); for (int k = 0; k < K; k++) { @@ -145,17 +131,14 @@ int test_nrm2w() { // Test::impl_test_nrm2(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w(0); - Test::impl_test_nrm2w(13); - Test::impl_test_nrm2w(1024); - // Test::impl_test_nrm2(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif return 1; } @@ -184,18 +167,15 @@ int test_nrm2w_mv() { // Test::impl_test_nrm2w_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrm2w_mv(0, 5); - Test::impl_test_nrm2w_mv(13, 5); - Test::impl_test_nrm2w_mv(1024, 5); - Test::impl_test_nrm2w_mv(789, 1); - // 
Test::impl_test_nrm2w_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp index 7d6c84def6..bacc733b1a 100644 --- a/blas/unit_test/Test_Blas1_nrm2w_squared.hpp +++ b/blas/unit_test/Test_Blas1_nrm2w_squared.hpp @@ -26,11 +26,8 @@ void impl_test_nrm2w_squared(int N) { using AT = Kokkos::ArithTraits; using MagnitudeA = typename AT::mag_type; - ViewTypeA a("A", N); - ViewTypeA w("W", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); - typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + view_stride_adapter a("A", N); + view_stride_adapter w("W", N); constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); @@ -41,19 +38,21 @@ void impl_test_nrm2w_squared(int N) { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); - Kokkos::fill_random(w, rand_pool, AT::one(), randEnd); // Avoid divide by 0 + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(w.d_view, rand_pool, AT::one(), + randEnd); // Avoid divide by 0 - Kokkos::deep_copy(h_a, a); - Kokkos::deep_copy(h_w, w); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type expected_result = 0; for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + typename AT::mag_type term = AT::abs(a.h_view(i)) / AT::abs(w.h_view(i)); expected_result += term * term; } - typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a, w); + typename AT::mag_type 
nonconst_result = + KokkosBlas::nrm2w_squared(a.d_view, w.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, max_error); } @@ -63,21 +62,8 @@ void impl_test_nrm2w_squared_mv(int N, int K) { using AT = Kokkos::ArithTraits; using MagnitudeA = typename AT::mag_type; - using vfA_type = multivector_layout_adapter; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfA_type::BaseType b_w("W", N, K); - - ViewTypeA a = vfA_type::view(b_a); - ViewTypeA w = vfA_type::view(b_w); - - using h_vfA_type = multivector_layout_adapter; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + view_stride_adapter a("A", N, K); + view_stride_adapter w("W", N, K); constexpr MagnitudeA max_val = 10; const MagnitudeA eps = AT::epsilon(); @@ -88,23 +74,24 @@ void impl_test_nrm2w_squared_mv(int N, int K) { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); - Kokkos::fill_random(b_w, rand_pool, AT::one(), randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); + Kokkos::fill_random(w.d_view, rand_pool, AT::one(), randEnd); - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_w, b_w); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(w.h_base, w.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) { - typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + typename AT::mag_type term = + AT::abs(a.h_view(i, j)) / AT::abs(w.h_view(i, j)); expected_result[j] += term * term; } } Kokkos::View r("Dot::Result", K); - KokkosBlas::nrm2w_squared(r, a, w); + KokkosBlas::nrm2w_squared(r, a.d_view, w.d_view); 
auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); for (int k = 0; k < K; k++) { @@ -138,17 +125,14 @@ int test_nrm2w_squared() { // Test::impl_test_nrm2(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrm2w_squared(0); - Test::impl_test_nrm2w_squared(13); - Test::impl_test_nrm2w_squared(1024); - // Test::impl_test_nrm2(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif return 1; } @@ -177,18 +161,15 @@ int test_nrm2w_squared_mv() { // Test::impl_test_nrm2w_squared_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrm2w_squared_mv(0, - 5); Test::impl_test_nrm2w_squared_mv(13, 5); - Test::impl_test_nrm2w_squared_mv(1024, 5); - Test::impl_test_nrm2w_squared_mv(789, 1); - // Test::impl_test_nrm2w_squared_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_nrminf.hpp b/blas/unit_test/Test_Blas1_nrminf.hpp index 9a8a79c115..438db16895 100644 --- a/blas/unit_test/Test_Blas1_nrminf.hpp +++ b/blas/unit_test/Test_Blas1_nrminf.hpp @@ -25,33 +25,31 @@ void impl_test_nrminf(int N) 
{ typedef typename ViewTypeA::non_const_value_type ScalarA; typedef Kokkos::ArithTraits AT; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("A", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; typename AT::mag_type expected_result = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) - if (AT::abs(h_a(i)) > expected_result) expected_result = AT::abs(h_a(i)); + if (AT::abs(a.h_view(i)) > expected_result) + expected_result = AT::abs(a.h_view(i)); if (N == 0) expected_result = typename AT::mag_type(0); - typename AT::mag_type nonconst_result = KokkosBlas::nrminf(a); + typename AT::mag_type nonconst_result = KokkosBlas::nrminf(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - typename AT::mag_type const_result = KokkosBlas::nrminf(c_a); + typename AT::mag_type const_result = KokkosBlas::nrminf(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -60,35 +58,23 @@ void impl_test_nrminf_mv(int N, int K) { typedef typename ViewTypeA::non_const_value_type ScalarA; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, 
randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); - - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type* expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = Kokkos::ArithTraits::min(); for (int i = 0; i < N; i++) { - if (AT::abs(h_a(i, j)) > expected_result[j]) - expected_result[j] = AT::abs(h_a(i, j)); + if (AT::abs(a.h_view(i, j)) > expected_result[j]) + expected_result[j] = AT::abs(a.h_view(i, j)); } if (N == 0) expected_result[j] = typename AT::mag_type(0); } @@ -97,14 +83,14 @@ void impl_test_nrminf_mv(int N, int K) { Kokkos::View r("Dot::Result", K); - KokkosBlas::nrminf(r, a); + KokkosBlas::nrminf(r, a.d_view); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); typename AT::mag_type exp_result = expected_result[k]; EXPECT_NEAR_KK(nonconst_result, exp_result, eps * exp_result); } - KokkosBlas::nrminf(r, c_a); + KokkosBlas::nrminf(r, a.d_view_const); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); typename AT::mag_type exp_result = expected_result[k]; @@ -136,17 +122,14 @@ int test_nrminf() { // Test::impl_test_nrminf(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_nrminf(0); - Test::impl_test_nrminf(13); - Test::impl_test_nrminf(1024); - // Test::impl_test_nrminf(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrminf(0); + Test::impl_test_nrminf(13); + Test::impl_test_nrminf(1024); + // Test::impl_test_nrminf(132231); +#endif return 1; } @@ -173,17 +156,14 @@ int test_nrminf_mv() { // 
Test::impl_test_nrminf_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_nrminf_mv(0, 5); - Test::impl_test_nrminf_mv(13, 5); - Test::impl_test_nrminf_mv(1024, 5); - // Test::impl_test_nrminf_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrminf_mv(0, 5); + Test::impl_test_nrminf_mv(13, 5); + Test::impl_test_nrminf_mv(1024, 5); + // Test::impl_test_nrminf_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_reciprocal.hpp b/blas/unit_test/Test_Blas1_reciprocal.hpp index 687aacf1d9..841725e6fd 100644 --- a/blas/unit_test/Test_Blas1_reciprocal.hpp +++ b/blas/unit_test/Test_Blas1_reciprocal.hpp @@ -29,38 +29,12 @@ void impl_test_reciprocal(int N) { using MagnitudeA = typename AT::mag_type; using MagnitudeB = typename Kokkos::ArithTraits::mag_type; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - const MagnitudeB eps = Kokkos::ArithTraits::epsilon(); const MagnitudeA one = AT::abs(AT::one()); const MagnitudeA max_val = 10; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename 
ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -68,30 +42,24 @@ void impl_test_reciprocal(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(max_val, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, one, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, one, randEnd); } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(10, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); - } - - Kokkos::deep_copy(b_org_y, b_y); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); - KokkosBlas::reciprocal(y, x); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::reciprocal(y.d_view, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; ++i) { - EXPECT_NEAR_KK(h_b_y(i, 0), ScalarB(one / h_b_x(i, 0)), 2 * eps); + EXPECT_NEAR_KK(y.h_view(i), ScalarB(one / x.h_view(i)), 2 * eps); } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::reciprocal(y, c_x); - Kokkos::deep_copy(h_b_y, b_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + + KokkosBlas::reciprocal(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; ++i) { - EXPECT_NEAR_KK(h_b_y(i, 0), ScalarB(one / h_b_x(i, 0)), 2 * eps); + EXPECT_NEAR_KK(y.h_view(i), ScalarB(one / x.h_view(i)), 2 * eps); } } @@ -100,24 +68,8 @@ void impl_test_reciprocal_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = 
vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -125,40 +77,35 @@ void impl_test_reciprocal_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, Kokkos::ArithTraits::one(), - randEnd); + Kokkos::fill_random(x.d_view, rand_pool, + Kokkos::ArithTraits::one(), randEnd); } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(10, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); - } - - Kokkos::deep_copy(b_org_y, b_y); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); - typename ViewTypeA::const_type c_x = x; + KokkosBlas::reciprocal(y.d_view, x.d_view); - KokkosBlas::reciprocal(y, x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.h_base, y.d_base); for (int j = 0; j < K; ++j) { for (int i = 0; i < N; ++i) { - EXPECT_NEAR_KK(h_b_y(i, j), - Kokkos::ArithTraits::one() / ScalarB(h_b_x(i, j)), - 2 * Kokkos::ArithTraits::epsilon()); + EXPECT_NEAR_KK( + y.h_view(i, j), + Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::reciprocal(y, c_x); - Kokkos::deep_copy(h_b_y, b_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + + KokkosBlas::reciprocal(y.d_view, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int j = 0; j < K; j++) { for (int i = 0; i < N; ++i) { 
- EXPECT_NEAR_KK(h_b_y(i, j), - Kokkos::ArithTraits::one() / ScalarB(h_b_x(i, j)), - 2 * Kokkos::ArithTraits::epsilon()); + EXPECT_NEAR_KK( + y.h_view(i, j), + Kokkos::ArithTraits::one() / ScalarB(x.h_view(i, j)), + 2 * Kokkos::ArithTraits::epsilon()); } } } @@ -188,24 +135,21 @@ int test_reciprocal() { // Test::impl_test_reciprocal(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_reciprocal(0); - Test::impl_test_reciprocal(13); - Test::impl_test_reciprocal(1024); - // Test::impl_test_reciprocal(132231); #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_reciprocal(1024); - Test::impl_test_reciprocal(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_reciprocal(0); + Test::impl_test_reciprocal(13); + Test::impl_test_reciprocal(1024); + // Test::impl_test_reciprocal(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal(1024); + Test::impl_test_reciprocal(1024); +#endif return 1; } @@ -238,28 +182,25 @@ int test_reciprocal_mv() { // Device>(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; Test::impl_test_reciprocal_mv(0, 5); Test::impl_test_reciprocal_mv(13, 5); Test::impl_test_reciprocal_mv(1024, 5); - // Test::impl_test_reciprocal_mv(132231,5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - 
Test::impl_test_reciprocal_mv(1024, - 5); - Test::impl_test_reciprocal_mv(1024, - 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_reciprocal_mv(0, 5); + Test::impl_test_reciprocal_mv(13, 5); + Test::impl_test_reciprocal_mv(1024, + 5); + // Test::impl_test_reciprocal_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_reciprocal_mv(1024, + 5); + Test::impl_test_reciprocal_mv(1024, + 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_scal.hpp b/blas/unit_test/Test_Blas1_scal.hpp index 4c414ea735..6c4f7b7f2a 100644 --- a/blas/unit_test/Test_Blas1_scal.hpp +++ b/blas/unit_test/Test_Blas1_scal.hpp @@ -30,15 +30,8 @@ void impl_test_scal(int N) { ScalarA a(3); typename AT::mag_type eps = AT::epsilon() * 1000; - ViewTypeA x("X", N); - ViewTypeB y("Y", N); - ViewTypeB org_y("Org_Y", N); - - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename ViewTypeA::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeB::HostMirror h_y = Kokkos::create_mirror_view(y); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -46,29 +39,23 @@ void impl_test_scal(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); - } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(org_y, y); - - Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(x.h_base, x.d_base); - KokkosBlas::scal(y, a, x); - Kokkos::deep_copy(h_y, y); + KokkosBlas::scal(y.d_view, a, x.d_view); + 
Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i)), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i)), y.h_view(i), eps); } - Kokkos::deep_copy(y, org_y); - KokkosBlas::scal(y, a, c_x); - Kokkos::deep_copy(h_y, y); + // Zero out y again and run with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, a, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i)), h_y(i), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i)), y.h_view(i), eps); } } @@ -78,24 +65,8 @@ void impl_test_scal_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -103,41 +74,34 @@ void impl_test_scal_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); - } - { - ScalarB randStart, randEnd; - Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } 
- Kokkos::fence(); - - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA a(3.0); - typename ViewTypeA::const_type c_x = x; typename AT::mag_type eps = AT::epsilon() * 1000; Kokkos::View r("Dot::Result", K); - KokkosBlas::scal(y, a, x); - Kokkos::deep_copy(h_b_y, b_y); + KokkosBlas::scal(y.d_view, a, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j)), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), + eps); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::scal(y, a, c_x); - Kokkos::deep_copy(h_b_y, b_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, a, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j)), h_y(i, j), eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i, j)), y.h_view(i, j), + eps); } } @@ -152,22 +116,23 @@ void impl_test_scal_mv(int N, int K) { auto h_params = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), params); - KokkosBlas::scal(y, params, x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, params, x.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * h_x(i, j)), h_y(i, j), - eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), + y.h_view(i, j), eps); } } - Kokkos::deep_copy(b_y, b_org_y); - KokkosBlas::scal(y, params, c_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + KokkosBlas::scal(y.d_view, params, x.d_view_const); + Kokkos::deep_copy(y.h_base, y.d_base); for (int i = 0; i < N; i++) { for 
(int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(h_params(j) * h_x(i, j)), h_y(i, j), - eps); + EXPECT_NEAR_KK(static_cast(h_params(j) * x.h_view(i, j)), + y.h_view(i, j), eps); } } } @@ -197,24 +162,21 @@ int test_scal() { // Test::impl_test_scal(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - Test::impl_test_scal(0); - Test::impl_test_scal(13); - Test::impl_test_scal(1024); - // Test::impl_test_scal(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_scal(1024); - Test::impl_test_scal(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_scal(0); + Test::impl_test_scal(13); + Test::impl_test_scal(1024); + // Test::impl_test_scal(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_scal(1024); + Test::impl_test_scal(1024); +#endif return 1; } @@ -243,25 +205,21 @@ int test_scal_mv() { // Test::impl_test_scal_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; Test::impl_test_scal_mv(0, 5); Test::impl_test_scal_mv(13, 5); Test::impl_test_scal_mv(1024, 5); - // Test::impl_test_scal_mv(132231,5); #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_scal_mv(1024, 5); - Test::impl_test_scal_mv(1024, 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef 
Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + Test::impl_test_scal_mv(0, 5); + Test::impl_test_scal_mv(13, 5); + Test::impl_test_scal_mv(1024, 5); + // Test::impl_test_scal_mv(132231,5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_scal_mv(1024, 5); + Test::impl_test_scal_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_sum.hpp b/blas/unit_test/Test_Blas1_sum.hpp index 4472f8d204..cf119cbd00 100644 --- a/blas/unit_test/Test_Blas1_sum.hpp +++ b/blas/unit_test/Test_Blas1_sum.hpp @@ -24,29 +24,26 @@ template void impl_test_sum(int N) { typedef typename ViewTypeA::value_type ScalarA; - ViewTypeA a("A", N); - - typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + view_stride_adapter a("A", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(a.h_base, a.d_base); - typename ViewTypeA::const_type c_a = a; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; ScalarA expected_result = 0; - for (int i = 0; i < N; i++) expected_result += h_a(i); + for (int i = 0; i < N; i++) expected_result += a.h_view(i); - ScalarA nonconst_result = KokkosBlas::sum(a); + ScalarA nonconst_result = KokkosBlas::sum(a.d_view); EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); - ScalarA const_result = KokkosBlas::sum(c_a); + ScalarA const_result = KokkosBlas::sum(a.d_view_const); EXPECT_NEAR_KK(const_result, expected_result, eps * expected_result); } @@ -54,40 +51,28 @@ template void impl_test_sum_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(a.d_view, rand_pool, randStart, randEnd); - Kokkos::deep_copy(h_b_a, b_a); - - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); ScalarA* expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) expected_result[j] += h_a(i, j); + for (int i = 0; i < N; i++) expected_result[j] += a.h_view(i, j); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; Kokkos::View r("Sum::Result", K); - KokkosBlas::sum(r, a); + KokkosBlas::sum(r, a.d_view); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); @@ -95,7 +80,7 @@ void impl_test_sum_mv(int N, int K) { eps * expected_result[k]); } - KokkosBlas::sum(r, c_a); + KokkosBlas::sum(r, a.d_view_const); Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_result = r(k); @@ -128,17 +113,14 @@ int test_sum() { // Test::impl_test_sum(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - Test::impl_test_sum(0); - Test::impl_test_sum(13); - Test::impl_test_sum(1024); - // Test::impl_test_sum(132231); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_sum(0); + Test::impl_test_sum(13); + Test::impl_test_sum(1024); + // Test::impl_test_sum(132231); +#endif return 1; } @@ -167,18 +149,15 @@ int test_sum_mv() { // Test::impl_test_sum_mv(132231,5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; Test::impl_test_sum_mv(0, 5); - Test::impl_test_sum_mv(13, 5); - Test::impl_test_sum_mv(1024, 5); - Test::impl_test_sum_mv(789, 1); - // Test::impl_test_sum_mv(132231,5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_sum_mv(0, 5); + Test::impl_test_sum_mv(13, 5); + Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); + // Test::impl_test_sum_mv(132231,5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas1_team_abs.hpp b/blas/unit_test/Test_Blas1_team_abs.hpp index 8cb8d9cf43..d3f4f661d0 100644 --- 
a/blas/unit_test/Test_Blas1_team_abs.hpp +++ b/blas/unit_test/Test_Blas1_team_abs.hpp @@ -41,51 +41,22 @@ void impl_test_team_abs(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits AT; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += AT::abs(h_x(i)) * AT::abs(h_x(i)); + expected_result += AT::abs(x.h_view(i)) * AT::abs(x.h_view(i)); // KokkosBlas::abs(y,x); Kokkos::parallel_for( @@ -95,20 +66,23 @@ void impl_test_team_abs(int N) { KokkosBlas::Experimental::abs( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId 
< M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( @@ -118,16 +92,18 @@ void impl_test_team_abs(int N) { KokkosBlas::Experimental::abs( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = KokkosBlas::dot(y, y); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -143,43 +119,22 @@ void impl_test_team_abs_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); - - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(1)); - typename ViewTypeA::const_type c_x = x; + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(h_x(i, j)) * AT::abs(h_x(i, j)); + expected_result[j] += AT::abs(x.h_view(i, j)) * AT::abs(x.h_view(i, j)); } // double eps = std::is_same::value?2*1e-5:1e-7; @@ -195,11 +150,11 @@ void impl_test_team_abs_mv(int N, int K) { 
KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - Kokkos::subview(x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); typename AT::mag_type divisor = @@ -211,7 +166,8 @@ void impl_test_team_abs_mv(int N, int K) { // eps*expected_result[k]); } - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); // KokkosBlas::abs(y,c_x); Kokkos::parallel_for( @@ -219,11 +175,11 @@ void impl_test_team_abs_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::abs( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - Kokkos::subview(c_x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_result = r(k); typename AT::mag_type divisor = @@ -263,8 +219,7 @@ int test_team_abs() { // Test::impl_test_team_abs(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -309,8 +264,7 @@ int test_team_abs_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View 
view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_axpby.hpp b/blas/unit_test/Test_Blas1_team_axpby.hpp index 3e071e7537..e11b1e14a5 100644 --- a/blas/unit_test/Test_Blas1_team_axpby.hpp +++ b/blas/unit_test/Test_Blas1_team_axpby.hpp @@ -40,55 +40,28 @@ void impl_test_team_axpby(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - ScalarA a = 3; ScalarB b = 5; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Y", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, 
y.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarB(a * h_x(i) + b * h_y(i)) * ScalarB(a * h_x(i) + b * h_y(i)); + expected_result += ScalarB(a * x.h_view(i) + b * y.h_view(i)) * + ScalarB(a * x.h_view(i) + b * y.h_view(i)); // KokkosBlas::axpby(a,x,b,y); Kokkos::parallel_for( @@ -98,21 +71,23 @@ void impl_test_team_axpby(int N) { KokkosBlas::Experimental::axpby( teamMember, a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( @@ -122,17 +97,20 @@ void impl_test_team_axpby(int N) { KokkosBlas::Experimental::axpby( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = KokkosBlas::dot(c_y, c_y); + ScalarB const_nonconst_result = + KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -147,46 +125,29 @@ void impl_test_team_axpby_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - typename ViewTypeA::const_type c_x = x; + ScalarA a = 3; + ScalarB b = 5; ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - 
expected_result[j] += ScalarB(a * h_x(i, j) + b * h_y(i, j)) * - ScalarB(a * h_x(i, j) + b * h_y(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)) * + ScalarB(a * x.h_view(i, j) + b * y.h_view(i, j)); } double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; @@ -201,11 +162,11 @@ void impl_test_team_axpby_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( - teamMember, a, Kokkos::subview(x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(AT::abs(nonconst_nonconst_result), @@ -213,7 +174,7 @@ void impl_test_team_axpby_mv(int N, int K) { AT::abs(expected_result[k] * eps)); } - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpby(a,c_x,b,y); Kokkos::parallel_for( @@ -221,11 +182,12 @@ void impl_test_team_axpby_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpby( - teamMember, a, Kokkos::subview(c_x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(AT::abs(const_non_const_result), AT::abs(expected_result[k]), @@ -260,8 +222,7 @@ int test_team_axpby() { // Test::impl_test_team_axpby(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -306,8 +267,7 @@ int test_team_axpby_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_axpy.hpp b/blas/unit_test/Test_Blas1_team_axpy.hpp index d861da45eb..5cff9d025e 100644 --- a/blas/unit_test/Test_Blas1_team_axpy.hpp +++ b/blas/unit_test/Test_Blas1_team_axpy.hpp @@ -40,54 +40,27 @@ void impl_test_team_axpy(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter org_y("Y", N); ScalarA a = 3; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += - ScalarB(a * h_x(i) + h_y(i)) * ScalarB(a * h_x(i) + h_y(i)); + expected_result += ScalarB(a * x.h_view(i) + y.h_view(i)) * + ScalarB(a * x.h_view(i) + y.h_view(i)); // KokkosBlas::axpy(a,x,y); Kokkos::parallel_for( @@ -97,20 +70,22 @@ void impl_test_team_axpy(int N) { KokkosBlas::Experimental::axpy( teamMember, a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( @@ -120,16 +95,19 @@ void impl_test_team_axpy(int N) { KokkosBlas::Experimental::axpy( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarB const_nonconst_result = KokkosBlas::dot(c_y, c_y); + ScalarB const_nonconst_result = + KokkosBlas::dot(y.d_view_const, y.d_view_const); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); } @@ -144,45 +122,28 @@ void impl_test_team_axpy_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = 
h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter org_y("Org_Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(b_org_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(org_y.h_base, y.d_base); - ScalarA a = 3; - typename ViewTypeA::const_type c_x = x; + ScalarA a = 3; ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) - expected_result[j] += ScalarB(a * h_x(i, j) + h_y(i, j)) * - ScalarB(a * h_x(i, j) + h_y(i, j)); + expected_result[j] += ScalarB(a * x.h_view(i, j) + y.h_view(i, j)) * + ScalarB(a * x.h_view(i, j) + y.h_view(i, j)); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -195,18 +156,18 @@ void impl_test_team_axpy_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( - teamMember, a, Kokkos::subview(x, Kokkos::ALL(), teamId), - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_base, org_y.h_base); // KokkosBlas::axpy(a,c_x,y); Kokkos::parallel_for( @@ -214,11 +175,12 @@ void impl_test_team_axpy_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::axpy( - teamMember, a, Kokkos::subview(c_x, Kokkos::ALL(), teamId), - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(const_non_const_result, expected_result[k], @@ -253,8 +215,7 @@ int test_team_axpy() { // Test::impl_test_team_axpy(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -299,8 +260,7 @@ int test_team_axpy_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_dot.hpp b/blas/unit_test/Test_Blas1_team_dot.hpp index 2de9ad8a7a..00c0940023 100644 --- a/blas/unit_test/Test_Blas1_team_dot.hpp +++ b/blas/unit_test/Test_Blas1_team_dot.hpp @@ -39,44 +39,20 @@ void impl_test_team_dot(int N) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - - BaseTypeA b_a("A", N); - BaseTypeB b_b("B", N); - - ViewTypeA a = Kokkos::subview(b_a, Kokkos::ALL(), 0); - ViewTypeB b = Kokkos::subview(b_b, Kokkos::ALL(), 0); - - typename BaseTypeA::HostMirror h_b_a = Kokkos::create_mirror_view(b_a); - typename BaseTypeB::HostMirror h_b_b = Kokkos::create_mirror_view(b_b); - - typename ViewTypeA::HostMirror h_a = Kokkos::subview(h_b_a, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_b = Kokkos::subview(h_b_b, Kokkos::ALL(), 0); + view_stride_adapter a("a", N); + view_stride_adapter b("b", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_a, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_b, rand_pool, ScalarB(10)); + Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_b, b_b); + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA expected_result = 0; - for (int i = 0; i < N; i++) expected_result += h_a(i) * h_b(i); + for (int i = 0; i < N; i++) expected_result += a.h_view(i) * b.h_view(i); Kokkos::View r("PartialDots", M); Kokkos::View d_r("PartialDots", M); @@ 
-91,13 +67,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + a.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_nonconst_result += r(k); @@ -106,10 +84,6 @@ void impl_test_team_dot(int N) { EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; - - // ScalarA const_const_result = KokkosBlas::dot(c_a,c_b); ScalarA const_const_result = 0; Kokkos::parallel_for( @@ -119,13 +93,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - c_a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + a.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_const_result += r(k); @@ -142,13 +118,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N)), + a.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) nonconst_const_result += r(k); @@ -165,13 +143,15 @@ void impl_test_team_dot(int N) { d_r(teamId) = KokkosBlas::Experimental::dot( teamMember, Kokkos::subview( - c_a, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + a.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - b, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + b.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < M; k++) const_nonconst_result += r(k); @@ -190,40 +170,23 @@ void impl_test_team_dot_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_a("A", N, K); - typename vfB_type::BaseType b_b("B", N, K); - - ViewTypeA a = vfA_type::view(b_a); - ViewTypeB b = vfB_type::view(b_b); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - typename h_vfB_type::BaseType h_b_b = Kokkos::create_mirror_view(b_b); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); - typename ViewTypeB::HostMirror h_b = h_vfB_type::view(h_b_b); + view_stride_adapter a("A", N, K); + view_stride_adapter b("B", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_a, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_b, rand_pool, ScalarB(10)); - - Kokkos::deep_copy(h_b_a, b_a); - Kokkos::deep_copy(h_b_b, b_b); + Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(b.d_view, rand_pool, ScalarB(10)); - typename ViewTypeA::const_type c_a = a; - typename ViewTypeB::const_type c_b = b; + Kokkos::deep_copy(a.h_base, a.d_base); + Kokkos::deep_copy(b.h_base, b.d_base); ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); - for (int i = 0; i < N; i++) expected_result[j] += h_a(i, j) * h_b(i, j); + for (int i = 0; i < N; i++) + expected_result[j] += a.h_view(i, j) * b.h_view(i, j); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -237,8 +200,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a, Kokkos::ALL(), teamId), - Kokkos::subview(b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -253,8 +216,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(c_a, Kokkos::ALL(), teamId), - Kokkos::subview(c_b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -269,8 +232,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(a, Kokkos::ALL(), teamId), - Kokkos::subview(c_b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -285,8 +248,8 @@ void impl_test_team_dot_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::dot( - teamMember, Kokkos::subview(c_a, Kokkos::ALL(), teamId), - Kokkos::subview(b, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId), + Kokkos::subview(b.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -323,8 +286,7 @@ int 
test_team_dot() { // Test::impl_test_team_dot(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -369,8 +331,7 @@ int test_team_dot_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_mult.hpp b/blas/unit_test/Test_Blas1_team_mult.hpp index 4df5dd9cd4..f340ac2309 100644 --- a/blas/unit_test/Test_Blas1_team_mult.hpp +++ b/blas/unit_test/Test_Blas1_team_mult.hpp @@ -41,68 +41,33 @@ void impl_test_team_mult(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - typedef Kokkos::View< - ScalarC * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeC; - ScalarA a = 3; ScalarB b = 5; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeC b_z("Y", N); - BaseTypeC b_org_z("Org_Z", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - ViewTypeC z = Kokkos::subview(b_z, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - typename BaseTypeC::HostMirror h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - typename ViewTypeC::HostMirror h_z = Kokkos::subview(h_b_z, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); + Kokkos::fill_random(z.d_view, rand_pool, ScalarC(10)); - Kokkos::deep_copy(b_org_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(z.h_base, z.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarC(b * h_z(i) + a * h_x(i) * h_y(i)) * - ScalarC(b * h_z(i) + a * h_x(i) * h_y(i)); + expected_result += + ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)) * + ScalarC(b * z.h_view(i) + a * x.h_view(i) * y.h_view(i)); // 
KokkosBlas::mult(b,z,a,x,y); Kokkos::parallel_for( @@ -112,24 +77,28 @@ void impl_test_team_mult(int N) { KokkosBlas::Experimental::mult( teamMember, b, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC nonconst_nonconst_result = KokkosBlas::dot(z, z); + ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + // Reset z on device to orig and run again with const-valued y + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, @@ -138,23 +107,27 @@ void impl_test_team_mult(int N) { KokkosBlas::Experimental::mult( teamMember, b, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC const_nonconst_result = KokkosBlas::dot(z, z); + ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + // Reset z again to orig, and run with both x and y const + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,c_x,c_y); Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, @@ -163,20 +136,23 @@ void impl_test_team_mult(int N) { KokkosBlas::Experimental::mult( teamMember, b, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), Kokkos::subview( - c_y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + y.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarC const_const_result = KokkosBlas::dot(z, z); + ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); } @@ -192,54 +168,27 @@ void impl_test_team_mult_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - BaseTypeA b_x("X", N); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType b_org_z("Z", N, K); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); + // x is rank-1, all others are rank-2 + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N, K); + view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); typename Kokkos::ArithTraits::mag_type const max_val = 10; - Kokkos::fill_random(b_x, rand_pool, ScalarA(max_val)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(max_val)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(max_val)); + Kokkos::fill_random(x.d_view, rand_pool, 
ScalarA(max_val)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(max_val)); + Kokkos::fill_random(z.d_view, rand_pool, ScalarC(max_val)); - Kokkos::deep_copy(b_org_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; // In the operation z = (b*z) + (a*x*y) we estimate // the largest rounding error to be dominated by max(b*z, a*x*y) @@ -257,39 +206,37 @@ void impl_test_team_mult_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z, Kokkos::ALL(), teamId), a, x, - Kokkos::subview(y, Kokkos::ALL(), teamId)); + teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, + x.d_view, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId)); }); - ScalarC temp; - typename h_vfC_type::BaseType h_b_z_res = Kokkos::create_mirror_view(b_z); - Kokkos::deep_copy(h_b_z_res, b_z); - typename h_vfC_type::BaseType h_b_org_z = Kokkos::create_mirror_view(b_org_z); - Kokkos::deep_copy(h_b_org_z, b_org_z); + Kokkos::deep_copy(z.h_base, z.d_base); + ScalarC temp; for (int j = 0; j < K; j++) { for (int i = 0; i < N; i++) { - temp = ScalarC(b * h_b_org_z(i, j) + a * h_x(i) * h_y(i, j)); - EXPECT_NEAR_KK(temp, h_b_z_res(i, j), max_error); + temp = ScalarC(b * org_z.h_view(i, j) + a * x.h_view(i) * y.h_view(i, j)); + EXPECT_NEAR_KK(temp, z.h_view(i, j), max_error); } } - Kokkos::deep_copy(b_z, b_org_z); + // Reset z on device and run again with const y + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::mult(b,z,a,x,c_y); Kokkos::parallel_for( "KokkosBlas::Test::TeamMult", policy, KOKKOS_LAMBDA(const team_member &teamMember) { 
const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::mult( - teamMember, b, Kokkos::subview(z, Kokkos::ALL(), teamId), a, x, - Kokkos::subview(c_y, Kokkos::ALL(), teamId)); + teamMember, b, Kokkos::subview(z.d_view, Kokkos::ALL(), teamId), a, + x.d_view, Kokkos::subview(y.d_view_const, Kokkos::ALL(), teamId)); }); - Kokkos::deep_copy(h_b_z_res, b_z); + Kokkos::deep_copy(z.h_base, z.d_base); for (int k = 0; k < K; k++) { for (int i = 0; i < N; ++i) { - temp = ScalarC(b * h_b_org_z(i, k) + a * h_x(i) * h_y(i, k)); - EXPECT_NEAR_KK(temp, h_b_z_res(i, k), max_error); + temp = ScalarC(b * org_z.h_view(i, k) + a * x.h_view(i) * y.h_view(i, k)); + EXPECT_NEAR_KK(temp, z.h_view(i, k), max_error); } } } @@ -329,8 +276,7 @@ int test_team_mult() { // Device>(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -390,9 +336,7 @@ int test_team_mult_mv() { // view_type_c_lr, Device>(132231,5); #endif - /* -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -414,7 +358,6 @@ int test_team_mult_mv() { Test::impl_test_team_mult_mv(124, 5); #endif - */ return 1; } diff --git a/blas/unit_test/Test_Blas1_team_nrm2.hpp b/blas/unit_test/Test_Blas1_team_nrm2.hpp index 05d4970bcd..4bc4836782 100644 --- a/blas/unit_test/Test_Blas1_team_nrm2.hpp +++ b/blas/unit_test/Test_Blas1_team_nrm2.hpp @@ -37,32 +37,20 @@ void impl_test_team_nrm2(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - - typename vfA_type::BaseType b_a("A", N, K); - - ViewTypeA a = 
vfA_type::view(b_a); - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); - - typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + view_stride_adapter a("A", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_a, rand_pool, ScalarA(10)); - - Kokkos::deep_copy(h_b_a, b_a); + Kokkos::fill_random(a.d_view, rand_pool, ScalarA(10)); - typename ViewTypeA::const_type c_a = a; + Kokkos::deep_copy(a.h_base, a.d_base); typename AT::mag_type *expected_result = new typename AT::mag_type[K]; for (int j = 0; j < K; j++) { expected_result[j] = typename AT::mag_type(); for (int i = 0; i < N; i++) - expected_result[j] += AT::abs(h_a(i, j)) * AT::abs(h_a(i, j)); + expected_result[j] += AT::abs(a.h_view(i, j)) * AT::abs(a.h_view(i, j)); expected_result[j] = Kokkos::ArithTraits::sqrt(expected_result[j]); } @@ -78,7 +66,7 @@ void impl_test_team_nrm2(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(a, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -93,7 +81,7 @@ void impl_test_team_nrm2(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); d_r(teamId) = KokkosBlas::Experimental::nrm2( - teamMember, Kokkos::subview(c_a, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(a.d_view_const, Kokkos::ALL(), teamId)); }); Kokkos::deep_copy(r, d_r); for (int k = 0; k < K; k++) { @@ -127,8 +115,7 @@ int test_team_nrm2() { // Test::impl_test_team_nrm2(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; 
Test::impl_test_team_nrm2(0, 5); diff --git a/blas/unit_test/Test_Blas1_team_scal.hpp b/blas/unit_test/Test_Blas1_team_scal.hpp index 5d9f298d06..e0c109e1af 100644 --- a/blas/unit_test/Test_Blas1_team_scal.hpp +++ b/blas/unit_test/Test_Blas1_team_scal.hpp @@ -41,55 +41,24 @@ void impl_test_team_scal(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits AT; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); ScalarA a(3); typename AT::mag_type eps = AT::epsilon() * 1000; typename AT::mag_type zero = AT::abs(AT::zero()); typename AT::mag_type one = AT::abs(AT::one()); - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeB b_org_y("Org_Y", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA expected_result(0); for (int i = 0; i < N; i++) { - expected_result += ScalarB(a * h_x(i)) * ScalarB(a * h_x(i)); + 
expected_result += ScalarB(a * x.h_view(i)) * ScalarB(a * x.h_view(i)); } Kokkos::parallel_for( @@ -99,18 +68,20 @@ void impl_test_team_scal(int N) { KokkosBlas::Experimental::scal( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); { - ScalarB nonconst_nonconst_result = KokkosBlas::dot(y, y); + ScalarB nonconst_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); typename AT::mag_type divisor = AT::abs(expected_result) == zero ? one : AT::abs(expected_result); typename AT::mag_type diff = @@ -118,7 +89,7 @@ void impl_test_team_scal(int N) { EXPECT_NEAR_KK(diff, zero, eps); } - Kokkos::deep_copy(b_y, b_org_y); + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, @@ -127,18 +98,20 @@ void impl_test_team_scal(int N) { KokkosBlas::Experimental::scal( teamMember, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); { - ScalarB const_nonconst_result = KokkosBlas::dot(y, y); + ScalarB const_nonconst_result = KokkosBlas::dot(y.d_view, y.d_view); typename AT::mag_type divisor = AT::abs(expected_result) == zero ? 
one : AT::abs(expected_result); typename AT::mag_type diff = @@ -159,44 +132,23 @@ void impl_test_team_scal_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef Kokkos::ArithTraits AT; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - - typename vfA_type::BaseType b_x("A", N, K); - typename vfB_type::BaseType b_y("B", N, K); - typename vfB_type::BaseType b_org_y("B", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(1)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(1)); - - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(1)); + Kokkos::deep_copy(x.h_base, x.d_base); ScalarA a(3); - typename ViewTypeA::const_type c_x = x; ScalarA *expected_result = new ScalarA[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += ScalarB(a * h_x(i, j)) * ScalarB(a * h_x(i, j)); + expected_result[j] += + ScalarB(a * x.h_view(i, j)) * ScalarB(a * x.h_view(i, j)); } } @@ -211,11 +163,11 @@ void impl_test_team_scal_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), a, - Kokkos::subview(x, Kokkos::ALL(), teamId)); + teamMember, 
Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_scalar_result = r(k); typename AT::mag_type divisor = @@ -225,18 +177,19 @@ void impl_test_team_scal_mv(int N, int K) { EXPECT_NEAR_KK(diff, zero, eps); } - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), a, - Kokkos::subview(c_x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_scalar_result = r(k); typename AT::mag_type divisor = @@ -258,21 +211,24 @@ void impl_test_team_scal_mv(int N, int K) { for (int j = 0; j < K; j++) { expected_result[j] = ScalarA(); for (int i = 0; i < N; i++) { - expected_result[j] += - ScalarB((3.0 + j) * h_x(i, j)) * ScalarB((3.0 + j) * h_x(i, j)); + expected_result[j] += ScalarB((3.0 + j) * x.h_view(i, j)) * + ScalarB((3.0 + j) * x.h_view(i, j)); } } + // Zero out y to run again + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); + Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - params(teamId), Kokkos::subview(x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + params(teamId), Kokkos::subview(x.d_view, Kokkos::ALL(), 
teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_vector_result = r(k); typename AT::mag_type divisor = @@ -282,18 +238,20 @@ void impl_test_team_scal_mv(int N, int K) { EXPECT_NEAR_KK(diff, zero, eps); } - Kokkos::deep_copy(b_y, b_org_y); + // Zero out y again, and run again with const input + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::zero()); Kokkos::parallel_for( "KokkosBlas::Test::TeamScal", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::scal( - teamMember, Kokkos::subview(y, Kokkos::ALL(), teamId), - params(teamId), Kokkos::subview(c_x, Kokkos::ALL(), teamId)); + teamMember, Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), + params(teamId), + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, y, y); + KokkosBlas::dot(r, y.d_view, y.d_view); for (int k = 0; k < K; k++) { ScalarA const_vector_result = r(k); typename AT::mag_type divisor = @@ -331,8 +289,7 @@ int test_team_scal() { // Test::impl_test_team_scal(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -377,8 +334,7 @@ int test_team_scal_mv() { // Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_team_update.hpp b/blas/unit_test/Test_Blas1_team_update.hpp index 8a591b8c27..09b60440ae 100644 --- a/blas/unit_test/Test_Blas1_team_update.hpp +++ b/blas/unit_test/Test_Blas1_team_update.hpp @@ -41,69 +41,34 @@ void 
impl_test_team_update(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - typedef Kokkos::View< - ScalarC * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeC; - ScalarA a = 3; ScalarB b = 5; ScalarC c = 7; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeC b_z("Y", N); - BaseTypeC b_org_z("Org_Z", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - ViewTypeC z = Kokkos::subview(b_z, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - typename BaseTypeC::HostMirror h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - typename ViewTypeC::HostMirror h_z = Kokkos::subview(h_b_z, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); + Kokkos::fill_random(z.d_view, 
rand_pool, ScalarC(10)); - Kokkos::deep_copy(b_org_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(z.h_base, z.d_base); ScalarA expected_result = 0; for (int i = 0; i < N; i++) - expected_result += ScalarC(c * h_z(i) + a * h_x(i) + b * h_y(i)) * - ScalarC(c * h_z(i) + a * h_x(i) + b * h_y(i)); + expected_result += + ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)) * + ScalarC(c * z.h_view(i) + a * x.h_view(i) + b * y.h_view(i)); // KokkosBlas::update(a,x,b,y,c,z); Kokkos::parallel_for( @@ -113,25 +78,28 @@ void impl_test_team_update(int N) { KokkosBlas::Experimental::update( teamMember, a, Kokkos::subview( - x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N))); }); - ScalarC nonconst_nonconst_result = KokkosBlas::dot(z, z); + ScalarC nonconst_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, @@ -140,24 +108,27 @@ void impl_test_team_update(int N) { KokkosBlas::Experimental::update( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC const_nonconst_result = KokkosBlas::dot(z, z); + ScalarC const_nonconst_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); - Kokkos::deep_copy(b_z, b_org_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,c_y,c,z); Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, @@ -166,21 +137,24 @@ void impl_test_team_update(int N) { KokkosBlas::Experimental::update( teamMember, a, Kokkos::subview( - c_x, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + x.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? 
(teamId + 1) * team_data_siz : N)), b, Kokkos::subview( - c_y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), + y.d_view_const, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N)), c, Kokkos::subview( - z, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); + z.d_view, + Kokkos::make_pair( + teamId * team_data_siz, + (teamId < M - 1) ? (teamId + 1) * team_data_siz : N))); }); - ScalarC const_const_result = KokkosBlas::dot(z, z); + ScalarC const_const_result = KokkosBlas::dot(z.d_view, z.d_view); EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); } @@ -196,57 +170,36 @@ void impl_test_team_update_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - typename vfA_type::BaseType b_x("X", N, K); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType b_org_z("Z", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + 
view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, ScalarA(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarB(10)); - Kokkos::fill_random(b_z, rand_pool, ScalarC(10)); + Kokkos::fill_random(x.d_view, rand_pool, ScalarA(10)); + Kokkos::fill_random(y.d_view, rand_pool, ScalarB(10)); + Kokkos::fill_random(z.d_view, rand_pool, ScalarC(10)); - Kokkos::deep_copy(b_org_z, b_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(z.h_base, z.d_base); - ScalarA a = 3; - ScalarB b = 5; - ScalarC c = 5; - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; + ScalarC c = 5; ScalarC *expected_result = new ScalarC[K]; for (int j = 0; j < K; j++) { expected_result[j] = ScalarC(); for (int i = 0; i < N; i++) expected_result[j] += - ScalarC(a * h_x(i, j) + b * h_y(i, j) + c * h_z(i, j)) * - ScalarC(a * h_x(i, j) + b * h_y(i, j) + c * h_z(i, j)); + ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + + c * z.h_view(i, j)) * + ScalarC(a * x.h_view(i, j) + b * y.h_view(i, j) + c * z.h_view(i, j)); } double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; @@ -259,29 +212,30 @@ void impl_test_team_update_mv(int N, int K) { KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( - teamMember, a, Kokkos::subview(x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId), c, - Kokkos::subview(z, Kokkos::ALL(), teamId)); + teamMember, a, Kokkos::subview(x.d_view, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); + KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], eps * expected_result[k]); } - Kokkos::deep_copy(b_z, b_org_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); // KokkosBlas::update(a,c_x,b,y,c,z); Kokkos::parallel_for( "KokkosBlas::Test::TeamUpdate", policy, KOKKOS_LAMBDA(const team_member &teamMember) { const int teamId = teamMember.league_rank(); KokkosBlas::Experimental::update( - teamMember, a, Kokkos::subview(c_x, Kokkos::ALL(), teamId), b, - Kokkos::subview(y, Kokkos::ALL(), teamId), c, - Kokkos::subview(z, Kokkos::ALL(), teamId)); + teamMember, a, + Kokkos::subview(x.d_view_const, Kokkos::ALL(), teamId), b, + Kokkos::subview(y.d_view, Kokkos::ALL(), teamId), c, + Kokkos::subview(z.d_view, Kokkos::ALL(), teamId)); }); - KokkosBlas::dot(r, z, z); + KokkosBlas::dot(r, z.d_view, z.d_view); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(const_non_const_result, expected_result[k], @@ -326,8 +280,7 @@ int test_team_update() { // Device>(132231); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; @@ -387,8 +340,7 @@ int 
test_team_update_mv() { // view_type_c_lr, Device>(132231,5); #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_ls; typedef Kokkos::View view_type_b_ls; diff --git a/blas/unit_test/Test_Blas1_update.hpp b/blas/unit_test/Test_Blas1_update.hpp index 189dc2afb6..07445f595e 100644 --- a/blas/unit_test/Test_Blas1_update.hpp +++ b/blas/unit_test/Test_Blas1_update.hpp @@ -27,51 +27,15 @@ void impl_test_update(int N) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View< - ScalarA * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeA; - typedef Kokkos::View< - ScalarB * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeB; - typedef Kokkos::View< - ScalarC * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeC; - ScalarA a = 3; ScalarB b = 5; ScalarC c = 7; double eps = std::is_same::value ? 
2 * 1e-5 : 1e-7; - BaseTypeA b_x("X", N); - BaseTypeB b_y("Y", N); - BaseTypeC b_z("Y", N); - BaseTypeC b_org_z("Org_Z", N); - - ViewTypeA x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeB y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - ViewTypeC z = Kokkos::subview(b_z, Kokkos::ALL(), 0); - typename ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; - - typename BaseTypeA::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeB::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - typename BaseTypeC::HostMirror h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeB::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - typename ViewTypeC::HostMirror h_z = Kokkos::subview(h_b_z, Kokkos::ALL(), 0); + view_stride_adapter x("X", N); + view_stride_adapter y("Y", N); + view_stride_adapter z("Z", N); + view_stride_adapter org_z("Org_Z", N); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -79,52 +43,48 @@ void impl_test_update(int N) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, b_z); - auto h_b_org_z = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); - auto h_org_z = Kokkos::subview(h_b_org_z, Kokkos::ALL(), 0); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - 
Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - KokkosBlas::update(a, x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + KokkosBlas::update(a, x.d_view, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + + c * org_z.h_view(i)), + z.h_view(i), eps); } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::update(a, c_x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::update(a, x.d_view_const, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + + c * org_z.h_view(i)), + z.h_view(i), eps); } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::update(a, c_x, b, c_y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::update(a, x.d_view_const, b, y.d_view_const, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { - EXPECT_NEAR_KK( - static_cast(a * h_x(i) + b * h_y(i) + c * h_org_z(i)), h_z(i), - eps); + EXPECT_NEAR_KK(static_cast(a * x.h_view(i) + b * y.h_view(i) + + c * org_z.h_view(i)), + z.h_view(i), eps); } } @@ -134,30 +94,10 @@ void impl_test_update_mv(int N, int K) { typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; - typedef multivector_layout_adapter vfA_type; - typedef multivector_layout_adapter vfB_type; - typedef multivector_layout_adapter vfC_type; - - typename vfA_type::BaseType b_x("X", N, K); - typename vfB_type::BaseType b_y("Y", N, K); - typename vfC_type::BaseType b_z("Z", N, K); - typename vfC_type::BaseType 
b_org_z("Z", N, K); - - ViewTypeA x = vfA_type::view(b_x); - ViewTypeB y = vfB_type::view(b_y); - ViewTypeC z = vfC_type::view(b_z); - - typedef multivector_layout_adapter h_vfA_type; - typedef multivector_layout_adapter h_vfB_type; - typedef multivector_layout_adapter h_vfC_type; - - typename h_vfA_type::BaseType h_b_x = Kokkos::create_mirror_view(b_x); - typename h_vfB_type::BaseType h_b_y = Kokkos::create_mirror_view(b_y); - typename h_vfC_type::BaseType h_b_z = Kokkos::create_mirror_view(b_z); - - typename ViewTypeA::HostMirror h_x = h_vfA_type::view(h_b_x); - typename ViewTypeB::HostMirror h_y = h_vfB_type::view(h_b_y); - typename ViewTypeC::HostMirror h_z = h_vfC_type::view(h_b_z); + view_stride_adapter x("X", N, K); + view_stride_adapter y("Y", N, K); + view_stride_adapter z("Z", N, K); + view_stride_adapter org_z("Org_Z", N, K); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -165,53 +105,50 @@ void impl_test_update_mv(int N, int K) { { ScalarA randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarB randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarC randStart, randEnd; Test::getRandomBounds(10.0, randStart, randEnd); - Kokkos::fill_random(b_z, rand_pool, randStart, randEnd); + Kokkos::fill_random(z.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(b_org_z, b_z); - auto h_b_org_z = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); + Kokkos::deep_copy(org_z.h_base, z.d_base); - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); - ScalarA a = 3; - ScalarB b = 5; - ScalarC c = 5; - typename 
ViewTypeA::const_type c_x = x; - typename ViewTypeB::const_type c_y = y; + ScalarA a = 3; + ScalarB b = 5; + ScalarC c = 5; double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - KokkosBlas::update(a, x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + KokkosBlas::update(a, x.d_view, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_y(i, j) + - c * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } - Kokkos::deep_copy(b_z, b_org_z); - KokkosBlas::update(a, c_x, b, y, c, z); - Kokkos::deep_copy(h_b_z, b_z); + Kokkos::deep_copy(z.d_base, org_z.h_base); + KokkosBlas::update(a, x.d_view_const, b, y.d_view, c, z.d_view); + Kokkos::deep_copy(z.h_base, z.d_base); for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { - EXPECT_NEAR_KK(static_cast(a * h_x(i, j) + b * h_y(i, j) + - c * h_b_org_z(i, j)), - h_z(i, j), eps); + EXPECT_NEAR_KK( + static_cast(a * x.h_view(i, j) + b * y.h_view(i, j) + + c * org_z.h_view(i, j)), + z.h_view(i, j), eps); } } } @@ -251,31 +188,28 @@ int test_update() { // Device>(132231); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_update(0); - Test::impl_test_update(13); - Test::impl_test_update(1024); - // Test::impl_test_update(132231); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update(1024); - Test::impl_test_update(1024); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View 
view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_update(0); + Test::impl_test_update(13); + Test::impl_test_update(1024); + // Test::impl_test_update(132231); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update(1024); + Test::impl_test_update(1024); +#endif return 1; } @@ -314,30 +248,28 @@ int test_update_mv() { Device>(132231, 5); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; typedef Kokkos::View - view_type_c_ls; Test::impl_test_update_mv(0, 5); Test::impl_test_update_mv(13, 5); - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(132231, 5); - #endif - - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_update_mv(1024, 5); - Test::impl_test_update_mv(1024, 5); - #endif - */ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_update_mv(0, 5); + Test::impl_test_update_mv(13, 5); + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(132231, 5); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_update_mv(1024, 5); + Test::impl_test_update_mv(1024, 5); +#endif return 1; } diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index dce07df9bc..dc83ac82f5 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -28,8 +28,6 @@ void impl_test_gemv(const char* mode, int M, int N) { typedef typename ViewTypeY::value_type ScalarY; typedef Kokkos::ArithTraits KAT_Y; - typedef multivector_layout_adapter vfA_type; 
- const ScalarA alpha = 3; ScalarY beta = 5; typename KAT_Y::mag_type const eps = KAT_Y::epsilon(); @@ -43,22 +41,11 @@ void impl_test_gemv(const char* mode, int M, int N) { ldx = M; ldy = N; } - typename vfA_type::BaseType b_A("A", M, N); - ViewTypeX x("X", ldx); - ViewTypeY y("Y", ldy); - ViewTypeY org_y("Org_Y", ldy); - - ViewTypeA A = vfA_type::view(b_A); - typename ViewTypeX::const_type c_x = x; - typename ViewTypeA::const_type c_A = A; - - typedef multivector_layout_adapter h_vfA_type; - typename h_vfA_type::BaseType h_b_A = Kokkos::create_mirror_view(b_A); - - typename ViewTypeA::HostMirror h_A = h_vfA_type::view(h_b_A); - typename ViewTypeX::HostMirror h_x = Kokkos::create_mirror_view(x); - typename ViewTypeY::HostMirror h_y = Kokkos::create_mirror_view(y); + view_stride_adapter A("A", M, N); + view_stride_adapter x("X", ldx); + view_stride_adapter y("Y", ldy); + view_stride_adapter org_y("Org_Y", ldy); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -69,17 +56,17 @@ void impl_test_gemv(const char* mode, int M, int N) { { ScalarX randStart, randEnd; Test::getRandomBounds(max_valX, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarY randStart, randEnd; Test::getRandomBounds(max_valY, randStart, randEnd); - Kokkos::fill_random(y, rand_pool, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); } { ScalarA randStart, randEnd; Test::getRandomBounds(max_valA, randStart, randEnd); - Kokkos::fill_random(b_A, rand_pool, randStart, randEnd); + Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); } const typename KAT_Y::mag_type max_error = @@ -87,26 +74,22 @@ void impl_test_gemv(const char* mode, int M, int N) { const typename KAT_Y::mag_type tol = max_error * eps * 2; // adding small fudge factor of 2 - Kokkos::deep_copy(org_y, y); - auto h_org_y = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), org_y); - - 
Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_y, y); - Kokkos::deep_copy(h_b_A, b_A); + Kokkos::deep_copy(org_y.h_base, y.d_base); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(A.h_base, A.d_base); Kokkos::View expected("expected aAx+by", ldy); - Kokkos::deep_copy(expected, h_org_y); - vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected); + Kokkos::deep_copy(expected, org_y.h_view); + vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); - KokkosBlas::gemv(mode, alpha, A, x, beta, y); - Kokkos::deep_copy(h_y, y); + KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); int numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > tol) { + if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) { numErrors++; std::cerr << __FILE__ << ":" << __LINE__ - << ": expected(i)=" << expected(i) << ", h_y(i)=" << h_y(i) + << ": expected(i)=" << expected(i) << ", h_y(i)=" << y.h_view(i) << std::endl; } } @@ -114,23 +97,23 @@ void impl_test_gemv(const char* mode, int M, int N) { << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y, org_y); - KokkosBlas::gemv(mode, alpha, A, c_x, beta, y); - Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > tol) numErrors++; + if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect"; - Kokkos::deep_copy(y, org_y); - KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y); - Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(y.d_base, org_y.h_base); + KokkosBlas::gemv(mode, alpha, A.d_view_const, 
x.d_view_const, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::abs(expected(i) - h_y(i)) > tol) numErrors++; + if (KAT_Y::abs(expected(i) - y.h_view(i)) > tol) numErrors++; } EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta @@ -139,18 +122,18 @@ void impl_test_gemv(const char* mode, int M, int N) { // This should overwrite the NaNs with the correct result. beta = KAT_Y::zero(); // beta changed, so update the correct answer - vanillaGEMV(mode[0], alpha, h_A, h_x, beta, expected); - Kokkos::deep_copy(y, KAT_Y::nan()); - KokkosBlas::gemv(mode, alpha, A, x, beta, y); - Kokkos::deep_copy(h_y, y); + vanillaGEMV(mode[0], alpha, A.h_view, x.h_view, beta, expected); + Kokkos::deep_copy(y.d_view, KAT_Y::nan()); + KokkosBlas::gemv(mode, alpha, A.d_view, x.d_view, beta, y.d_view); + Kokkos::deep_copy(y.h_base, y.d_base); numErrors = 0; for (int i = 0; i < ldy; i++) { - if (KAT_Y::isNan(h_y(i)) || - KAT_Y::abs(expected(i) - h_y(i)) > + if (KAT_Y::isNan(y.h_view(i)) || + KAT_Y::abs(expected(i) - y.h_view(i)) > KAT_Y::abs(alpha * max_valA * max_valX * ldx * eps * 2)) { numErrors++; std::cerr << __FILE__ << ":" << __LINE__ << ": expected(" << i - << ")=" << expected(i) << ", h_y(" << i << ")=" << h_y(i) + << ")=" << expected(i) << ", h_y(" << i << ")=" << y.h_view(i) << ", eps=" << eps << ", 1024*2*eps=" << 1024 * 2 * KAT_Y::epsilon() << std::endl; } @@ -218,33 +201,36 @@ int test_gemv(const char* mode) { // Device>(mode,132231,1024); #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View - view_type_a_ls; typedef Kokkos::View - view_type_b_ls; typedef Kokkos::View - view_type_c_ls; Test::impl_test_gemv( mode, 0, 1024); Test::impl_test_gemv( mode, 1024, 0); - Test::impl_test_gemv( mode, 13, 13); Test::impl_test_gemv( 
mode, 13, 1024); Test::impl_test_gemv( mode, 50, 40); - Test::impl_test_gemv( mode, 1024, 1024); Test::impl_test_gemv( mode, 2131, 2131); - // Test::impl_test_gemv(mode,132231,1024); - #endif +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + typedef Kokkos::View view_type_b_ls; + typedef Kokkos::View view_type_c_ls; + Test::impl_test_gemv( + mode, 0, 1024); + Test::impl_test_gemv( + mode, 1024, 0); + Test::impl_test_gemv( + mode, 13, 13); + Test::impl_test_gemv( + mode, 13, 1024); + Test::impl_test_gemv( + mode, 50, 40); + Test::impl_test_gemv( + mode, 1024, 1024); + Test::impl_test_gemv( + mode, 2131, 2131); + // Test::impl_test_gemv(mode,132231,1024); +#endif - #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_gemv( mode, 1024, 1024); Test::impl_test_gemv( mode, 1024, 1024); #endif - */ +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_gemv( + mode, 1024, 1024); + Test::impl_test_gemv( + mode, 1024, 1024); +#endif return 1; } diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 130187ef35..4724621f46 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -62,45 +62,84 @@ #endif namespace Test { -template ::value> -struct multivector_layout_adapter; +// Utility class for testing kernels with rank-1 and rank-2 views that may be +// LayoutStride. Simplifies making a LayoutStride view of a given size that is +// actually noncontiguous, and host-device transfers for checking results on +// host. +// +// Constructed with label and extent(s), and then provides 5 views as members: +// - d_view, and a const-valued alias d_view_const +// - h_view +// - d_base +// - h_base +// d_view is of type ViewType, and has the extents passed to the constructor. +// h_view is a mirror of d_view. 
+// d_base (and its mirror h_base) are contiguous views, so they can be +// deep-copied to each other. d_view aliases d_base, and h_view aliases h_base. +// This means that copying between d_base and h_base +// also copies between d_view and h_view. template -struct multivector_layout_adapter { - typedef typename ViewType::value_type Scalar; - typedef typename ViewType::device_type Device; - typedef Kokkos::View - BaseTypeRight; - typedef Kokkos::View - BaseTypeDefault; - typedef - typename std::conditional::value, - BaseTypeRight, BaseTypeDefault>::type BaseType; - - static ViewType view(const BaseType& v) { - return Kokkos::subview(v, Kokkos::ALL, Kokkos::ALL, 0); - }; -}; +struct view_stride_adapter { + static_assert(Kokkos::is_view_v, + "view_stride_adapter: ViewType must be a Kokkos::View"); + static_assert(ViewType::rank >= 1 && ViewType::rank <= 2, + "view_stride_adapter: ViewType must be rank 1 or rank 2"); + + static constexpr bool strided = std::is_same::value; + static constexpr int rank = ViewType::rank; + + using DView = ViewType; + using HView = typename DView::HostMirror; + // If not strided, the base view types are the same as DView/HView. + // But if strided, the base views have one additional dimension, so that + // d_view/h_view have stride > 1 between consecutive elements. 
+ using DViewBase = std::conditional_t< + strided, + Kokkos::View, + DView>; + using HViewBase = typename DViewBase::HostMirror; + + view_stride_adapter(const std::string& label, int m, int n = 1) { + if constexpr (rank == 1) { + if constexpr (strided) { + d_base = DViewBase(label, m, 2); + h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + d_view = Kokkos::subview(d_base, Kokkos::ALL(), 0); + h_view = Kokkos::subview(h_base, Kokkos::ALL(), 0); + } else { + d_base = DViewBase(label, m); + h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + d_view = d_base; + h_view = h_base; + } + } else { + if constexpr (strided) { + d_base = DViewBase(label, m, n, 2); + h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + d_view = + Kokkos::subview(d_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); + h_view = + Kokkos::subview(h_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); + } else { + d_base = DViewBase(label, m, n); + h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + d_view = d_base; + h_view = h_base; + } + } + d_view_const = d_view; + } -template -struct multivector_layout_adapter { - typedef typename ViewType::value_type Scalar; - typedef typename ViewType::device_type Device; - typedef Kokkos::View - BaseTypeRight; - typedef Kokkos::View - BaseTypeDefault; - typedef - typename std::conditional::value, - BaseTypeRight, BaseTypeDefault>::type BaseType; - - static ViewType view(const BaseType& v) { - return Kokkos::subview(v, Kokkos::ALL, Kokkos::ALL); - }; + // Have both const and nonconst versions of d_view (with same underlying + // data), since we often test BLAS with both + DView d_view; + typename DView::const_type d_view_const; + HView h_view; + DViewBase d_base; + HViewBase h_base; }; template From a176b931b9d07eb526eda0fabd93dfe3aca60a2b Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Mon, 17 Apr 2023 09:04:50 -0600 Subject: [PATCH 248/442] Fix #1786: check that work array is contiguous in SVD 
(#1793) * Extend batched dense SVD test for #1786 * Batched SVD: check that work view is contiguous --- .../impl/KokkosBatched_SVD_Serial_Impl.hpp | 22 +++ .../unit_test/Test_Batched_SerialSVD.hpp | 127 +++++++++++++----- 2 files changed, 119 insertions(+), 30 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp index 5a2cc638c4..20dab77092 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp @@ -29,6 +29,19 @@ KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_USV_Tag, const AViewType &A, const SViewType &sigma, const VViewType &Vt, const WViewType &work) { + static_assert(Kokkos::is_view_v && AViewType::rank == 2, + "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && UViewType::rank == 2, + "SVD: U must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, + "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && VViewType::rank == 2, + "SVD: V must be a rank-2 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, + "SVD: W must be a rank-1 view"); + static_assert( + !std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using value_type = typename AViewType::non_const_value_type; return KokkosBatched::SerialSVDInternal::invoke( A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), U.data(), @@ -41,6 +54,15 @@ template KOKKOS_INLINE_FUNCTION int SerialSVD::invoke(SVD_S_Tag, const AViewType &A, const SViewType &sigma, const WViewType &work) { + static_assert(Kokkos::is_view_v && AViewType::rank == 2, + "SVD: A must be a rank-2 view"); + static_assert(Kokkos::is_view_v && SViewType::rank == 1, + "SVD: s must be a rank-1 view"); + static_assert(Kokkos::is_view_v && WViewType::rank == 1, + "SVD: W must be a rank-1 view"); + static_assert( + !std::is_same_v, + "SVD: W must be contiguous (not LayoutStride)"); using value_type = 
typename AViewType::non_const_value_type; return KokkosBatched::SerialSVDInternal::invoke( A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), nullptr, 0, diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index a841cc7ba9..5aa832f0df 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -36,45 +36,35 @@ float svdEpsilon() { } } // namespace Test -template -double simpleNorm2(const Vector& v) { - using Scalar = typename Vector::non_const_value_type; - using KAT = Kokkos::ArithTraits; - auto vhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v); - double d = 0; - for (size_t i = 0; i < v.extent(0); i++) { - double m = KAT::abs(vhost(i)); - d += m * m; - } - return std::sqrt(d); -} - +// NOTE: simpleDot and simpleNorm2 currently support only real scalars (OK since +// SVD does as well) template typename V1::non_const_value_type simpleDot(const V1& v1, const V2& v2) { using Scalar = typename V1::non_const_value_type; - using KAT = Kokkos::ArithTraits; - auto v1host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v1); - auto v2host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), v2); - typename V1::non_const_value_type val = KAT::zero(); - for (size_t i = 0; i < v1.extent(0); i++) { - val += v1host(i) * v2host(i); - } - return val; + Scalar d; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, v1.extent(0)), + KOKKOS_LAMBDA(int i, Scalar& ld) { ld += v1(i) * v2(i); }, d); + return d; +} +template +typename V::non_const_value_type simpleNorm2(const V& v) { + return Kokkos::sqrt(simpleDot(v, v)); } // Check that all columns of X are unit length and pairwise orthogonal template void verifyOrthogonal(const Mat& X) { - using value_type = typename Mat::non_const_value_type; - int k = X.extent(1); + using Scalar = typename Mat::non_const_value_type; + int k = X.extent(1); for (int i = 0; i < k; 
i++) { auto col1 = Kokkos::subview(X, Kokkos::ALL(), i); double len = simpleNorm2(col1); - Test::EXPECT_NEAR_KK(len, 1.0, Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(len, 1.0, Test::svdEpsilon()); for (int j = 0; j < i; j++) { auto col2 = Kokkos::subview(X, Kokkos::ALL(), j); - double d = Kokkos::ArithTraits::abs(simpleDot(col1, col2)); - Test::EXPECT_NEAR_KK(d, 0.0, Test::svdEpsilon()); + double d = Kokkos::ArithTraits::abs(simpleDot(col1, col2)); + Test::EXPECT_NEAR_KK(d, 0.0, Test::svdEpsilon()); } } } @@ -82,8 +72,8 @@ void verifyOrthogonal(const Mat& X) { template void verifySVD(const AView& A, const UView& U, const VtView& Vt, const SigmaView& sigma) { - using value_type = typename AView::non_const_value_type; - using KAT = Kokkos::ArithTraits; + using Scalar = typename AView::non_const_value_type; + using KAT = Kokkos::ArithTraits; // Check that U/V columns are unit length and orthogonal, and that U * // diag(sigma) * V^T == A int m = A.extent(0); @@ -93,7 +83,7 @@ void verifySVD(const AView& A, const UView& U, const VtView& Vt, // NOTE: V^T being square and orthonormal implies that V is, so we don't have // to transpose it here. 
verifyOrthogonal(Vt); - AView usvt("USV^T", m, n); + Kokkos::View usvt("USV^T", m, n); for (int i = 0; i < maxrank; i++) { auto Ucol = Kokkos::subview(U, Kokkos::ALL(), Kokkos::make_pair(i, i + 1)); @@ -103,7 +93,7 @@ void verifySVD(const AView& A, const UView& U, const VtView& Vt, } for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { - Test::EXPECT_NEAR_KK(usvt(i, j), A(i, j), Test::svdEpsilon()); + Test::EXPECT_NEAR_KK(usvt(i, j), A(i, j), Test::svdEpsilon()); } } // Make sure all singular values are positive @@ -389,11 +379,86 @@ void testSVD() { testSerialSVDSingularValuesOnly(10, 8); } +template +KOKKOS_INLINE_FUNCTION constexpr auto Determinant(ViewT F) + -> std::enable_if_t::value && ViewT::rank == 2, + double> { + return (F(0, 0) * F(1, 1) * F(2, 2) + F(0, 1) * F(1, 2) * F(2, 0) + + F(0, 2) * F(1, 0) * F(2, 1) - + (F(0, 2) * F(1, 1) * F(2, 0) + F(0, 1) * F(1, 0) * F(2, 2) + + F(0, 0) * F(1, 2) * F(2, 1))); +} + +template +void GenerateTestData(ViewT data) { + using memory_space = typename ExeSpace::memory_space; + // finite difference should return dPK2dU. So, we can analyze two cases. 
+ Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(data, random, 1.0); + Kokkos::parallel_for( + Kokkos::RangePolicy(0, data.extent(0)), KOKKOS_LAMBDA(int i) { + auto data_i = Kokkos::subview(data, i, Kokkos::ALL(), Kokkos::ALL()); + while (Determinant(data_i) < 0.5) { + data_i(0, 0) += 1.0; + data_i(1, 1) += 1.0; + data_i(2, 2) += 1.0; + } + }); +} + +template +void testIssue1786() { + using memory_space = typename ExeSpace::memory_space; + constexpr int num_tests = 4; + Kokkos::View matrices("data", + num_tests); + GenerateTestData(matrices); + Kokkos::View Us("Us", + matrices.extent(0)); + Kokkos::View Ss("Ss", matrices.extent(0)); + Kokkos::View Vts("Vts", + matrices.extent(0)); + // Make sure the 2nd dimension of works is contiguous + Kokkos::View works( + "works", matrices.extent(0)); + Kokkos::View matrices_copy( + "matrices_copy", matrices.extent(0)); + // make a copy of the input data to avoid overwriting it + Kokkos::deep_copy(matrices_copy, matrices); + auto policy = Kokkos::RangePolicy(0, matrices.extent(0)); + Kokkos::parallel_for( + "polar decomposition", policy, KOKKOS_LAMBDA(int i) { + auto matrix_copy = + Kokkos::subview(matrices_copy, i, Kokkos::ALL(), Kokkos::ALL()); + auto U = Kokkos::subview(Us, i, Kokkos::ALL(), Kokkos::ALL()); + auto S = Kokkos::subview(Ss, i, Kokkos::ALL()); + auto Vt = Kokkos::subview(Vts, i, Kokkos::ALL(), Kokkos::ALL()); + auto work = Kokkos::subview(works, i, Kokkos::ALL()); + KokkosBatched::SerialSVD::invoke(KokkosBatched::SVD_USV_Tag{}, + matrix_copy, U, S, Vt, work); + }); + + auto Us_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Us); + auto Ss_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Ss); + auto Vts_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, Vts); + auto matrices_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace{}, matrices); + for (int i = 0; i < num_tests; i++) { + auto A = Kokkos::subview(matrices_h, i, Kokkos::ALL(), Kokkos::ALL()); 
+ auto U = Kokkos::subview(Us_h, i, Kokkos::ALL(), Kokkos::ALL()); + auto S = Kokkos::subview(Ss_h, i, Kokkos::ALL()); + auto Vt = Kokkos::subview(Vts_h, i, Kokkos::ALL(), Kokkos::ALL()); + verifySVD(A, U, Vt, S); + } +} + #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, batched_scalar_serial_svd_double) { // Test general SVD on a few different input sizes (full rank randomized) testSVD(); testSVD(); + testIssue1786(); + testIssue1786(); } #endif @@ -402,5 +467,7 @@ TEST_F(TestCategory, batched_scalar_serial_svd_float) { // Test general SVD on a few different input sizes (full rank randomized) testSVD(); testSVD(); + testIssue1786(); + testIssue1786(); } #endif From 26dac2932e2e1678287d8b19f792eec224a32c5d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 17 Apr 2023 12:22:29 -0600 Subject: [PATCH 249/442] scripts: Final changes for clang 10 --- scripts/cm_test_all_sandia | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 102e3b098b..44a3a9e795 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -157,7 +157,6 @@ fi if [[ "$HOSTNAME" == *solo* ]]; then # Warning: very generic name MACHINE=solo - module use /projects/netpub/clang/modulefiles fi if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then @@ -770,6 +769,7 @@ elif [ "$MACHINE" = "solo" ]; then module load cmake/3.22.3 BASE_MODULE_LIST="cmake/3.22.3,/" + BASE_MODULE_LIST_LLVM="cmake/3.22.3,/,gnu/10.2.1" BASE_MODULE_LIST_INTEL="cmake/3.22.3,gnu/8.2.1,/" ONEAPI_WARNING_FLAGS="" @@ -778,7 +778,7 @@ elif [ "$MACHINE" = "solo" ]; then if [ "$SPOT_CHECK" = "True" ]; then COMPILERS=( "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" - "llvm/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" + "llvm/10.0.1 $BASE_MODULE_LIST_LLVM "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then COMPILERS=("intel/19.0.5.281 
$BASE_MODULE_LIST_INTEL,mkl/19.0.5.281 "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" From 2b63c1a616bdba0618af1b8aff46a6cd9bb7e5df Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 17 Apr 2023 12:53:31 -0600 Subject: [PATCH 250/442] scripts: Fix github-DOCS --- .github/workflows/docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b429ec415f..a066ce315b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -15,7 +15,7 @@ jobs: steps: - name: Install Dependencies run: | - sudo apt install --no-install-recommends doxygen-latex + sudo apt install doxygen pip install sphinx pip install breathe pip install sphinx-rtd-theme From 0b871d129d27b7ab51b2b98e9ea158e47f575938 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 17 Apr 2023 17:18:32 -0600 Subject: [PATCH 251/442] Remove deprecated code --- blas/unit_test/Test_Blas3_gemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/unit_test/Test_Blas3_gemm.hpp b/blas/unit_test/Test_Blas3_gemm.hpp index d4aae792ea..13c52ec437 100644 --- a/blas/unit_test/Test_Blas3_gemm.hpp +++ b/blas/unit_test/Test_Blas3_gemm.hpp @@ -264,7 +264,7 @@ void impl_test_stream_gemm_psge2(const int M, const int N, const int K, using ViewTypeB = Kokkos::View; using ViewTypeC = Kokkos::View; using ScalarC = typename ViewTypeC::value_type; - using APT = Kokkos::Details::ArithTraits; + using APT = Kokkos::ArithTraits; using mag_type = typename APT::mag_type; const char tA[] = {"N"}; From 038def6154b6fd24f4087c9a4b1005128dd2cc64 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 Jan 2023 15:19:27 -0700 Subject: [PATCH 252/442] sparse: Add coo2crs, crs2coo and CooMatrix --- docs/developer/apidocs/sparse.rst | 15 + graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 1 + sparse/src/KokkosSparse_CooMatrix.hpp | 129 +++++++ sparse/src/KokkosSparse_ccs2crs.hpp | 2 +- sparse/src/KokkosSparse_coo2crs.hpp | 344 ++++++++++++++++++ 
sparse/src/KokkosSparse_crs2ccs.hpp | 2 +- sparse/src/KokkosSparse_crs2coo.hpp | 154 ++++++++ sparse/unit_test/Test_Sparse.hpp | 2 + .../Test_Sparse_TestUtils_RandCsMat.hpp | 2 +- sparse/unit_test/Test_Sparse_coo2crs.hpp | 329 +++++++++++++++++ sparse/unit_test/Test_Sparse_crs2coo.hpp | 142 ++++++++ test_common/KokkosKernels_TestUtils.hpp | 70 +++- 12 files changed, 1187 insertions(+), 5 deletions(-) create mode 100644 sparse/src/KokkosSparse_CooMatrix.hpp create mode 100644 sparse/src/KokkosSparse_coo2crs.hpp create mode 100644 sparse/src/KokkosSparse_crs2coo.hpp create mode 100644 sparse/unit_test/Test_Sparse_coo2crs.hpp create mode 100644 sparse/unit_test/Test_Sparse_crs2coo.hpp diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index ed877ac567..15509e90a0 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -11,6 +11,11 @@ ccsmatrix .. doxygenclass:: KokkosSparse::CcsMatrix :members: +coomatrix +--------- +.. doxygenclass:: KokkosSparse::CooMatrix + :members: + crs2ccs ------- .. doxygenfunction:: KokkosSparse::crs2ccs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, RowMapViewType row_map, ColIdViewType col_ids) @@ -21,6 +26,16 @@ ccs2crs .. doxygenfunction:: KokkosSparse::ccs2crs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, ColMapViewType col_map, RowIdViewType row_ids) .. doxygenfunction:: KokkosSparse::ccs2crs(KokkosSparse::CcsMatrix &ccsMatrix) +coo2crs +------- +.. doxygenfunction:: KokkosSparse::coo2crs(DimType, DimType, RowViewType, ColViewType, DataViewType) +.. doxygenfunction:: KokkosSparse::coo2crs(KokkosSparse::CooMatrix &cooMatrix) + +crs2coo +------- +.. doxygenfunction:: KokkosSparse::crs2coo(OrdinalType, OrdinalType, SizeType, ValViewType, RowMapViewType, ColIdViewType) +.. 
doxygenfunction:: KokkosSparse::crs2coo(KokkosSparse::CrsMatrix &crsMatrix) + spmv ---- diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index aa8180fae7..ff4382c930 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -829,6 +829,7 @@ struct D2_MIS_FixedPriority { InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; int numIter = 0; + (void)numIter; while (workRemain) { // do another iteration Kokkos::parallel_for( diff --git a/sparse/src/KokkosSparse_CooMatrix.hpp b/sparse/src/KokkosSparse_CooMatrix.hpp new file mode 100644 index 0000000000..631283cfab --- /dev/null +++ b/sparse/src/KokkosSparse_CooMatrix.hpp @@ -0,0 +1,129 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosSparse_CooMatrix.hpp +/// \brief Local sparse matrix interface +/// +/// This file provides KokkosSparse::CooMatrix. This implements a +/// local (no MPI) sparse matrix stored in coordinate ("Coo") format +/// which is also known as ivj or triplet format. + +#ifndef KOKKOS_SPARSE_COOMATRIX_HPP_ +#define KOKKOS_SPARSE_COOMATRIX_HPP_ + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_Error.hpp" +#include +#include + +namespace KokkosSparse { +/// \class CooMatrix +/// +/// \brief Coordinate format implementation of a sparse matrix. +/// +/// \tparam RowView The type of row index view. +/// \tparam ColumnView The type of column index view. 
+/// \tparam DataView The type of data view. +/// \tparam Device The Kokkos Device type. +/// \tparam MemoryTraits Traits describing how Kokkos manages and +/// accesses data. The default parameter suffices for most users. +/// +/// "Coo" stands for "coordinate format". +template +class CooMatrix { + public: + using execution_space = typename Device::execution_space; + using memory_space = typename Device::memory_space; + using data_type = typename DataView::non_const_value_type; + using const_data_type = typename DataView::const_value_type; + using row_type = typename RowView::non_const_value_type; + using const_row_type = typename RowView::const_value_type; + using column_type = typename ColumnView::non_const_value_type; + using const_column_type = typename ColumnView::const_value_type; + using size_type = size_t; + + static_assert(std::is_integral_v, + "RowView::value_type must be an integral."); + static_assert(std::is_integral_v, + "ColumnView::value_type must be an integral."); + + private: + size_type m_num_rows, m_num_cols; + + public: + RowView row; + ColumnView col; + DataView data; + + /// \brief Default constructor; constructs an empty sparse matrix. + KOKKOS_INLINE_FUNCTION + CooMatrix() : m_num_rows(0), m_num_cols(0) {} + + // clang-format off + /// \brief Constructor that accepts a column indices view, row indices view, and + /// values view. + /// + /// The matrix will store and use the column indices, row indices, and values + /// directly (by view, not by deep copy). + /// + /// \param nrows [in] The number of rows. + /// \param ncols [in] The number of columns. + /// \param row_in [in] The row indexes. + /// \param col_in [in] The column indexes. + /// \param data_in [in] The values. 
+ // clang-format on + CooMatrix(size_type nrows, size_type ncols, RowView row_in, ColumnView col_in, + DataView data_in) + : m_num_rows(nrows), + m_num_cols(ncols), + row(row_in), + col(col_in), + data(data_in) { + if (data.extent(0) != row.extent(0) || row.extent(0) != col.extent(0)) { + std::ostringstream os; + os << "data.extent(0): " << data.extent(0) << " != " + << "row.extent(0): " << row.extent(0) << " != " + << "col.extent(0): " << col.extent(0) << "."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + } + + //! The number of columns in the sparse matrix. + KOKKOS_INLINE_FUNCTION size_type numCols() const { return m_num_cols; } + + //! The number of rows in the sparse matrix. + KOKKOS_INLINE_FUNCTION size_type numRows() const { return m_num_rows; } + + //! The number of stored entries in the sparse matrix, including zeros. + KOKKOS_INLINE_FUNCTION size_type nnz() const { + assert(data.extent(0) == row.extent(0) && row.extent(0) == col.extent(0) && + "Error lengths of RowView != ColView != DataView"); + return data.extent(0); + } +}; + +/// \class is_coo_matrix +/// \brief is_coo_matrix::value is true if T is a CooMatrix<...>, false +/// otherwise +template +struct is_coo_matrix : public std::false_type {}; +template +struct is_coo_matrix> : public std::true_type {}; +template +struct is_coo_matrix> : public std::true_type {}; + +} // namespace KokkosSparse +#endif diff --git a/sparse/src/KokkosSparse_ccs2crs.hpp b/sparse/src/KokkosSparse_ccs2crs.hpp index 50fec77411..9b4bae2134 100644 --- a/sparse/src/KokkosSparse_ccs2crs.hpp +++ b/sparse/src/KokkosSparse_ccs2crs.hpp @@ -115,7 +115,7 @@ auto ccs2crs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, /// /// \tparam ScalarType The ccsMatrix::scalar_type /// \tparam OrdinalType The ccsMatrix::ordinal_type -/// \tparam Device The ccsMatrix::device_type +/// \tparam DeviceType The ccsMatrix::device_type /// \tparam MemoryTraits The ccsMatrix::memory_traits /// \tparam SizeType The ccsMatrix::size_type /// \param 
ccsMatrix The KokkosSparse::CcsMatrix. diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp new file mode 100644 index 0000000000..7e3ce2ccc4 --- /dev/null +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -0,0 +1,344 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosSparse_CooMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_Utils.hpp" +#include "Kokkos_UnorderedMap.hpp" +#include + +#ifndef _KOKKOSSPARSE_COO2CRS_HPP +#define _KOKKOSSPARSE_COO2CRS_HPP +namespace KokkosSparse { +namespace Impl { +template +class Coo2Crs { + private: + using RowViewScalarType = typename RowViewType::value_type; + using ColViewScalarType = typename ColViewType::value_type; + using DataViewScalarType = typename DataViewType::value_type; + using CrsST = DataViewScalarType; + using CrsOT = RowViewScalarType; + using CrsET = typename DataViewType::execution_space; + using CrsMT = void; + using CrsSzT = ColViewScalarType; + using CrsType = CrsMatrix; + using CrsValsViewType = typename CrsType::values_type; + using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; + using CrsColIdViewType = typename CrsType::index_type; + + using UmapValueViewType = Kokkos::View; + using UmapOpTypes = + Kokkos::UnorderedMapInsertOpTypes; + using UmapOpType = typename UmapOpTypes::AtomicAdd; + + // Make public for Kokkos::View + public: + using UmapHasherType = typename Kokkos::pod_hash; + using UmapEqualToType = typename 
Kokkos::pod_equal_to; + using UmapType = Kokkos::UnorderedMap; + + // Public for kokkos policies + struct coo2crsRp1 {}; + struct rowmapRp1 {}; + struct copyTp1 {}; + struct copyRp1 {}; + + using copyTp1Pt = Kokkos::TeamPolicy; + using copyTp1MemberType = typename copyTp1Pt::member_type; + + private: + using BmapViewType = Kokkos::View; + + using CrsRowMapView = Kokkos::View; + using CrsRowMapAtomicView = + Kokkos::View>; + using CrsValuesView = Kokkos::View; + using CrsColIdsView = Kokkos::View; + + CrsRowMapView m_crs_row_map; + CrsRowMapAtomicView m_crs_row_map_tmp; + CrsValuesView m_crs_vals; + CrsColIdsView m_crs_col_ids; + UmapType *m_umaps; + BmapViewType m_capacity_bmap; + BmapViewType m_tuple_bmap; + UmapOpType m_insert_op; + CrsOT m_nrows; + CrsOT m_ncols; + RowViewType m_row; + ColViewType m_col; + DataViewType m_data; + CrsSzT m_nnz; + + int m_n_tuples; + + public: + KOKKOS_INLINE_FUNCTION + void operator()(const coo2crsRp1 &, const int &idx) const { + auto i = m_row(idx); + auto j = m_col(idx); + auto is_inserted = m_tuple_bmap(idx); + + if (i >= m_nrows || j >= m_ncols) { + Kokkos::abort("tuple is out of bounds"); + } else if (!is_inserted && i >= 0 && j >= 0) { + if (m_umaps[i].insert(j, m_data(idx), m_insert_op).failed()) { + m_capacity_bmap(i) = true; // hmap at index i reached capacity + } else { + m_tuple_bmap(idx) = true; // checklist of inserted tuples + } + } + } + + // TODO: umap.size cannot be called in a kernel. + // Requires updating Kokkos::BitSet::count() to be + // a host device function. 
+ /* KOKKOS_INLINE_FUNCTION + void operator()(const rowmapRp1 &, const int &row_idx) const { + auto i = row_idx - 1; + m_crs_row_map(row_idx) = m_crs_row_map(i) + m_umaps[i].ptr->size(); + } */ + + KOKKOS_INLINE_FUNCTION + void operator()(const copyRp1 &, const int &i) const { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UN + for (int j = 0; j < m_ncols; j++) { + if (m_umaps[i].exists(j)) { + auto umap_idx = m_umaps[i].find(j); + auto offset = m_crs_row_map_tmp(i)++; + m_crs_vals(offset) = m_umaps[i].value_at(umap_idx); + m_crs_col_ids(offset) = m_umaps[i].key_at(umap_idx); + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const copyTp1 &, const copyTp1MemberType &member) const { + auto row_idx = member.league_rank(); + auto cpy_beg = m_crs_row_map(row_idx); + auto cpy_end = m_crs_row_map(row_idx + 1); + auto cpy_len = cpy_end - cpy_beg; + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, cpy_len), + [&](const int &i) { + auto offset = i + cpy_beg; + m_crs_vals(offset) = m_umaps[i].value_at(i); + m_crs_col_ids(offset) = m_umaps[i].key_at(i); + }); + } + + Coo2Crs(DimType m, DimType n, RowViewType row, ColViewType col, + DataViewType data) { + m_n_tuples = data.extent(0); + m_nrows = m; + m_ncols = n; + m_row = row; + m_col = col; + m_data = data; + + typename UmapType::size_type arg_capacity_hint = m_n_tuples / m_nrows / 4; + typename UmapType::hasher_type arg_hasher; + typename UmapType::equal_to_type arg_equal_to; + arg_capacity_hint = arg_capacity_hint < 16 ? 
16 : arg_capacity_hint; + + m_capacity_bmap = BmapViewType("m_capacity_bmap", m_nrows); + typename BmapViewType::HostMirror m_capacity_bmap_mirror = + Kokkos::create_mirror_view(m_capacity_bmap); + m_tuple_bmap = BmapViewType("m_tuple_bmap", m_n_tuples); + + m_crs_row_map = CrsRowMapView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map"), + m_nrows + 1); + + // Memory management notes for `umap_ptrs` and `m_umaps`: + // `umap_ptrs` is a two dimensional array. The first dimension contains + // pointers to mixed-memory (host and device memory). The second + // dimension is the array of UnorderedMap objects. Some of the object + // methods are callable from only the device (device-callable), others + // are callable from only the host. Some of the host-callable methods, + // such as rehash are intended to be observable on the device. + // See Kokkos::UnorderedMap for details. + // + // `m_umaps` is a single dimension array of device memory. This array + // contains a shallow copy of all the UnorderedMap members that are + // allocated manually below. + // + // Any time a host-callable method with device observable results is + // invoked, we must shallow-copy the given `umap_ptrs` member back to + // the device. + // + // However, since we are using shallow copies of objects of type + // UnorderedMap, we do not need to copy the device memory back to + // the host before using a host-callable method. 
+ + // Setup a nrows length array of Unordered Maps + m_umaps = reinterpret_cast( + Kokkos::kokkos_malloc("m_umaps", m_nrows * sizeof(UmapType))); + + using shallow_copy_to_device = + Kokkos::Impl::DeepCopy; + + UmapType **umap_ptrs = new UmapType *[m_nrows]; + + // TODO: use host-level parallel_for with tag rowmapRp1 + for (int i = 0; i < m_nrows; i++) { + umap_ptrs[i] = new UmapType(arg_capacity_hint, arg_hasher, arg_equal_to); + shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); + } + + using coo2crsRp1Pt = Kokkos::RangePolicy; + bool rehashed = true; + while (rehashed) { + Kokkos::parallel_for("coo2crsRp1", coo2crsRp1Pt(0, m_n_tuples), *this); + + CrsET().fence(); // Wait for bitmap writes to land + Kokkos::deep_copy(m_capacity_bmap_mirror, m_capacity_bmap); + CrsET().fence(); + + rehashed = false; + // TODO: convert to host-level parallel for. + for (int i = 0; i < m_nrows; i++) { + if (m_capacity_bmap_mirror(i)) { + umap_ptrs[i]->rehash(umap_ptrs[i]->capacity() * 2); + rehashed = true; + m_capacity_bmap_mirror(i) = false; + shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); + } + } + Kokkos::deep_copy(m_capacity_bmap, m_capacity_bmap_mirror); + CrsET().fence(); + } + + typename CrsRowMapView::HostMirror m_crs_row_map_h = + Kokkos::create_mirror_view(m_crs_row_map); + + // TODO: convert to host-level parallel_for / prefix sum + m_crs_row_map_h(0) = 0; + for (int i = 1; i < m_nrows + 1; i++) { + auto adj_i = i - 1; + auto sz = umap_ptrs[adj_i]->size(); + m_crs_row_map_h(i) = m_crs_row_map_h(adj_i) + sz; + } + + m_crs_row_map_tmp = CrsRowMapAtomicView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map_tmp"), + m_nrows + 1); + Kokkos::deep_copy(m_crs_row_map, m_crs_row_map_h); + Kokkos::deep_copy(m_crs_row_map_tmp, m_crs_row_map_h); + CrsET().fence(); + + m_nnz = m_crs_row_map_h(m_nrows); + + m_crs_vals = CrsValuesView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_vals"), m_nnz); + m_crs_col_ids = 
CrsColIdsView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_col_ids"), + m_nnz); + + using copyRp1Pt = Kokkos::RangePolicy; + Kokkos::parallel_for("copyRp1", copyRp1Pt(0, m_nrows), *this); + CrsET().fence(); + + // Cleanup + for (int i = 0; i < m_nrows; i++) { + delete umap_ptrs[i]; + } + delete[] umap_ptrs; + Kokkos::kokkos_free(m_umaps); + } + + CrsType get_crsMat() { + return CrsType("coo2crs", m_nrows, m_ncols, m_nnz, m_crs_vals, + m_crs_row_map, m_crs_col_ids); + } +}; +} // namespace Impl + +// clang-format off +/// +/// \brief Blocking function that converts a CooMatrix into a CrsMatrix. Values are summed. +/// \tparam DimType the dimension type +/// \tparam RowViewType The row array view type +/// \tparam ColViewType The column array view type +/// \tparam DataViewType The data array view type +/// \param m the number of rows +/// \param n the number of columns +/// \param row the array of row ids +/// \param col the array of col ids +/// \param data the array of data +/// \return A KokkosSparse::CrsMatrix. 
+// clang-format on +template +auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, + DataViewType data) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "RowViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "CalViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "DataViewType must be a Kokkos::View."); + static_assert(static_cast(RowViewType::rank) == 1, + "RowViewType must have rank 1."); + static_assert(static_cast(ColViewType::rank) == 1, + "ColViewType must have rank 1."); + static_assert(static_cast(DataViewType::rank) == 1, + "DataViewType must have rank 1."); +#endif + + static_assert(std::is_integral::value, + "RowViewType::value_type must be an integral."); + static_assert(std::is_integral::value, + "ColViewType::value_type must be an integral."); + + if (row.extent(0) != col.extent(0) || row.extent(0) != data.extent(0)) + Kokkos::abort("row.extent(0) = col.extent(0) = data.extent(0) required."); + + if (m <= 0 || n <= 0) Kokkos::abort("m > 0 and n > 0 required."); + + using Coo2crsType = + Impl::Coo2Crs; + Coo2crsType Coo2Crs(m, n, row, col, data); + return Coo2Crs.get_crsMat(); +} + +// clang-format off +/// +/// \brief Blocking function that converts a CooMatrix into a CrsMatrix. Values are summed. +/// \tparam DimType The dimension type +/// \tparam RowViewType The row array view type +/// \tparam ColViewType The column array view type +/// \tparam DataViewType The data array view type +/// \tparam DeviceType The cooMatrix::execution_space +/// \param cooMatrix The sparse matrix stored in coordinate ("Coo") format. +/// \return A KokkosSparse::CrsMatrix. 
+// clang-format on +template +auto coo2crs(KokkosSparse::CooMatrix &cooMatrix) { + return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row, + cooMatrix.col, cooMatrix.data); +} +} // namespace KokkosSparse +#endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/sparse/src/KokkosSparse_crs2ccs.hpp b/sparse/src/KokkosSparse_crs2ccs.hpp index 9def73b5db..c9265842cb 100644 --- a/sparse/src/KokkosSparse_crs2ccs.hpp +++ b/sparse/src/KokkosSparse_crs2ccs.hpp @@ -113,7 +113,7 @@ auto crs2ccs(OrdinalType nrows, OrdinalType ncols, SizeType nnz, /// /// \tparam ScalarType The crsMatrix::scalar_type /// \tparam OrdinalType The crsMatrix::ordinal_type -/// \tparam Device The crsMatrix::device_type +/// \tparam DeviceType The crsMatrix::device_type /// \tparam MemoryTraits The crsMatrix::memory_traits /// \tparam SizeType The crsMatrix::size_type /// \param crsMatrix The KokkosSparse::CrsMatrix. diff --git a/sparse/src/KokkosSparse_crs2coo.hpp b/sparse/src/KokkosSparse_crs2coo.hpp new file mode 100644 index 0000000000..58984f3bab --- /dev/null +++ b/sparse/src/KokkosSparse_crs2coo.hpp @@ -0,0 +1,154 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_CooMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#ifndef _KOKKOSSPARSE_CRS2COO_HPP +#define _KOKKOSSPARSE_CRS2COO_HPP +namespace KokkosSparse { +namespace Impl { +template +class Crs2Coo { + private: + using non_const_ordinal_type = std::remove_const_t; + using non_const_size_type = std::remove_const_t; + using coo_row_view = + typename Kokkos::View; + using coo_col_view = coo_row_view; + using coo_data_view = typename ValViewType::non_const_type; + using coo_type = + CooMatrix; + + non_const_ordinal_type m_nrows; + non_const_ordinal_type m_ncols; + non_const_size_type m_nnz; + + coo_data_view m_data; + coo_col_view m_col; + coo_row_view m_row; + + ValViewType m_vals; + RowMapViewType m_row_map; + ColIdViewType m_col_ids; + + using copy_tp1_pt = Kokkos::TeamPolicy; + using copy_tp1_member_type = typename copy_tp1_pt::member_type; + + public: + Crs2Coo(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, + RowMapViewType row_map, ColIdViewType col_ids) + : m_nrows(nrows), + m_ncols(ncols), + m_nnz(nnz), + m_vals(vals), + m_row_map(row_map), + m_col_ids(col_ids) { + m_data = coo_data_view( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_data"), nnz); + m_col = coo_col_view( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_col"), nnz); + m_row = coo_row_view( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_row"), nnz); + + copy_tp1_pt policy(m_nrows, 1, 1); + { + auto vec_len_max = policy.vector_length_max(); + copy_tp1_pt query_policy(m_nrows, 1, vec_len_max); + policy = copy_tp1_pt( + m_nrows, + query_policy.team_size_recommended(*this, Kokkos::ParallelForTag()), + vec_len_max); + } + + Kokkos::parallel_for("Crs2Coo", policy, *this); + DeviceType().fence(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const copy_tp1_member_type &member) const { + auto i = member.league_rank(); + auto 
row_start = m_row_map(i); + auto row_len = m_row_map(i + 1) - row_start; + auto row_end = row_start + row_len; + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, row_start, row_end), + [&](const int &id) { + m_data(id) = m_vals(id); + m_col(id) = m_col_ids(id); + m_row(id) = i; + }); + } + + coo_type get_cooMat() { + return coo_type(m_nrows, m_ncols, m_row, m_col, m_data); + } +}; +} // namespace Impl +// clang-format off +/// +/// \brief Blocking function that converts a CrsMatrix to a CooMatrix. +/// Crs values are copied into the CooMatrix in the order they appear +/// within the CrsMatrix, starting from row 0 to row nrows - 1. +/// \tparam OrdinalType The view value type associated with the ColIdViewType +/// \tparam SizeType The type of nnz +/// \tparam ValViewType The values view type +/// \tparam RowMapViewType The row map view type +/// \tparam ColIdViewType The column ids view type +/// \param nrows The number of rows in the crs matrix +/// \param ncols The number of columns in the crs matrix +/// \param nnz The number of non-zeros in the crs matrix +/// \param vals The values view of the crs matrix +/// \param row_map The row map view of the crs matrix +/// \param col_ids The col ids view of the crs matrix +/// \return A KokkosSparse::CooMatrix. +/// +// clang-format on +template +auto crs2coo(OrdinalType nrows, OrdinalType ncols, SizeType nnz, + ValViewType vals, RowMapViewType row_map, ColIdViewType col_ids) { + using Crs2cooType = Impl::Crs2Coo; + Crs2cooType crs2Coo(nrows, ncols, nnz, vals, row_map, col_ids); + return crs2Coo.get_cooMat(); +} + +/// +/// @brief Blocking function that converts a CrsMatrix to a CooMatrix. +/// Crs values are copied into the CooMatrix in the order they appear +/// within the CrsMatrix, starting from row 0 to row nrows - 1. 
+/// +/// \tparam ScalarType The crsMatrix::scalar_type +/// \tparam OrdinalType The crsMatrix::ordinal_type +/// \tparam DeviceType The crsMatrix::device_type +/// \tparam MemoryTraits The crsMatrix::memory_traits +/// \tparam SizeType The crsMatrix::size_type +/// \param crsMatrix The KokkosSparse::CrsMatrix. +/// \return A KokkosSparse::CooMatrix. +template +auto crs2coo(KokkosSparse::CrsMatrix &crsMatrix) { + return crs2coo(crsMatrix.numRows(), crsMatrix.numCols(), crsMatrix.nnz(), + crsMatrix.values, crsMatrix.graph.row_map, + crsMatrix.graph.entries); +} +} // namespace KokkosSparse +#endif // _KOKKOSSPARSE_CRS2COO_HPP \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index 647fff4c18..d0233a9c67 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -16,6 +16,8 @@ #ifndef TEST_SPARSE_HPP #define TEST_SPARSE_HPP +#include "Test_Sparse_coo2crs.hpp" +#include "Test_Sparse_crs2coo.hpp" #include "Test_Sparse_block_gauss_seidel.hpp" #include "Test_Sparse_Controls.hpp" #include "Test_Sparse_CrsMatrix.hpp" diff --git a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp index 856b888c1a..4e2aa7695c 100644 --- a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp +++ b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp @@ -45,7 +45,7 @@ void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { ASSERT_EQ(vals.extent(0), cm.get_nnz() + 1) << cm.info; auto row_ids = cm.get_ids(); - ASSERT_EQ(row_ids.extent(0), cm.get_dim1() * cm.get_dim2() + 1) << cm.info; + ASSERT_EQ(row_ids.extent(0), cm.get_nnz()) << cm.info; auto col_map = cm.get_map(); ASSERT_EQ(col_map.extent(0), cm.get_dim1() + 1); diff --git a/sparse/unit_test/Test_Sparse_coo2crs.hpp b/sparse/unit_test/Test_Sparse_coo2crs.hpp new file mode 100644 index 0000000000..9910dd876b --- /dev/null +++ b/sparse/unit_test/Test_Sparse_coo2crs.hpp @@ -0,0 
+1,329 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosSparse_coo2crs.hpp" +#include "KokkosSparse_crs2coo.hpp" +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +template +CrsType vanilla_coo2crs(size_t m, size_t n, RowType row, ColType col, + DataType data) { + using RowIndexType = typename RowType::value_type; + using ColIndexType = typename ColType::value_type; + using ValueType = typename DataType::value_type; + std::unordered_map *> + umap; + int nnz = 0; + + for (uint64_t i = 0; i < data.extent(0); i++) { + auto r = row(i); + auto c = col(i); + auto d = data(i); + + if (r >= 0 && c >= 0) { + if (umap.find(r) != umap.end()) { // exists + auto my_row = umap.at(r); + if (my_row->find(c) != my_row->end()) + my_row->at(c) += d; + else { + my_row->insert(std::make_pair(c, d)); + nnz++; + } + } else { // create a new row. 
+ auto new_row = new std::unordered_map(); + umap.insert(std::make_pair(r, new_row)); + new_row->insert(std::make_pair(c, d)); + nnz++; + } + } + } + + typename CrsType::row_map_type::non_const_type row_map("vanilla_row_map", + m + 1); + typename CrsType::values_type values("vanilla_values", nnz); + typename CrsType::staticcrsgraph_type::entries_type col_ids("vanilla_col_ids", + nnz); + + typename CrsType::row_map_type::non_const_type::HostMirror row_map_h = + Kokkos::create_mirror_view(row_map); + typename CrsType::values_type::HostMirror values_h = + Kokkos::create_mirror_view(values); + typename CrsType::staticcrsgraph_type::entries_type::HostMirror col_ids_h = + Kokkos::create_mirror_view(col_ids); + + int row_len = 0; + for (uint64_t i = 0; i < m; i++) { + if (umap.find(i) != umap.end()) row_len += umap.at(i)->size(); + row_map_h(i + 1) = row_len; + } + + for (uint64_t i = 0; i < m; i++) { + if (umap.find(i) == umap.end()) // Fully sparse row + continue; + + auto row_start = row_map_h(i); + auto row_end = row_map_h(i + 1); + auto my_row = umap.at(i); + auto iter = my_row->begin(); + for (auto j = row_start; j < row_end; j++, iter++) { + col_ids_h(j) = iter->first; + values_h(j) = iter->second; + } + delete my_row; + } + + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(col_ids, col_ids_h); + Kokkos::deep_copy(values, values_h); + + return CrsType("vanilla_coo2csr", m, n, nnz, values, row_map, col_ids); +} + +template +void check_crs_matrix(CrsType crsMat, RowType row, ColType col, DataType data, + std::string failure_info = "no failure information!") { + using value_type = typename DataType::value_type; + using ats = Kokkos::ArithTraits; + + // Copy coo to host + typename RowType::HostMirror row_h = Kokkos::create_mirror_view(row); + Kokkos::deep_copy(row_h, row); + typename ColType::HostMirror col_h = Kokkos::create_mirror_view(col); + Kokkos::deep_copy(col_h, col); + typename DataType::HostMirror data_h = Kokkos::create_mirror_view(data); + 
Kokkos::deep_copy(data_h, data); + + auto crsMatRef = vanilla_coo2crs( + crsMat.numRows(), crsMat.numCols(), row_h, col_h, data_h); + + auto crs_col_ids_ref_d = crsMatRef.graph.entries; + auto crs_row_map_ref_d = crsMatRef.graph.row_map; + auto crs_vals_ref_d = crsMatRef.values; + + using ViewTypeCrsColIdsRef = decltype(crs_col_ids_ref_d); + using ViewTypeCrsRowMapRef = decltype(crs_row_map_ref_d); + using ViewTypeCrsValsRef = decltype(crs_vals_ref_d); + + // Copy crs to host + typename ViewTypeCrsColIdsRef::HostMirror crs_col_ids_ref = + Kokkos::create_mirror_view(crs_col_ids_ref_d); + Kokkos::deep_copy(crs_col_ids_ref, crs_col_ids_ref_d); + typename ViewTypeCrsRowMapRef::HostMirror crs_row_map_ref = + Kokkos::create_mirror_view(crs_row_map_ref_d); + Kokkos::deep_copy(crs_row_map_ref, crs_row_map_ref_d); + typename ViewTypeCrsValsRef::HostMirror crs_vals_ref = + Kokkos::create_mirror_view(crs_vals_ref_d); + Kokkos::deep_copy(crs_vals_ref, crs_vals_ref_d); + + auto crs_col_ids_d = crsMat.graph.entries; + auto crs_row_map_d = crsMat.graph.row_map; + auto crs_vals_d = crsMat.values; + + using ViewTypeCrsColIds = decltype(crs_col_ids_d); + using ViewTypeCrsRowMap = decltype(crs_row_map_d); + using ViewTypeCrsVals = decltype(crs_vals_d); + + // Copy crs to host + typename ViewTypeCrsColIds::HostMirror crs_col_ids = + Kokkos::create_mirror_view(crs_col_ids_d); + Kokkos::deep_copy(crs_col_ids, crs_col_ids_d); + typename ViewTypeCrsRowMap::HostMirror crs_row_map = + Kokkos::create_mirror_view(crs_row_map_d); + Kokkos::deep_copy(crs_row_map, crs_row_map_d); + typename ViewTypeCrsVals::HostMirror crs_vals = + Kokkos::create_mirror_view(crs_vals_d); + Kokkos::deep_copy(crs_vals, crs_vals_d); + + Kokkos::fence(); + + ASSERT_EQ(crsMatRef.nnz(), crsMat.nnz()) << failure_info; + + for (int i = 0; i < crsMatRef.numRows(); i++) { + ASSERT_EQ(crs_row_map_ref(i), crs_row_map(i)) + << "crs_row_map_ref(" << i << " = " << crs_row_map_ref(i) << " != " + << "crs_row_map(" << i << " = " 
<< crs_row_map(i) << " -- " + << failure_info; + } + + for (int i = 0; i < crsMatRef.numRows(); ++i) { + auto row_start_ref = crs_row_map_ref(i); + auto row_stop_ref = crs_row_map_ref(i + 1); + auto row_len_ref = row_stop_ref - row_start_ref; + + auto row_start = crs_row_map(i); + auto row_len = crs_row_map(i + 1) - row_start; + + ASSERT_EQ(row_start_ref, row_start); + ASSERT_EQ(row_len_ref, row_len); + + for (auto j = row_start_ref; j < row_stop_ref; ++j) { + // Look for the corresponding col_id + auto col_id_ref = crs_col_ids_ref(j); + std::string fail_msg = "row: " + std::to_string(i) + + ", crs_col_ids_ref(" + std::to_string(j) + + ") = " + std::to_string(col_id_ref); + + auto k = row_start_ref; + for (; k < row_stop_ref; ++k) { + if (crs_col_ids(k) == col_id_ref) break; + } + if (k == row_stop_ref) + FAIL() << fail_msg << " not found in crs_col_ids!" << failure_info; + + // NOTE: ASSERT_EQ doesn't work -- values may be summed in different + // orders We sum at most m x n values. + auto eps = + crsMatRef.numCols() * crsMatRef.numRows() * 10e1 * ats::epsilon(); + EXPECT_NEAR_KK(crs_vals_ref(j), crs_vals(k), eps, + fail_msg + " mismatched values!" 
+ failure_info); + } + } +} + +template +void doCoo2Crs(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + RandCooMat cooMat(m, n, m * n, min_val, + max_val); + auto randRow = cooMat.get_row(); + auto randCol = cooMat.get_col(); + auto randData = cooMat.get_data(); + + std::string failure_info = + "\nBegin arguments for above failure...\n" + cooMat.info + + "scalar: " + std::string(typeid(ScalarType).name()) + "\n" + + "layout: " + std::string(typeid(LayoutType).name()) + "\n" + + "m: " + std::to_string(m) + ", n: " + std::to_string(n) + + "\n...end arguments for above failure.\n"; + + auto crsMat = KokkosSparse::coo2crs(m, n, randRow, randCol, randData); + check_crs_matrix(crsMat, randRow, randCol, randData, failure_info); +} + +template +void doAllScalarsCoo2Crs(size_t m, size_t n, int min, int max) { + doCoo2Crs(m, n, min, max); + doCoo2Crs(m, n, min, max); + doCoo2Crs, LayoutType, ExeSpaceType>(m, n, min, max); + doCoo2Crs, LayoutType, ExeSpaceType>(m, n, min, max); +} + +template +void doAllLayoutsCoo2Crs(size_t m, size_t n, int min, int max) { + doAllScalarsCoo2Crs(m, n, min, max); + doAllScalarsCoo2Crs(m, n, min, max); +} + +template +void doAllCoo2Crs(size_t m, size_t n) { + int min = 1, max = 10; + doAllLayoutsCoo2Crs(m, n, min, max); +} + +TEST_F(TestCategory, sparse_coo2crs) { + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + std::srand(ticks); + + // Square cases + for (size_t i = 1; i < 256; i *= 4) { + size_t dim = (std::rand() % 511) + 1; + doAllCoo2Crs(dim, dim); + } + + // Non-square cases + for (size_t i = 1; i < 256; i *= 4) { + size_t m = (std::rand() % 511) + 1; + size_t n = (std::rand() % 511) + 1; + while (n == m) n = (std::rand() % 511) + 1; + doAllCoo2Crs(m, n); + } + + RandCooMat cooMat(2, 2, 2 * 2, 10, + 10); + auto crsMatrix = KokkosSparse::coo2crs(2, 2, cooMat.get_row(), + cooMat.get_col(), cooMat.get_data()); + auto cooMatrix = KokkosSparse::crs2coo(crsMatrix); + + 
check_crs_matrix(crsMatrix, cooMatrix.row, cooMatrix.col, cooMatrix.data); +} + +TEST_F(TestCategory, sparse_coo2crs_staticMatrix_edgeCases) { + int m = 4; + int n = 4; + long long staticRow[16]{0, 1, 3, 2, 3, 2, 2, 2, 0, 0, 0, 1, 2, 0, 3, 0}; + long long staticCol[16]{1, 1, 2, 3, 3, 2, 3, 2, 0, 0, 1, 3, 1, 2, 0, 0}; + float staticData[16]{7.28411, 8.17991, 8.84304, 5.01788, 9.85646, 5.79404, + 8.42014, 1.90238, 8.24195, 4.39955, 3.2637, 5.4546, + 6.51895, 8.09302, 9.36294, 3.44206}; + Kokkos::View row("coo row", 16); + Kokkos::View col("coo col", 16); + Kokkos::View data("coo data", 16); + + typename Kokkos::View::HostMirror row_h = + Kokkos::create_mirror_view(row); + typename Kokkos::View::HostMirror col_h = + Kokkos::create_mirror_view(col); + typename Kokkos::View::HostMirror data_h = + Kokkos::create_mirror_view(data); + for (int i = 0; i < 16; i++) { + row_h(i) = staticRow[i]; + col_h(i) = staticCol[i]; + data_h(i) = staticData[i]; + } + + Kokkos::deep_copy(row, row_h); + Kokkos::deep_copy(col, col_h); + Kokkos::deep_copy(data, data_h); + + // Even partitions with multiple threads + auto crsMatTs4 = KokkosSparse::coo2crs(m, n, row, col, data); + check_crs_matrix(crsMatTs4, row_h, col_h, data_h); + + // Even partitions, single thread, fully sparse row + long long staticRowTs1[16]{0, 3, 0, 2, 2, 3, 0, 3, 2, 0, 0, 0, 0, 3, 3, 0}; + long long staticColTs1[16]{3, 1, 3, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 0, 0, 0}; + float staticDataTs1[16]{6.1355, 6.53989, 8.58559, 6.37476, 4.18964, 2.41146, + 1.82177, 1.4249, 1.52659, 5.50521, 8.0484, 3.98874, + 6.74709, 3.35072, 7.81944, 5.83494}; + for (int i = 0; i < 16; i++) { + row_h(i) = staticRowTs1[i]; + col_h(i) = staticColTs1[i]; + data_h(i) = staticDataTs1[i]; + } + Kokkos::deep_copy(row, row_h); + Kokkos::deep_copy(col, col_h); + Kokkos::deep_copy(data, data_h); + + auto crsMatTs1 = KokkosSparse::coo2crs(m, n, row, col, data); + check_crs_matrix(crsMatTs1, row_h, col_h, data_h); + + // Fully sparse + for (int i = 0; i < 
16; i++) { + row_h(i) = -staticRowTs1[i]; + col_h(i) = -staticColTs1[i]; + } + Kokkos::deep_copy(row, row_h); + Kokkos::deep_copy(col, col_h); + + auto crsMatFsTs1 = KokkosSparse::coo2crs(m, n, row, col, data); + check_crs_matrix(crsMatFsTs1, row_h, col_h, data); +} +} // namespace Test \ No newline at end of file diff --git a/sparse/unit_test/Test_Sparse_crs2coo.hpp b/sparse/unit_test/Test_Sparse_crs2coo.hpp new file mode 100644 index 0000000000..71d01e6005 --- /dev/null +++ b/sparse/unit_test/Test_Sparse_crs2coo.hpp @@ -0,0 +1,142 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosSparse_coo2crs.hpp" +#include "KokkosSparse_crs2coo.hpp" +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +template +void check_coo_matrix(CrsType crsMatRef, RowType row, ColType col, + DataType data) { + // Copy coo to host + typename RowType::HostMirror row_h = Kokkos::create_mirror_view(row); + Kokkos::deep_copy(row_h, row); + typename ColType::HostMirror col_h = Kokkos::create_mirror_view(col); + Kokkos::deep_copy(col_h, col); + typename DataType::HostMirror data_h = Kokkos::create_mirror_view(data); + Kokkos::deep_copy(data_h, data); + + // printf("coo in:\n"); + // for (unsigned i = 0; i < data_h.extent(0); i++) + // printf("(%lld, %lld, %g)\n", row_h(i), col_h(i), data_h(i)); + + auto crs_col_ids_ref_d = crsMatRef.graph.entries; + auto crs_row_map_ref_d = crsMatRef.graph.row_map; + auto crs_vals_ref_d = crsMatRef.values; + + using ViewTypeCrsColIdsRef = decltype(crs_col_ids_ref_d); + using ViewTypeCrsRowMapRef = decltype(crs_row_map_ref_d); + using ViewTypeCrsValsRef = decltype(crs_vals_ref_d); + + // Copy crs to host + typename ViewTypeCrsColIdsRef::HostMirror crs_col_ids_ref = + Kokkos::create_mirror_view(crs_col_ids_ref_d); + Kokkos::deep_copy(crs_col_ids_ref, crs_col_ids_ref_d); + typename ViewTypeCrsRowMapRef::HostMirror crs_row_map_ref = + Kokkos::create_mirror_view(crs_row_map_ref_d); + Kokkos::deep_copy(crs_row_map_ref, crs_row_map_ref_d); + typename ViewTypeCrsValsRef::HostMirror crs_vals_ref = + Kokkos::create_mirror_view(crs_vals_ref_d); + Kokkos::deep_copy(crs_vals_ref, crs_vals_ref_d); + + Kokkos::fence(); + + ASSERT_EQ(crsMatRef.nnz(), row.extent(0)); + ASSERT_EQ(crsMatRef.nnz(), col.extent(0)); + ASSERT_EQ(crsMatRef.nnz(), data.extent(0)); + + for (decltype(row.extent(0)) idx = 0; idx < row.extent(0); ++idx) { + auto row_id = row_h(idx); + auto col_id = col_h(idx); + auto val = data_h(idx); + std::string fail_msg = "idx - " + 
std::to_string(idx) + + " row: " + std::to_string(row_id) + + ", col: " + std::to_string(col_id); + + auto row_start_ref = crs_row_map_ref(row_id); + auto row_stop_ref = crs_row_map_ref(row_id + 1); + + auto crs_idx = row_start_ref; + for (; crs_idx < row_stop_ref; crs_idx++) { + if (crs_col_ids_ref(crs_idx) == col_id) { + // crs2coo does a direct copy, no need for an epsilon. + if (crs_vals_ref(crs_idx) == val) break; + } + } + if (crs_idx == row_stop_ref) + FAIL() << fail_msg << " not found in crsMatRef!"; + } +} + +template +void doCrs2Coo(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + using RandCrsMatType = RandCsMatrix; + RandCrsMatType crsMat(m, n, min_val, max_val, m == 0 || n == 0); + + using CrsOT = typename RandCrsMatType::IdViewTypeD::value_type; + using CrsType = typename KokkosSparse::CrsMatrix; + auto map = crsMat.get_map(); + auto ids = crsMat.get_ids(); + CrsType crsMatrix("doCrs2Coo", crsMat.get_dim1(), crsMat.get_dim2(), + crsMat.get_nnz(), crsMat.get_vals(), map, ids); + + auto cooMat = KokkosSparse::crs2coo(crsMatrix); + check_coo_matrix(crsMatrix, cooMat.row, cooMat.col, cooMat.data); +} + +template +void doAllScalarsCrs2Coo(size_t m, size_t n, int min, int max) { + doCrs2Coo(m, n, min, max); + doCrs2Coo(m, n, min, max); + doCrs2Coo, LayoutType, ExeSpaceType>(m, n, min, max); + doCrs2Coo, LayoutType, ExeSpaceType>(m, n, min, max); +} + +template +void doAllLayoutsCrs2Coo(size_t m, size_t n, int min, int max) { + doAllScalarsCrs2Coo(m, n, min, max); + doAllScalarsCrs2Coo(m, n, min, max); +} + +template +void doAllCrs2Coo(size_t m, size_t n) { + int min = 1, max = 10; + doAllLayoutsCrs2Coo(m, n, min, max); +} + +TEST_F(TestCategory, sparse_crs2coo) { + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + std::srand(ticks); + + // Square cases + for (size_t i = 1; i < 256; i *= 4) { + size_t dim = (std::rand() % 511) + 1; + doAllCrs2Coo(dim, dim); + } + + // Non-square cases + 
for (size_t i = 1; i < 256; i *= 4) { + size_t m = (std::rand() % 511) + 1; + size_t n = (std::rand() % 511) + 1; + while (n == m) n = (std::rand() % 511) + 1; + doAllCrs2Coo(m, n); + } +} +} // namespace Test \ No newline at end of file diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 4724621f46..4ad2d9b459 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -562,6 +562,64 @@ int string_compare_no_case(const char* str1, const char* str2) { int string_compare_no_case(const std::string& str1, const std::string& str2) { return string_compare_no_case(str1.c_str(), str2.c_str()); } +/// \brief Coo matrix class for testing purposes. +/// \tparam ScalarType +/// \tparam LayoutType +/// \tparam ExeSpaceType +template +class RandCooMat { + private: + using RowViewTypeD = Kokkos::View; + using ColViewTypeD = Kokkos::View; + using DataViewTypeD = Kokkos::View; + RowViewTypeD __row_d; + ColViewTypeD __col_d; + DataViewTypeD __data_d; + + template + T __getter_copy_helper(T src) { + T dst(std::string("RandCooMat.") + typeid(T).name() + " copy", + src.extent(0)); + Kokkos::deep_copy(dst, src); + ExeSpaceType().fence(); + return dst; + } + + public: + std::string info; + /// Constructs a random coo matrix with negative indices. + /// \param m The max row id + /// \param n The max col id + /// \param n_tuples The number of tuples. + /// \param min_val The minimum scalar value in the matrix. + /// \param max_val The maximum scalar value in the matrix. 
+ RandCooMat(int64_t m, int64_t n, int64_t n_tuples, ScalarType min_val, + ScalarType max_val) { + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + + info = std::string(std::string("RandCooMat<") + typeid(ScalarType).name() + + ", " + typeid(LayoutType).name() + ", " + + typeid(ExeSpaceType).name() + std::to_string(n) + + "...): rand seed: " + std::to_string(ticks) + "\n"); + Kokkos::Random_XorShift64_Pool random(ticks); + + __row_d = RowViewTypeD("RandCooMat.RowViewType", n_tuples); + Kokkos::fill_random(__row_d, random, -m, m); + + __col_d = ColViewTypeD("RandCooMat.ColViewType", n_tuples); + Kokkos::fill_random(__col_d, random, -n, n); + + __data_d = DataViewTypeD("RandCooMat.DataViewType", n_tuples); + Kokkos::fill_random(__data_d, random, min_val, max_val); + + ExeSpaceType().fence(); + } + auto get_row() { return __getter_copy_helper(__row_d); } + auto get_col() { return __getter_copy_helper(__col_d); } + auto get_data() { return __getter_copy_helper(__data_d); } +}; /// /brief Cs (Compressed Sparse) matrix class for testing purposes. 
/// This class is for testing purposes only and will generate a random @@ -574,10 +632,12 @@ int string_compare_no_case(const std::string& str1, const std::string& str2) { /// \tparam ExeSpaceType template class RandCsMatrix { - private: + public: using ValViewTypeD = Kokkos::View; using IdViewTypeD = Kokkos::View; using MapViewTypeD = Kokkos::View; + + private: int64_t __dim2; int64_t __dim1; int64_t __nnz = 0; @@ -624,8 +684,14 @@ class RandCsMatrix { // Copy to device Kokkos::deep_copy(__map_d, __map); - Kokkos::deep_copy(__ids_d, __ids); + IdViewTypeD tight_ids(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "RandCsMatrix.IdViewTypeD"), + __nnz); + Kokkos::deep_copy( + tight_ids, + Kokkos::subview(__ids, Kokkos::make_pair(0, static_cast(__nnz)))); ExeSpaceType().fence(); + __ids_d = tight_ids; } template From ec611fe92ab3589e98ae5f8606ecd4d1a37a52c0 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 18 Apr 2023 08:17:29 -0600 Subject: [PATCH 253/442] Blas1: adding execution space overload of axpy and axpby This will allow other libraries to execute this kernel on stream. It has been requested by PETSc and Trilinos... 
--- blas/impl/KokkosBlas1_axpby_impl.hpp | 7 +- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 36 ++++----- blas/impl/KokkosBlas1_axpby_spec.hpp | 77 ++++++++++--------- blas/src/KokkosBlas1_axpby.hpp | 45 +++++++---- .../tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp | 4 +- ...s3_gemm_standalone_perf_test_benchmark.cpp | 8 +- 6 files changed, 96 insertions(+), 81 deletions(-) diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index 00fc445ec9..0403ef24ff 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -302,8 +302,8 @@ struct Axpby_Functor -void Axpby_Generic(const AV& av, const XV& x, const BV& bv, const YV& y, +template +void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, const BV& bv, const YV& y, const SizeType startingColumn, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -323,9 +323,8 @@ void Axpby_Generic(const AV& av, const XV& x, const BV& bv, const YV& y, "KokkosBlas::Impl::Axpby_Generic: " "XV and YV must have rank 1."); - typedef typename YV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0 && b == 0) { Axpby_Functor op(x, y, av, bv, diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 4ef3201163..84b2856cac 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -43,7 +43,6 @@ namespace Impl { template struct Axpby_MV_Functor { - typedef typename YMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -286,7 +285,6 @@ template struct Axpby_MV_Functor { - typedef typename YMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -500,7 +498,6 @@ struct Axpby_MV_Functor struct Axpby_MV_Unroll_Functor { - typedef 
typename YMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -728,7 +725,6 @@ template { - typedef typename YMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -951,8 +947,8 @@ struct Axpby_MV_Unroll_Functor -void Axpby_MV_Unrolled(const AV& av, const XMV& x, const BV& bv, const YMV& y, +template +void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, const SizeType startingColumn, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -972,9 +968,8 @@ void Axpby_MV_Unrolled(const AV& av, const XMV& x, const BV& bv, const YMV& y, "KokkosBlas::Impl::Axpby_MV_Unrolled: " "XMV and YMV must have rank 2."); - typedef typename YMV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0 && b == 0) { Axpby_MV_Unroll_Functor op( @@ -1106,8 +1101,8 @@ void Axpby_MV_Unrolled(const AV& av, const XMV& x, const BV& bv, const YMV& y, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. 
-template -void Axpby_MV_Generic(const AV& av, const XMV& x, const BV& bv, const YMV& y, +template +void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -1127,9 +1122,8 @@ void Axpby_MV_Generic(const AV& av, const XMV& x, const BV& bv, const YMV& y, "KokkosBlas::Impl::Axpby_MV_Generic: " "XMV and YMV must have rank 2."); - typedef typename YMV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0 && b == 0) { Axpby_MV_Functor op(x, y, av, bv); @@ -1245,9 +1239,9 @@ void Axpby_MV_Generic(const AV& av, const XMV& x, const BV& bv, const YMV& y, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Left { - static void run(const AV& av, const XMV& x, const BV& bv, const YMV& y, + static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -1280,7 +1274,7 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. - Axpby_MV_Unrolled(av, X_cur, bv, Y_cur, j, + Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, a, b); } for (; j + 4 <= numCols; j += 4) { @@ -1290,7 +1284,7 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. 
- Axpby_MV_Unrolled(av, X_cur, bv, Y_cur, j, + Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, a, b); } for (; j < numCols; ++j) { @@ -1302,7 +1296,7 @@ struct Axpby_MV_Invoke_Left { // the functor doesn't have to do anything to them. typedef decltype(x_cur) XV; typedef decltype(y_cur) YV; - Axpby_Generic(av, x_cur, bv, y_cur, j, a, b); + Axpby_Generic(space, av, x_cur, bv, y_cur, j, a, b); } } }; @@ -1326,9 +1320,9 @@ struct Axpby_MV_Invoke_Left { // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Right { - static void run(const AV& av, const XMV& x, const BV& bv, const YMV& y, + static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -1354,9 +1348,9 @@ struct Axpby_MV_Invoke_Right { auto y_0 = Kokkos::subview(y, Kokkos::ALL(), 0); typedef decltype(x_0) XV; typedef decltype(y_0) YV; - Axpby_Generic(av, x_0, bv, y_0, 0, a, b); + Axpby_Generic(space, av, x_0, bv, y_0, 0, a, b); } else { - Axpby_MV_Generic(av, x, bv, y, a, b); + Axpby_MV_Generic(space, av, x, bv, y, a, b); } } }; diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index 6561163fd1..f5d7c96061 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct axpby_eti_spec_avail { MEM_SPACE) \ template <> \ struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ SCALAR, \ Kokkos::View, \ @@ -68,6 +69,7 @@ struct axpby_eti_spec_avail { MEM_SPACE) \ template <> \ struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ SCALAR, \ Kokkos::View, \ @@ -80,6 +82,7 @@ struct 
axpby_eti_spec_avail { }; \ template <> \ struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -127,16 +130,16 @@ namespace Impl { /// Any scalar coefficient of zero has BLAS semantics of /// ignoring the corresponding (multi)vector entry. This does NOT /// apply to coefficients in av and bv vectors, if they are used. -template ::value, - bool eti_spec_avail = axpby_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = axpby_eti_spec_avail::value> struct Axpby { - static void axpby(const AV& av, const XMV& X, const BV& bv, const YMV& Y); + static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y); }; -template -struct Axpby { - static void axpby(const AV& /* av */, const XMV& /* X */, const BV& /* bv */, +template +struct Axpby { + static void axpby(const execution_space& /*space*/, const AV& /* av */, const XMV& /* X */, const BV& /* bv */, const YMV& /* Y */) { static_assert(YMV::rank == 0, "Oh My God"); } @@ -144,11 +147,11 @@ struct Axpby { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Full specialization for XMV and YMV rank-2 Views. 
-template -struct Axpby { +template +struct Axpby { typedef typename YMV::size_type size_type; - static void axpby(const AV& av, const XMV& X, const BV& bv, const YMV& Y) { + static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); @@ -199,18 +202,18 @@ struct Axpby { typedef int index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(av, X, bv, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } else { typedef typename XMV::size_type index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(av, X, bv, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } Kokkos::Profiling::popRegion(); } @@ -218,8 +221,8 @@ struct Axpby { // Partial specialization for XMV, and YMV rank-2 Views, // and AV and BV scalars. 
-template -struct Axpby +struct Axpby { typedef typename XMV::non_const_value_type AV; @@ -228,7 +231,7 @@ struct Axpby ATA; typedef Kokkos::ArithTraits ATB; - static void axpby(const AV& alpha, const XMV& X, const BV& beta, + static void axpby(const execution_space& space, const AV& alpha, const XMV& X, const BV& beta, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby::axpby (MV): " @@ -301,18 +304,18 @@ struct Axpby::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } else { typedef typename XMV::size_type index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } Kokkos::Profiling::popRegion(); } @@ -320,8 +323,8 @@ struct Axpby -struct Axpby +struct Axpby { typedef typename XV::non_const_value_type AV; @@ -330,7 +333,7 @@ struct Axpby ATA; typedef Kokkos::ArithTraits ATB; - static void axpby(const AV& alpha, const XV& X, const BV& beta, const YV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); @@ -394,14 +397,12 @@ struct Axpby(INT_MAX)) { typedef int index_type; - Axpby_Generic( - alpha, X, beta, Y, 0, a, b); + Axpby_Generic(space, alpha, X, beta, Y, 0, a, b); } else { typedef typename XV::size_type index_type; - Axpby_Generic( - alpha, X, beta, Y, 0, a, b); + Axpby_Generic(space, alpha, X, beta, Y, 0, a, b); } Kokkos::Profiling::popRegion(); } @@ -422,6 +423,7 @@ struct Axpby, \ @@ -433,6 +435,7 
@@ struct Axpby, \ @@ -453,6 +456,7 @@ struct Axpby, \ @@ -462,6 +466,7 @@ struct Axpby >, \ 2, false, true>; \ extern template struct Axpby< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -478,6 +483,7 @@ struct Axpby, \ @@ -487,6 +493,7 @@ struct Axpby >, \ 2, false, true>; \ template struct Axpby< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 45b82e8fcd..825ab34d1f 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -30,14 +30,20 @@ namespace KokkosBlas { -template -void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { +template +void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, const YMV& Y) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::axpby: execution_space must be a valid Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::axpby: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::axpby: XMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::axpby: " "Y is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::axpby: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::axpby: Y is const. It must be nonconst, " @@ -68,32 +74,39 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { // Create unmanaged versions of the input Views. XMV and YMV may be // rank 1 or rank 2. AV and BV may be either rank-1 Views, or // scalar values. 
- typedef Kokkos::View > - XMV_Internal; - typedef Kokkos::View >; + using YMV_Internal = Kokkos::View > - YMV_Internal; - typedef typename KokkosKernels::Impl::GetUnifiedScalarViewType< - AV, XMV_Internal, true>::type AV_Internal; - typedef typename KokkosKernels::Impl::GetUnifiedScalarViewType< - BV, YMV_Internal, true>::type BV_Internal; + Kokkos::MemoryTraits >; + using AV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType< + AV, XMV_Internal, true>::type; + using BV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType< + BV, YMV_Internal, true>::type; AV_Internal a_internal = a; XMV_Internal X_internal = X; BV_Internal b_internal = b; YMV_Internal Y_internal = Y; - Impl::Axpby::axpby( - a_internal, X_internal, b_internal, Y_internal); + Impl::Axpby::axpby(space, a_internal, X_internal, b_internal, Y_internal); +} + +template +void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { + axpby(typename XMV::execution_space{}, a, X, b, Y); +} + +template +void axpy(const execution_space& space, const AV& a, const XMV& X, const YMV& Y) { + axpby(space, a, X, Kokkos::ArithTraits::one(), + Y); } template void axpy(const AV& a, const XMV& X, const YMV& Y) { - axpby(a, X, Kokkos::ArithTraits::one(), - Y); + axpy(typename XMV::execution_space{}, a, X, Y); } /// diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 3d7952a578..3cedf5fc3f 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_tpl_spec_avail { enum : bool { value = false }; }; @@ -36,6 +36,7 @@ namespace Impl { #define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct axpby_tpl_spec_avail< \ + ExecSpace, \ SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ 
-63,6 +64,7 @@ KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, #define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct axpby_tpl_spec_avail< \ + ExecSpace, \ SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index 778e1e478d..32d91e6b33 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -126,16 +126,16 @@ void run(const blas3_gemm_params& params) { const auto args = std::vector{params.m, params.n, params.k}; KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas3_GEMM, arg_names, args, + name, KokkosBlas3_GEMM, arg_names, args, params.repeat); KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas3_GEMM, arg_names, args, + name, KokkosBlas3_GEMM, arg_names, args, params.repeat); KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas3_GEMM, arg_names, args, + name, KokkosBlas3_GEMM, arg_names, args, params.repeat); KokkosKernelsBenchmark::register_benchmark( - name, KokkosBlas3_GEMM, arg_names, args, + name, KokkosBlas3_GEMM, arg_names, args, params.repeat); } From 81477dc0d0aadc9704c50a8654eba22b551941ee Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 18 Apr 2023 08:59:39 -0600 Subject: [PATCH 254/442] Update docs.yml --- .github/workflows/docs.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a066ce315b..70a97119c9 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -15,7 +15,8 @@ jobs: steps: - name: Install Dependencies run: | - sudo apt install doxygen + sudo apt-get update + sudo apt-get install --no-install-recommends doxygen-latex pip install sphinx pip install breathe pip install sphinx-rtd-theme From 
790c9f50667ca802861136c3f196d4a973f4abd0 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 18 Apr 2023 10:39:02 -0600 Subject: [PATCH 255/442] Blas1: adding execution space instance interface for abs --- blas/impl/KokkosBlas1_abs_impl.hpp | 18 +++------ blas/impl/KokkosBlas1_abs_spec.hpp | 40 +++++++++++--------- blas/src/KokkosBlas1_abs.hpp | 37 +++++++++++++----- blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp | 2 +- 4 files changed, 58 insertions(+), 39 deletions(-) diff --git a/blas/impl/KokkosBlas1_abs_impl.hpp b/blas/impl/KokkosBlas1_abs_impl.hpp index d23ba1d7ed..0334adbafe 100644 --- a/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/blas/impl/KokkosBlas1_abs_impl.hpp @@ -30,7 +30,6 @@ namespace Impl { // Entry-wise absolute value / magnitude: R(i,j) = abs(X(i,j)). template struct MV_Abs_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -68,7 +67,6 @@ struct MV_Abs_Functor { // Entry-wise, in-place absolute value / magnitude: R(i,j) = abs(R(i,j)). template struct MV_AbsSelf_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -98,7 +96,6 @@ struct MV_AbsSelf_Functor { // Single-vector, entry-wise absolute value / magnitude: R(i) = abs(X(i)). template struct V_Abs_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -128,7 +125,6 @@ struct V_Abs_Functor { // abs(R(i)). template struct V_AbsSelf_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -149,8 +145,8 @@ struct V_AbsSelf_Functor { // Invoke the "generic" (not unrolled) multivector functor that // computes entry-wise absolute value. 
-template -void MV_Abs_Generic(const RMV& R, const XMV& X) { +template +void MV_Abs_Generic(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Abs_Generic: RMV is not a Kokkos::View."); @@ -164,9 +160,8 @@ void MV_Abs_Generic(const RMV& R, const XMV& X) { "KokkosBlas::Impl::" "MV_Abs_Generic: XMV is not rank 2"); - typedef typename XMV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) @@ -179,8 +174,8 @@ void MV_Abs_Generic(const RMV& R, const XMV& X) { } // Variant of MV_Abs_Generic for single vectors (1-D Views) R and X. -template -void V_Abs_Generic(const RV& R, const XV& X) { +template +void V_Abs_Generic(const execution_space& space, const RV& R, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Abs_Generic: RV is not a Kokkos::View."); @@ -194,9 +189,8 @@ void V_Abs_Generic(const RV& R, const XV& X) { "KokkosBlas::Impl::" "V_Abs_Generic: XV is not rank 1"); - typedef typename XV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if ((void*)(R.data()) == (void*)(X.data())) { // if R and X are the same (alias one another) diff --git a/blas/impl/KokkosBlas1_abs_spec.hpp b/blas/impl/KokkosBlas1_abs_spec.hpp index 525d1e9ee8..ec14678816 100644 --- a/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_abs_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct abs_eti_spec_avail { enum : bool { value = false }; }; @@ -45,6 +45,7 @@ struct abs_eti_spec_avail { #define KOKKOSBLAS1_ABS_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ 
template <> \ struct abs_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct abs_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = abs_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = abs_eti_spec_avail::value> struct Abs { - static void abs(const RMV& R, const XMV& X); + static void abs(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Abs for single vectors (1-D Views). -template -struct Abs { - typedef typename XMV::size_type size_type; +template +struct Abs { + using size_type = typename XMV::size_type; - static void abs(const RMV& R, const XMV& X) { + static void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Abs<1-D>: RMV is not a Kokkos::View."); @@ -125,20 +127,20 @@ struct Abs { if (numRows < static_cast(INT_MAX)) { typedef int index_type; - V_Abs_Generic(R, X); + V_Abs_Generic(space, R, X); } else { typedef std::int64_t index_type; - V_Abs_Generic(R, X); + V_Abs_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Abs { - typedef typename XMV::size_type size_type; +template +struct Abs { + using size_type = typename XMV::size_type; - static void abs(const RMV& R, const XMV& X) { + static void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Abs<2-D>: RMV is not a Kokkos::View."); @@ -169,10 +171,10 @@ struct Abs { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Abs_Generic(R, X); + MV_Abs_Generic(space, R, X); } else { typedef std::int64_t index_type; - MV_Abs_Generic(R, X); + MV_Abs_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -191,6 +193,7 @@ struct Abs { // 
#define KOKKOSBLAS1_ABS_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { // #define KOKKOSBLAS1_ABS_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_ABS_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Abs< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View -void abs(const RMV& R, const XMV& X) { +template +void abs(const execution_space& space, const RMV& R, const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::abs: execution_space must be a valid Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "R is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: RMV must be accessible from execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: XMV must be accessible from execution space"); static_assert(std::is_same::value, "KokkosBlas::abs: R is const. " @@ -63,24 +70,36 @@ void abs(const RMV& R, const XMV& X) { // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. 
- typedef Kokkos::View< + using RMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RMV::device_type, Kokkos::MemoryTraits > - RMV_Internal; - typedef Kokkos::View< + typename RMV::device_type, Kokkos::MemoryTraits >; + using XMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > - XMV_Internal; + typename XMV::device_type, Kokkos::MemoryTraits >; RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Abs::abs(R_internal, X_internal); + Impl::Abs::abs(space, R_internal, X_internal); +} + +/// \brief R(i,j) = abs(X(i,j)) +/// +/// Replace each entry in R with the absolute value (magnitude) of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. 
+template +void abs(const RMV& R, const XMV& X) { + abs(typename RMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp index 9f66f8ee61..9fada3ff9c 100644 --- a/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_abs_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct abs_tpl_spec_avail { enum : bool { value = false }; }; From 40eac295809abe7a99221a09236f47c279d71bc1 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 18 Apr 2023 13:00:51 -0600 Subject: [PATCH 256/442] Fix #1798 --- test_common/KokkosKernels_TestUtils.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 4724621f46..cd0e651e1c 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -106,26 +106,26 @@ struct view_stride_adapter { if constexpr (rank == 1) { if constexpr (strided) { d_base = DViewBase(label, m, 2); - h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + h_base = Kokkos::create_mirror_view(d_base); d_view = Kokkos::subview(d_base, Kokkos::ALL(), 0); h_view = Kokkos::subview(h_base, Kokkos::ALL(), 0); } else { d_base = DViewBase(label, m); - h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + h_base = Kokkos::create_mirror_view(d_base); d_view = d_base; h_view = h_base; } } else { if constexpr (strided) { d_base = DViewBase(label, m, n, 2); - h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + h_base = Kokkos::create_mirror_view(d_base); d_view = Kokkos::subview(d_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); h_view = Kokkos::subview(h_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); } else { d_base = DViewBase(label, m, n); - 
h_base = Kokkos::create_mirror_view(Kokkos::HostSpace(), d_base); + h_base = Kokkos::create_mirror_view(d_base); d_view = d_base; h_view = h_base; } From f78e4eb747d87ba5cf6584a723fd93a81b8cd546 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 19 Apr 2023 08:42:50 -0600 Subject: [PATCH 257/442] sparse: specify memory space for coo2crs --- sparse/src/KokkosSparse_coo2crs.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index 7e3ce2ccc4..bb889330ec 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -52,6 +52,7 @@ class Coo2Crs { using UmapEqualToType = typename Kokkos::pod_equal_to; using UmapType = Kokkos::UnorderedMap; + using UmapMemorySpace = typename UmapType::device_type::memory_space; // Public for kokkos policies struct coo2crsRp1 {}; @@ -190,15 +191,14 @@ class Coo2Crs { // the host before using a host-callable method. // Setup a nrows length array of Unordered Maps - m_umaps = reinterpret_cast( - Kokkos::kokkos_malloc("m_umaps", m_nrows * sizeof(UmapType))); + m_umaps = + reinterpret_cast(Kokkos::kokkos_malloc( + "m_umaps", m_nrows * sizeof(UmapType))); using shallow_copy_to_device = - Kokkos::Impl::DeepCopy; + Kokkos::Impl::DeepCopy; UmapType **umap_ptrs = new UmapType *[m_nrows]; - // TODO: use host-level parallel_for with tag rowmapRp1 for (int i = 0; i < m_nrows; i++) { umap_ptrs[i] = new UmapType(arg_capacity_hint, arg_hasher, arg_equal_to); @@ -263,7 +263,7 @@ class Coo2Crs { delete umap_ptrs[i]; } delete[] umap_ptrs; - Kokkos::kokkos_free(m_umaps); + Kokkos::kokkos_free(m_umaps); } CrsType get_crsMat() { From 1d33c6f9b237116d72196d9a71289aab5392360b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 19 Apr 2023 08:44:03 -0600 Subject: [PATCH 258/442] scripts: Include OMP settings --- scripts/cm_test_all_sandia | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/cm_test_all_sandia 
b/scripts/cm_test_all_sandia index 44a3a9e795..7e5c135d7f 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -1183,6 +1183,9 @@ single_build_and_test() { fi echo " module purge" &>> reload_modules.sh echo " module load $compiler_modules_list" &>> reload_modules.sh + echo " export OMP_NUM_THREADS=$omp_num_threads" &>> reload_modules.sh + echo " export OMP_PROC_BIND=$omp_proc_bind" &>> reload_modules.sh + echo " export OMP_PLACES=$omp_places" &>> reload_modules.sh echo "" &>> reload_modules.sh chmod +x reload_modules.sh From 01547c4476b49d644bdd04e89ab8cf9ff137228b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 19 Apr 2023 10:41:37 -0600 Subject: [PATCH 259/442] Blas1: supporting execution space on BLAS1 kernels --- blas/impl/KokkosBlas1_mult_impl.hpp | 18 ++--- blas/impl/KokkosBlas1_mult_spec.hpp | 36 +++++---- blas/impl/KokkosBlas1_reciprocal_impl.hpp | 18 ++--- blas/impl/KokkosBlas1_reciprocal_spec.hpp | 36 +++++---- blas/impl/KokkosBlas1_scal_impl.hpp | 9 +-- blas/impl/KokkosBlas1_scal_mv_impl.hpp | 76 +++++++++---------- blas/impl/KokkosBlas1_scal_spec.hpp | 53 +++++++------ blas/impl/KokkosBlas1_update_impl.hpp | 18 ++--- blas/impl/KokkosBlas1_update_spec.hpp | 47 +++++++----- blas/src/KokkosBlas1_fill.hpp | 13 ++++ blas/src/KokkosBlas1_mult.hpp | 39 +++++++++- blas/src/KokkosBlas1_reciprocal.hpp | 29 ++++++- blas/src/KokkosBlas1_scal.hpp | 35 ++++++--- blas/src/KokkosBlas1_update.hpp | 48 ++++++++---- blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp | 2 +- .../KokkosBlas1_reciprocal_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 42 +++++----- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 75 ++++++++++-------- .../KokkosBlas1_update_tpl_spec_avail.hpp | 2 +- 19 files changed, 361 insertions(+), 237 deletions(-) diff --git a/blas/impl/KokkosBlas1_mult_impl.hpp b/blas/impl/KokkosBlas1_mult_impl.hpp index 754cce4d12..2a70e0caab 100644 --- a/blas/impl/KokkosBlas1_mult_impl.hpp +++ 
b/blas/impl/KokkosBlas1_mult_impl.hpp @@ -37,7 +37,6 @@ namespace Impl { template struct MV_MultFunctor { - typedef typename CMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -105,7 +104,6 @@ struct MV_MultFunctor { template struct V_MultFunctor { - typedef typename CV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -146,18 +144,17 @@ struct V_MultFunctor { /// /// C(i) = c * C(i) + ab * A(i) * B(i), subject to the usual BLAS /// update rules. -template -void V_Mult_Generic(typename CV::const_value_type& c, const CV& C, +template +void V_Mult_Generic(const execution_space& space, typename CV::const_value_type& c, const CV& C, typename AV::const_value_type& ab, const AV& A, const BV& B) { using Kokkos::ALL; using Kokkos::subview; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATC; - typedef typename CV::execution_space execution_space; const SizeType numRows = C.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (c == ATC::zero()) { if (ab == ATA::zero()) { @@ -193,13 +190,12 @@ void V_Mult_Generic(typename CV::const_value_type& c, const CV& C, /// /// C(i,j) = c * C(i,j) + ab * A(i) * B(i,j), subject to the usual /// BLAS update rules. 
-template -void MV_Mult_Generic(typename CMV::const_value_type& c, const CMV& C, +template +void MV_Mult_Generic(const execution_space& space, typename CMV::const_value_type& c, const CMV& C, typename AV::const_value_type& ab, const AV& A, const BMV& B) { typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATC; - typedef typename CMV::execution_space execution_space; if (C.extent(1) == 1) { auto C_0 = Kokkos::subview(C, Kokkos::ALL(), 0); @@ -207,12 +203,12 @@ void MV_Mult_Generic(typename CMV::const_value_type& c, const CMV& C, typedef decltype(C_0) CV; typedef decltype(B_0) BV; - V_Mult_Generic(c, C_0, ab, A, B_0); + V_Mult_Generic(space, c, C_0, ab, A, B_0); return; } const SizeType numRows = C.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (c == ATC::zero()) { if (ab == ATA::zero()) { diff --git a/blas/impl/KokkosBlas1_mult_spec.hpp b/blas/impl/KokkosBlas1_mult_spec.hpp index 1c0a88e8dc..25aeba86c2 100644 --- a/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_mult_spec.hpp @@ -27,7 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_eti_spec_avail { enum : bool { value = false }; }; @@ -44,6 +44,7 @@ struct mult_eti_spec_avail { #define KOKKOSBLAS1_MULT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct mult_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct mult_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = mult_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = mult_eti_spec_avail::value> struct Mult { - static void mult(const typename YMV::non_const_value_type& gamma, + static void mult(const execution_space& space, const typename YMV::non_const_value_type& gamma, const YMV& Y, const typename XMV::non_const_value_type& 
alpha, const AV& A, const XMV& X); @@ -110,13 +112,13 @@ struct Mult { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for YMV, AV, and XMV rank-2 Views. -template -struct Mult { +template +struct Mult { typedef typename YMV::size_type size_type; typedef typename YMV::non_const_value_type YMV_scalar; typedef typename XMV::non_const_value_type XMV_scalar; - static void mult(const YMV_scalar& gamma, const YMV& Y, + static void mult(const execution_space& space, const YMV_scalar& gamma, const YMV& Y, const XMV_scalar& alpha, const AV& A, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -160,22 +162,22 @@ struct Mult { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Mult_Generic(gamma, Y, alpha, A, X); + MV_Mult_Generic(space, gamma, Y, alpha, A, X); } else { - MV_Mult_Generic(gamma, Y, alpha, A, X); + MV_Mult_Generic(space, gamma, Y, alpha, A, X); } Kokkos::Profiling::popRegion(); } }; // Partial specialization for YV, AV, and XV rank-1 Views. -template -struct Mult { +template +struct Mult { typedef typename YV::size_type size_type; typedef typename YV::non_const_value_type YV_scalar; typedef typename XV::non_const_value_type XV_scalar; - static void mult(const YV_scalar& gamma, const YV& Y, const XV_scalar& alpha, + static void mult(const execution_space& space, const YV_scalar& gamma, const YV& Y, const XV_scalar& alpha, const AV& A, const XV& X) { // YV, AV, and XV must be Kokkos::View specializations. 
static_assert(Kokkos::is_view::value, @@ -212,9 +214,9 @@ struct Mult { const size_type numRows = Y.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Mult_Generic(gamma, Y, alpha, A, X); + V_Mult_Generic(space, gamma, Y, alpha, A, X); } else { - V_Mult_Generic(gamma, Y, alpha, A, X); + V_Mult_Generic(space, gamma, Y, alpha, A, X); } Kokkos::Profiling::popRegion(); } @@ -235,6 +237,7 @@ struct Mult { #define KOKKOSBLAS1_MULT_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Mult< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View struct MV_Reciprocal_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -69,7 +68,6 @@ struct MV_Reciprocal_Functor { // reciprocal(R(i,j)). template struct MV_ReciprocalSelf_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -100,7 +98,6 @@ struct MV_ReciprocalSelf_Functor { // reciprocal(X(i)). template struct V_Reciprocal_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -130,7 +127,6 @@ struct V_Reciprocal_Functor { // reciprocal(R(i)). 
template struct V_ReciprocalSelf_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -151,8 +147,8 @@ struct V_ReciprocalSelf_Functor { // Invoke the "generic" (not unrolled) multivector functor that // computes entry-wise reciprocalolute value. -template -void MV_Reciprocal_Generic(const RMV& R, const XMV& X) { +template +void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Reciprocal_Generic: RMV is not a Kokkos::View."); @@ -166,9 +162,8 @@ void MV_Reciprocal_Generic(const RMV& R, const XMV& X) { "KokkosBlas::Impl::" "MV_Reciprocal_Generic: XMV is not rank 2"); - typedef typename XMV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (R == X) { // if R and X are the same (alias one another) MV_ReciprocalSelf_Functor op(R); @@ -180,8 +175,8 @@ void MV_Reciprocal_Generic(const RMV& R, const XMV& X) { } // Variant of MV_Reciprocal_Generic for single vectors (1-D Views) R and X. 
-template -void V_Reciprocal_Generic(const RV& R, const XV& X) { +template +void V_Reciprocal_Generic(const execution_space& space, const RV& R, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Reciprocal_Generic: RV is not a Kokkos::View."); @@ -195,9 +190,8 @@ void V_Reciprocal_Generic(const RV& R, const XV& X) { "KokkosBlas::Impl::" "V_Reciprocal_Generic: XV is not rank 1"); - typedef typename XV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (R == X) { // if R and X are the same (alias one another) V_ReciprocalSelf_Functor op(R); diff --git a/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 1a40aa3542..ea937c167b 100644 --- a/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct reciprocal_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct reciprocal_eti_spec_avail { MEM_SPACE) \ template <> \ struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct reciprocal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = reciprocal_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = reciprocal_eti_spec_avail::value> struct Reciprocal { - static void reciprocal(const RMV& R, const XMV& X); + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Reciprocal for single vectors (1-D Views). 
-template -struct Reciprocal { +template +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const RMV& R, const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<1-D>: RMV is not a Kokkos::View."); @@ -127,20 +129,20 @@ struct Reciprocal { if (numRows < static_cast(INT_MAX)) { typedef int index_type; - V_Reciprocal_Generic(R, X); + V_Reciprocal_Generic(space, R, X); } else { typedef std::int64_t index_type; - V_Reciprocal_Generic(R, X); + V_Reciprocal_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Reciprocal { +template +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const RMV& R, const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<2-D>: RMV is not a Kokkos::View."); @@ -171,10 +173,10 @@ struct Reciprocal { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Reciprocal_Generic(R, X); + MV_Reciprocal_Generic(space, R, X); } else { typedef std::int64_t index_type; - MV_Reciprocal_Generic(R, X); + MV_Reciprocal_Generic(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -194,6 +196,7 @@ struct Reciprocal { #define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, 
\ Kokkos::View { #define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Reciprocal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View struct V_Scal_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -101,7 +100,6 @@ struct V_Scal_Functor { template struct V_Scal_Functor { - typedef typename RV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -134,8 +132,8 @@ struct V_Scal_Functor -void V_Scal_Generic(const RV& r, const AV& av, const XV& x, +template +void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, const XV& x, const SizeType startingColumn, int a = 2) { static_assert(Kokkos::is_view::value, "V_Scal_Generic: RV is not a Kokkos::View."); @@ -144,9 +142,8 @@ void V_Scal_Generic(const RV& r, const AV& av, const XV& x, static_assert(RV::rank == 1, "V_Scal_Generic: RV is not rank 1."); static_assert(XV::rank == 1, "V_Scal_Generic: XV is not rank 1."); - typedef typename RV::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { V_Scal_Functor op(r, x, av, startingColumn); diff --git a/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/blas/impl/KokkosBlas1_scal_mv_impl.hpp index aded2fd19c..58c4091fed 100644 --- a/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -45,7 +45,6 @@ namespace Impl { template struct MV_Scal_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -127,7 +126,6 @@ struct MV_Scal_Functor { template struct MV_Scal_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -198,7 +196,6 @@ struct MV_Scal_Functor struct 
MV_Scal_Unroll_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -259,7 +256,6 @@ struct MV_Scal_Unroll_Functor { template struct MV_Scal_Unroll_Functor { - typedef typename RMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -323,16 +319,15 @@ struct MV_Scal_Unroll_Functor -void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, +template +void MV_Scal_Unrolled(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, const SizeType startingColumn, int a = 2) { - typedef typename XMV::execution_space execution_space; if (a == 0) { MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S0", policy, op); return; } @@ -340,7 +335,7 @@ void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S1", policy, op); return; } @@ -348,7 +343,7 @@ void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S2", policy, op); return; } @@ -357,7 +352,7 @@ void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, MV_Scal_Unroll_Functor op( r, x, av, startingColumn); const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); Kokkos::parallel_for("KokkosBlas::Scal::MV::S3", policy, op); } @@ -375,12 +370,11 
@@ void MV_Scal_Unrolled(const RMV& r, const aVector& av, const XMV& x, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Generic(const RVector& r, const aVector& av, const XVector& x, +template +void MV_Scal_Generic(const execution_space& space, const RVector& r, const aVector& av, const XVector& x, const SizeType startingColumn, int a = 2) { - typedef typename XVector::execution_space execution_space; const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { MV_Scal_Functor op(r, x, av, @@ -421,8 +415,8 @@ void MV_Scal_Generic(const RVector& r, const aVector& av, const XVector& x, // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { +template +void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); #if KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL <= 2 @@ -439,7 +433,7 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled(R_cur, av, X_cur, j, a); + MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); } for (; j + 4 <= numCols; j += 4) { const std::pair rng(j, j + 4); @@ -448,7 +442,7 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled(R_cur, av, X_cur, j, a); + MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); } for (; j < numCols; ++j) { // RMV and XMV need to turn 1-D. 
@@ -457,7 +451,7 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(r_cur) RV; typedef decltype(x_cur) XV; - V_Scal_Generic(r_cur, av, x_cur, j, a); + V_Scal_Generic(space, r_cur, av, x_cur, j, a); } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL > 2 @@ -469,39 +463,39 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { typedef decltype(r_0) RV; typedef decltype(x_0) XV; - V_Scal_Generic(r_0, av, x_0, 0, a); + V_Scal_Generic(space, r_0, av, x_0, 0, a); break; } - case 2: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 3: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 4: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 5: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 6: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 7: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 8: MV_Scal_Unrolled(r, av, x, 0, a); break; - case 9: MV_Scal_Unrolled(r, av, x, 0, a); break; + case 2: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 3: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 4: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 5: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 6: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 7: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 8: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 9: MV_Scal_Unrolled(space, r, av, x, 0, a); break; case 10: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled(space, r, av, x, 0, a); break; case 11: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled(space, r, av, x, 0, a); break; case 12: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled(space, r, av, x, 0, a); break; case 13: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled(space, r, av, x, 0, a); break; case 14: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled(space, r, av, x, 0, a); break; case 15: - MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled(space, r, av, x, 0, a); break; case 16: - 
MV_Scal_Unrolled(r, av, x, 0, a); + MV_Scal_Unrolled(space, r, av, x, 0, a); break; - default: MV_Scal_Generic(r, av, x, 0, a); + default: MV_Scal_Generic(space, r, av, x, 0, a); } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL @@ -521,8 +515,8 @@ void MV_Scal_Invoke_Left(const RMV& r, const AV& av, const XMV& x, int a = 2) { // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Invoke_Right(const RMV& r, const aVector& av, const XMV& x, +template +void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); @@ -536,9 +530,9 @@ void MV_Scal_Invoke_Right(const RMV& r, const aVector& av, const XMV& x, RV r_0 = Kokkos::subview(r, Kokkos::ALL(), 0); XV x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); - V_Scal_Generic(r_0, av, x_0, a); + V_Scal_Generic(space, r_0, av, x_0, a); } else { - MV_Scal_Generic(r, av, x, a); + MV_Scal_Generic(space, r, av, x, a); } } diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index 8d85f5f1e9..bb80f30451 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct scal_eti_spec_avail { #define KOKKOSBLAS1_SCAL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct scal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -67,6 +68,7 @@ struct scal_eti_spec_avail { MEM_SPACE) \ template <> \ struct scal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct scal_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ 
Kokkos::MemoryTraits >, \ SCALAR, \ @@ -99,23 +102,23 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = scal_eti_spec_avail::value> + template ::value, + bool eti_spec_avail = scal_eti_spec_avail::value> struct Scal { - static void scal(const RV& R, const AV& A, const XV& X); + static void scal(const execution_space& space, const RV& R, const AV& A, const XV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Scal for single vectors (1-D Views). -template -struct Scal +struct Scal { typedef typename XV::non_const_value_type AV; typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const RV& R, const AV& alpha, const XV& X) { + static void scal(const execution_space& space, const RV& R, const AV& alpha, const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<1-D>: RV is not a Kokkos::View."); @@ -154,10 +157,10 @@ struct Scal(INT_MAX)) { typedef int index_type; - V_Scal_Generic(R, alpha, X, a); + V_Scal_Generic(space, R, alpha, X, a); } else { typedef typename XV::size_type index_type; - V_Scal_Generic(R, alpha, X, a); + V_Scal_Generic(space, R, alpha, X, a); } Kokkos::Profiling::popRegion(); } @@ -169,12 +172,12 @@ struct Scal -struct Scal { +template +struct Scal { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const RMV& R, const AV& av, const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& av, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D>: RMV is not a Kokkos::View."); @@ -212,10 +215,10 @@ struct Scal { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(R, av, X, a); + MV_Scal_Invoke_Left(space, R, av, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(R, av, 
X, a); + MV_Scal_Invoke_Left(space, R, av, X, a); } Kokkos::Profiling::popRegion(); } @@ -227,14 +230,14 @@ struct Scal { /// /// 1. R(i,j) = a*X(i,j) for a in -1,0,1 /// 2. R(i,j) = alpha*X(i,j) -template -struct Scal +struct Scal { typedef typename XMV::non_const_value_type AV; typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const RMV& R, const AV& alpha, const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& alpha, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D, AV=scalar>: RMV is not a Kokkos::View."); @@ -275,12 +278,12 @@ struct Scal(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(R, alpha, X, a); + MV_Scal_Invoke_Left(space, R, alpha, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(R, alpha, X, a); + MV_Scal_Invoke_Left(space, R, alpha, X, a); } Kokkos::Profiling::popRegion(); } @@ -299,6 +302,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -309,6 +313,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -326,6 +331,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ Kokkos::View >, \ 2, false, true>; \ extern template struct Scal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -347,6 +354,7 @@ struct Scal, \ Kokkos::MemoryTraits >, \ Kokkos::View >, \ 2, false, true>; \ template struct Scal< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ diff --git a/blas/impl/KokkosBlas1_update_impl.hpp b/blas/impl/KokkosBlas1_update_impl.hpp index 99c80f8d3c..d4abcc0ba0 100644 --- a/blas/impl/KokkosBlas1_update_impl.hpp +++ b/blas/impl/KokkosBlas1_update_impl.hpp @@ -43,7 +43,6 @@ namespace Impl { template struct MV_Update_Functor { - typedef typename ZMV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -213,7 +212,6 @@ struct MV_Update_Functor { template struct 
V_Update_Functor { - typedef typename ZV::execution_space execution_space; typedef SizeType size_type; typedef Kokkos::ArithTraits ATS; @@ -316,8 +314,9 @@ struct V_Update_Functor { // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding multivector entry. -template -void MV_Update_Generic(const typename XMV::non_const_value_type& alpha, +template +void MV_Update_Generic(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -347,9 +346,8 @@ void MV_Update_Generic(const typename XMV::non_const_value_type& alpha, "KokkosBlas::Impl::MV_Update_Generic: " "XMV, YMV, and ZMV must have rank 2."); - typedef typename XMV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { if (b == 0) { @@ -417,8 +415,9 @@ void MV_Update_Generic(const typename XMV::non_const_value_type& alpha, // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding vector entry. 
-template -void V_Update_Generic(const typename XV::non_const_value_type& alpha, +template +void V_Update_Generic(const execution_space& space, + const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, @@ -448,9 +447,8 @@ void V_Update_Generic(const typename XV::non_const_value_type& alpha, "KokkosBlas::Impl::V_Update_Generic: " "XV, YV, and ZV must have rank 1."); - typedef typename XV::execution_space execution_space; const SizeType numRows = X.extent(0); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); if (a == 0) { if (b == 0) { diff --git a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index 0b33e5224e..aa573b4058 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ b/blas/impl/KokkosBlas1_update_spec.hpp @@ -27,7 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_eti_spec_avail { enum : bool { value = false }; }; @@ -45,6 +45,7 @@ struct update_eti_spec_avail { MEM_SPACE) \ template <> \ struct update_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -68,6 +69,7 @@ struct update_eti_spec_avail { MEM_SPACE) \ template <> \ struct update_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -100,11 +102,12 @@ namespace Impl { /// Z(i,j) = alpha*X(i,j) + beta*Y(i,j) + gamma*Z(i,j), /// /// with special cases for alpha, beta, or gamma = 0. 
-template ::value, - bool eti_spec_avail = update_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = update_eti_spec_avail::value> struct Update { - static void update(const typename XMV::non_const_value_type& alpha, + static void update(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -114,14 +117,15 @@ struct Update { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for XMV, YMV, and ZMV rank-2 Views. -template -struct Update { +template +struct Update { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; - static void update(const typename XMV::non_const_value_type& alpha, + static void update(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -194,23 +198,23 @@ struct Update { if (numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - V_Update_Generic(alpha, X_0, beta, Y_0, gamma, Z_0, a, b, + V_Update_Generic(space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, c); } else { typedef typename XMV::size_type index_type; - V_Update_Generic(alpha, X_0, beta, Y_0, gamma, Z_0, a, b, + V_Update_Generic(space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, c); } } else { if (numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Update_Generic(alpha, X, beta, Y, gamma, + MV_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XMV::size_type index_type; - MV_Update_Generic(alpha, X, beta, Y, gamma, + MV_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } } @@ -219,14 +223,15 @@ struct Update { }; // Partial specialization for XV, YV, and ZV rank-1 Views. 
-template -struct Update { +template +struct Update { typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; - static void update(const typename XV::non_const_value_type& alpha, + static void update(const execution_space& space, + const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, const typename ZV::non_const_value_type& gamma, @@ -291,11 +296,11 @@ struct Update { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - V_Update_Generic(alpha, X, beta, Y, gamma, Z, a, + V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XV::size_type index_type; - V_Update_Generic(alpha, X, beta, Y, gamma, Z, a, + V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, b, c); } Kokkos::Profiling::popRegion(); @@ -318,6 +323,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -331,6 +337,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -352,6 +359,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -365,6 +373,7 @@ struct Update { #define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Update< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/src/KokkosBlas1_fill.hpp b/blas/src/KokkosBlas1_fill.hpp index 37aebb3c5d..a7186c466a 100644 --- a/blas/src/KokkosBlas1_fill.hpp +++ b/blas/src/KokkosBlas1_fill.hpp @@ -21,6 +21,19 @@ namespace 
KokkosBlas { +/// \brief Fill the multivector or single vector X with the given value. +/// +/// \tparam XMV 1-D or 2-D output View +/// +/// \param X [out] Output View (1-D or 2-D). +/// \param val [in] Value with which to fill the entries of X. +template +void fill(const execution_space& space, const XMV& X, const typename XMV::non_const_value_type& val) { + Kokkos::Profiling::pushRegion("KokkosBlas::fill"); + Kokkos::deep_copy(space, X, val); + Kokkos::Profiling::popRegion(); +} + /// \brief Fill the multivector or single vector X with the given value. /// /// \tparam XMV 1-D or 2-D output View diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index e08409e9aa..a13a2ebd17 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -37,15 +37,26 @@ namespace KokkosBlas { /// \param X [in] The X vector. /// /// \return Y = gamma * Y + alpha * A * X. -template -void mult(typename YMV::const_value_type& gamma, const YMV& Y, +template +void mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::mult: execution_space must be a valid Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "Y is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: YMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "A is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); + static_assert(Kokkos::is_view::value, + "KokkosBlas::mult: " + "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: XMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::mult: Y is const.
" @@ -95,8 +106,28 @@ void mult(typename YMV::const_value_type& gamma, const YMV& Y, AV_Internal A_internal = A; XMV_Internal X_internal = X; - Impl::Mult::mult( - gamma, Y_internal, alpha, A_internal, X_internal); + Impl::Mult::mult( + space, gamma, Y_internal, alpha, A_internal, X_internal); +} + +/// \brief Element wise multiplication of two vectors: +/// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] +/// +/// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. +/// \tparam AV Type of the second vector A; a 1-D Kokkos::View. +/// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. +/// +/// \param gamma [in] The scalar to apply to Y. +/// \param Y [in/out] The Y vector. +/// \param alpha [in] The scalar to apply to A. +/// \param A [in] The vector to apply to X. +/// \param X [in] The X vector. +/// +/// \return Y = gamma * Y + alpha * A * X. +template +void mult(typename YMV::const_value_type& gamma, const YMV& Y, + typename AV::const_value_type& alpha, const AV& A, const XMV& X) { + mult(typename YMV::execution_space{}, gamma, Y, alpha, A, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index 19624d11c9..62780faaa8 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -32,14 +32,20 @@ namespace KokkosBlas { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. 
-template -void reciprocal(const RMV& R, const XMV& X) { +template +void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::reciprocal: execution_space must be a valid Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "R is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::reciprocal: R is const. " @@ -80,8 +86,23 @@ void reciprocal(const RMV& R, const XMV& X) { RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Reciprocal::reciprocal(R_internal, - X_internal); + Impl::Reciprocal::reciprocal(space, + R_internal, + X_internal); +} + +/// \brief R(i,j) = reciprocal(X(i,j)) +/// +/// Replace each entry in R with the absolute value (magnitude), of the +/// reciprocal of the corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV.
+template +void reciprocal(const RMV& R, const XMV& X) { + reciprocal(typename RMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 8b61936460..996a78f252 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -29,14 +29,22 @@ namespace KokkosBlas { -template -void scal(const RMV& R, const AV& a, const XMV& X) { +template +void scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::scal: execution_space must be a valid Kokkos execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "R is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: RMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: XMV must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::scal: XMV must be assignable to RMV"); static_assert(std::is_same::value, "KokkosBlas::scal: R is const. " @@ -67,23 +75,26 @@ void scal(const RMV& R, const AV& a, const XMV& X) { // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. AV may be either a rank-1 View, or a scalar // value. 
- typedef Kokkos::View > - RMV_Internal; - typedef Kokkos::View >; + using XMV_Internal = Kokkos::View > - XMV_Internal; - typedef typename KokkosKernels::Impl::GetUnifiedScalarViewType< - AV, XMV_Internal, true>::type AV_Internal; + Kokkos::MemoryTraits >; + using AV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType< + AV, XMV_Internal, true>::type; RMV_Internal R_internal = R; AV_Internal a_internal = a; XMV_Internal X_internal = X; - Impl::Scal::scal( - R_internal, a_internal, X_internal); + Impl::Scal::scal( + space, R_internal, a_internal, X_internal); +} + +template +void scal(const RMV& R, const AV& a, const XMV& X) { + scal(typename RMV::execution_space{}, R, a, X); } /// diff --git a/blas/src/KokkosBlas1_update.hpp b/blas/src/KokkosBlas1_update.hpp index 741dc508fb..5a37482fb2 100644 --- a/blas/src/KokkosBlas1_update.hpp +++ b/blas/src/KokkosBlas1_update.hpp @@ -25,6 +25,7 @@ namespace KokkosBlas { /// \brief Compute Z := alpha*X + beta*Y + gamma*Z. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. /// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as XMV. @@ -32,10 +33,13 @@ namespace KokkosBlas { /// the same rank as XMV and YMV, and it must make sense to add up /// the entries of XMV and YMV and assign them to the entries of /// ZMV. 
-template -void update(const typename XMV::non_const_value_type& alpha, const XMV& X, +template +void update(const execution_space& space, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::update: execution_space must be a valid Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "X is not a Kokkos::View."); @@ -45,6 +49,12 @@ void update(const typename XMV::non_const_value_type& alpha, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "Z is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: XMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: YMV must be accessible from execution_space."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: ZMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::update: Z is const. " @@ -74,27 +84,24 @@ void update(const typename XMV::non_const_value_type& alpha, const XMV& X, // Create unmanaged versions of the input Views. XMV, YMV, and ZMV // may be rank 1 or rank 2, but they must all have the same rank. 
- typedef Kokkos::View< + using XMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > - XMV_Internal; + typename XMV::device_type, Kokkos::MemoryTraits >; - typedef Kokkos::View< + using YMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YMV::device_type, Kokkos::MemoryTraits > - YMV_Internal; + typename YMV::device_type, Kokkos::MemoryTraits >; - typedef Kokkos::View< + using ZMV_Internal = Kokkos::View< typename std::conditional::type, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename ZMV::device_type, Kokkos::MemoryTraits > - ZMV_Internal; + typename ZMV::device_type, Kokkos::MemoryTraits >; XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; @@ -110,10 +117,25 @@ void update(const typename XMV::non_const_value_type& alpha, const XMV& X, << endl; #endif // KOKKOSKERNELS_PRINT_DEMANGLED_TYPE_INFO - return Impl::Update::update( - alpha, X_internal, beta, Y_internal, gamma, Z_internal); + Impl::Update::update( + space, alpha, X_internal, beta, Y_internal, gamma, Z_internal); } +/// \brief Compute Z := alpha*X + beta*Y + gamma*Z. +/// +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// \tparam ZMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV and YMV, and it must make sense to add up +/// the entries of XMV and YMV and assign them to the entries of +/// ZMV. 
+template +void update(const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename YMV::non_const_value_type& beta, const YMV& Y, + const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { + update(typename ZMV::execution_space{}, alpha, X, beta, Y, gamma, Z); +} } // namespace KokkosBlas #endif // KOKKOSBLAS1_UPDATE_HPP_ diff --git a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp index 5b0b5662ba..3924e0da21 100644 --- a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp index 5879131808..636d3fe61f 100644 --- a/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_reciprocal_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct reciprocal_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index dccc20b9ac..571682b5b5 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_tpl_spec_avail { enum : bool { value = false }; }; @@ -59,44 +59,48 @@ KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // double -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - 
template \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct scal_tpl_spec_avail< \ - Kokkos::View, \ + EXECSPACE, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ }; KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaSpace) + Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaSpace) + Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) + Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) + Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE,\ + MEMSPACE) \ + template <> \ struct scal_tpl_spec_avail< \ + EXECSPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -107,15 +111,15 @@ KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, }; KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - 
Kokkos::Experimental::HIPSpace) + Kokkos::HIP, Kokkos::HIPSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIP, Kokkos::HIPSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIP, Kokkos::HIPSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::HIP, Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 8b3d8e6d95..0050923a47 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -248,10 +248,11 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEMSPACE, \ + SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, MEMSPACE \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Scal< \ + EXECSPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR_TYPE, \ @@ -259,6 +260,7 @@ namespace Impl { Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View, \ Kokkos::MemoryTraits > \ @@ -270,7 +272,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void scal(const RV& R, const AS& alpha, const XV& X) { \ + static void scal(const execution_space& space, const RV& R, \ + const AS& alpha, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ const size_type numElems = X.extent(0); \ @@ -281,9 +284,18 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ s.handle, N, reinterpret_cast(&alpha), \ reinterpret_cast(R.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, pointer_mode)); \ } else { \ Scal::scal(R, alpha, X); \ } \ @@ -291,47 +303,48 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ + MEMSPACE, \ ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) + LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE \ ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE \ ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE \ ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ - MEMSPACE, ETI_SPEC_AVAIL) - -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) - 
-KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) - -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) - -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) -KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) + +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp index 76eb7cb37a..55e1383ed7 100644 --- a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_tpl_spec_avail { enum : bool { value = false }; }; From 
6016771b3a655ebff20c4e78d7df38926c112b84 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 21 Apr 2023 09:28:58 -0600 Subject: [PATCH 260/442] sparse: CooMatrix - Rework CooMatrix template parameters - Document all public members and type with //! --- docs/developer/apidocs/sparse.rst | 2 +- graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 3 - sparse/src/KokkosSparse_CooMatrix.hpp | 85 +++++++++++++------- sparse/src/KokkosSparse_coo2crs.hpp | 19 +++-- sparse/src/KokkosSparse_crs2coo.hpp | 42 ++++++---- 5 files changed, 91 insertions(+), 60 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 15509e90a0..8726996e6b 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -29,7 +29,7 @@ ccs2crs coo2crs ------- .. doxygenfunction:: KokkosSparse::coo2crs(DimType, DimType, RowViewType, ColViewType, DataViewType) -.. doxygenfunction:: KokkosSparse::coo2crs(KokkosSparse::CooMatrix &cooMatrix) +.. doxygenfunction:: KokkosSparse::coo2crs(KokkosSparse::CooMatrix &cooMatrix) crs2coo ------- diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index ff4382c930..a359956a23 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -828,8 +828,6 @@ struct D2_MIS_FixedPriority { Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; - int numIter = 0; - (void)numIter; while (workRemain) { // do another iteration Kokkos::parallel_for( @@ -854,7 +852,6 @@ struct D2_MIS_FixedPriority { // Finally, flip the worklists std::swap(worklist1, worklist2); workRemain = newWorkRemain; - numIter++; } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. 
diff --git a/sparse/src/KokkosSparse_CooMatrix.hpp b/sparse/src/KokkosSparse_CooMatrix.hpp index 631283cfab..42f27abe4c 100644 --- a/sparse/src/KokkosSparse_CooMatrix.hpp +++ b/sparse/src/KokkosSparse_CooMatrix.hpp @@ -34,39 +34,68 @@ namespace KokkosSparse { /// /// \brief Coordinate format implementation of a sparse matrix. /// -/// \tparam RowView The type of row index view. -/// \tparam ColumnView The type of column index view. -/// \tparam DataView The type of data view. -/// \tparam Device The Kokkos Device type. +/// \tparam ScalarType The type of scalar entries in the sparse matrix. +/// \tparam OrdinalType The type of index entries in the sparse matrix. +/// \tparam Device The Kokkos Device type. /// \tparam MemoryTraits Traits describing how Kokkos manages and /// accesses data. The default parameter suffices for most users. -/// /// "Coo" stands for "coordinate format". -template +template ::size_type> class CooMatrix { public: - using execution_space = typename Device::execution_space; - using memory_space = typename Device::memory_space; - using data_type = typename DataView::non_const_value_type; - using const_data_type = typename DataView::const_value_type; - using row_type = typename RowView::non_const_value_type; - using const_row_type = typename RowView::const_value_type; - using column_type = typename ColumnView::non_const_value_type; - using const_column_type = typename ColumnView::const_value_type; - using size_type = size_t; - - static_assert(std::is_integral_v, - "RowView::value_type must be an integral."); - static_assert(std::is_integral_v, - "ColumnView::value_type must be an integral."); + //! Type of each value in the matrix + using scalar_type = ScalarType; + //! Type of each value in the const matrix + using const_scalar_type = const std::remove_const_t; + //! Non constant scalar type + using non_const_scalar_type = std::remove_const_t; + //! Type of each index in the matrix + using ordinal_type = OrdinalType; + //! 
Type of each index in the const matrix + using const_ordinal_type = const std::remove_const_t; + //! Non constant ordinal type + using non_const_ordinal_type = std::remove_const_t; + //! Type of each row index in the matrix + using row_type = ordinal_type; + //! Type of each column index in the matrix + using column_type = ordinal_type; + //! Type of the Kokkos::Device + using device_type = Device; + //! Type of the Kokkos::Device::execution_space + using execution_space = typename device_type::execution_space; + //! Type of the Kokkos::Device::memory_space + using memory_space = typename device_type::memory_space; + //! Type of the Kokkos::MemoryTraits + using memory_traits = MemoryTraits; + //! Type of all integral class members + using size_type = SizeType; + + static_assert(std::is_integral_v, + "OrdinalType must be an integral."); + + using row_view = + Kokkos::View; + using column_view = Kokkos::View; + using scalar_view = Kokkos::View; + + using const_type = CooMatrix; private: size_type m_num_rows, m_num_cols; public: - RowView row; - ColumnView col; - DataView data; + //! The row indexes of the matrix + row_view row; + //! The column indexes of the matrix + column_view col; + //! The scalar values of the matrix + scalar_view data; /// \brief Default constructor; constructs an empty sparse matrix. KOKKOS_INLINE_FUNCTION @@ -85,8 +114,8 @@ class CooMatrix { /// \param col_in [in] The column indexes. /// \param data_in [in] The values. // clang-format on - CooMatrix(size_type nrows, size_type ncols, RowView row_in, ColumnView col_in, - DataView data_in) + CooMatrix(size_type nrows, size_type ncols, row_view row_in, + column_view col_in, scalar_view data_in) : m_num_rows(nrows), m_num_cols(ncols), row(row_in), @@ -108,11 +137,7 @@ class CooMatrix { KOKKOS_INLINE_FUNCTION size_type numRows() const { return m_num_rows; } //! The number of stored entries in the sparse matrix, including zeros.
- KOKKOS_INLINE_FUNCTION size_type nnz() const { - assert(data.extent(0) == row.extent(0) == col.extent(0) && - "Error lengths of RowView != ColView != DataView"); - return data.extent(0); - } + KOKKOS_INLINE_FUNCTION size_type nnz() const { return data.extent(0); } }; /// \class is_coo_matrix diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index bb889330ec..6030747098 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -325,18 +325,17 @@ auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, // clang-format off /// /// \brief Blocking function that converts a CooMatrix into a CrsMatrix. Values are summed. -/// \tparam DimType The dimension type -/// \tparam RowViewType The row array view type -/// \tparam ColViewType The column array view type -/// \tparam DataViewType The data array view type -/// \tparam DeviceType The cooMatrix::execution_space +/// \tparam ScalarType The `KokkosSparse::CooMatrix::scalar_type` +/// \tparam OrdinalType The KokkosSparse::CooMatrix::ordinal_type +/// \tparam DeviceType The KokkosSparse::CooMatrix::device_type +/// \tparam MemoryTraits The KokkosSparse::CooMatrix::memory_traits +/// \tparam SizeType The KokkosSparse::CooMatrix::size_type /// \param cooMatrix The sparse matrix stored in coordinate ("Coo") format. /// \return A KokkosSparse::CrsMatrix. 
-// clang-format on -template -auto coo2crs(KokkosSparse::CooMatrix &cooMatrix) { +template +auto coo2crs(KokkosSparse::CooMatrix &cooMatrix) { return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row, cooMatrix.col, cooMatrix.data); } diff --git a/sparse/src/KokkosSparse_crs2coo.hpp b/sparse/src/KokkosSparse_crs2coo.hpp index 58984f3bab..a19c8a164a 100644 --- a/sparse/src/KokkosSparse_crs2coo.hpp +++ b/sparse/src/KokkosSparse_crs2coo.hpp @@ -27,22 +27,32 @@ template class Crs2Coo { private: - using non_const_ordinal_type = std::remove_const_t; - using non_const_size_type = std::remove_const_t; - using coo_row_view = - typename Kokkos::View; - using coo_col_view = coo_row_view; - using coo_data_view = typename ValViewType::non_const_type; - using coo_type = - CooMatrix; + using scalar_type = typename ValViewType::value_type; + using const_scalar_type = const std::remove_const_t; + using non_const_scalar_type = std::remove_const_t; + + using ordinal_type = OrdinalType; + using const_ordinal_type = const std::remove_const_t; + using non_const_ordinal_type = std::remove_const_t; + + using size_type = SizeType; + using const_size_type = const std::remove_const_t; + using non_const_size_type = std::remove_const_t; + + using device_type = DeviceType; + + using row_view = typename Kokkos::View; + using col_view = row_view; + using non_const_coo_data_view = typename ValViewType::non_const_type; + using coo_type = CooMatrix; non_const_ordinal_type m_nrows; non_const_ordinal_type m_ncols; non_const_size_type m_nnz; - coo_data_view m_data; - coo_col_view m_col; - coo_row_view m_row; + non_const_coo_data_view m_data; + col_view m_col; + row_view m_row; ValViewType m_vals; RowMapViewType m_row_map; @@ -60,12 +70,12 @@ class Crs2Coo { m_vals(vals), m_row_map(row_map), m_col_ids(col_ids) { - m_data = coo_data_view( + m_data = non_const_coo_data_view( Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_data"), nnz); - m_col = coo_col_view( - 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_col"), nnz); - m_row = coo_row_view( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_row"), nnz); + m_col = + col_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_col"), nnz); + m_row = + row_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_row"), nnz); copy_tp1_pt policy(m_nrows, 1, 1); { From 6e150ac9d18f63e7e86fb9c042008a5625b713e6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 21 Apr 2023 09:33:44 -0600 Subject: [PATCH 261/442] sparse: CooMatrix - Include remaining public-facing types. --- sparse/src/KokkosSparse_CooMatrix.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_CooMatrix.hpp b/sparse/src/KokkosSparse_CooMatrix.hpp index 42f27abe4c..ccedc85530 100644 --- a/sparse/src/KokkosSparse_CooMatrix.hpp +++ b/sparse/src/KokkosSparse_CooMatrix.hpp @@ -76,13 +76,16 @@ class CooMatrix { static_assert(std::is_integral_v, "OrdinalType must be an integral."); + //! The type of the row index view in the matrix using row_view = Kokkos::View; + //! The type of the column index view in the matrix using column_view = Kokkos::View; + //! The type of the scalar values view in the matrix using scalar_view = Kokkos::View; - + //! The type of a constant CooMatrix using const_type = CooMatrix; From 4c6d55b1110e24e459afb4d7476b68fd730ba9eb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 21 Apr 2023 09:53:41 -0600 Subject: [PATCH 262/442] docs: Update contrib --- docs/developer/contrib.rst | 55 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst index c0117126bd..2e39f21732 100644 --- a/docs/developer/contrib.rst +++ b/docs/developer/contrib.rst @@ -24,6 +24,34 @@ In general, we prefer that the prototype has the doxygen style comment rather th KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &); +.. 
code-block:: + :caption: Type Doxygen Style Example + + /// \class CooMatrix + /// + /// \brief Coordinate format implementation of a sparse matrix. + /// + /// \tparam ScalarType The type of scalar entries in the sparse matrix. + /// \tparam OrdinalType The type of index entries in the sparse matrix. + /// \tparam Device The Kokkos Device type. + /// "Coo" stands for "coordinate format". + template + class CooMatrix { + public: + //! Type of each value in the matrix + using scalar_type = ScalarType; + + private: + size_type m_num_rows, m_num_cols; + + public: + //! The data in the matrix + scalar_type data; + + /// \brief Default constructor; constructs an empty sparse matrix. + KOKKOS_INLINE_FUNCTION + CooMatrix() : m_num_rows(0), m_num_cols(0) {} + **NOTE:** To have vscode generate the "\\\\\\" style stubs: 1. install the C/C++ IntelliSense, debugging, and code browsing extention. @@ -32,6 +60,33 @@ In general, we prefer that the prototype has the doxygen style comment rather th 3. place your cursor on the line above `template ...` and type "\\\\\\". +Including your documentation with directives +-------------------------------------------- +Rather than have the documentation generation system default to generating documentation for the entire code base, +we opt-in to what we would like to include in the generated documentation. To opt-in, simply place the publicly facing +function signature or the class name in the appropriate ReStructuredText file. For example, to document a sparse +function and class open up kokkos-kernels/docs/developer/apidocs/sparse.rst: + +.. code-block:: + :caption: Function signature example + + coo2crs + ------- + .. doxygenfunction:: KokkosSparse::coo2crs(DimType, DimType, RowViewType, ColViewType, DataViewType) + .. doxygenfunction:: KokkosSparse::coo2crs(KokkosSparse::CooMatrix &cooMatrix) + +Note that only the signature is required. One may specify the parameter names and any default values, but this is not required. + +.. 
code-block:: + :caption: User defined type example + + coomatrix + --------- + .. doxygenclass:: KokkosSparse::CooMatrix + :members: + +For a full list of available directives, see https://breathe.readthedocs.io/en/latest/. + Library policies ---------------- From 394409fb4881c179d5f3f4bc1dd3c8191cce5ea5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 21 Apr 2023 09:58:04 -0600 Subject: [PATCH 263/442] docs: build_doc --- docs/developer/build_doc.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/developer/build_doc.rst b/docs/developer/build_doc.rst index dd3d357286..6ccd0dccf7 100644 --- a/docs/developer/build_doc.rst +++ b/docs/developer/build_doc.rst @@ -15,4 +15,6 @@ Building Developer Documentation cmake -DKokkosKernels_ENABLE_DOCS:BOOL=ON /path/to/kokkos-kernels make Doxygen make Sphinx - open build/docs/docs/sphinx/index.html \ No newline at end of file + open build/docs/docs/sphinx/index.html + +Alternatively, pass the --enable-docs option to cm_generate_makefile.bash. 
From 4ce5d2a4e00d9a95613971a1b18a952cf92149ea Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 21 Apr 2023 11:26:04 -0600 Subject: [PATCH 264/442] sparse: coo2crs and crs2coo updates - Use a Bitset to track processes tuples - Use a view of bools to track full unordered maps - Use the correct index types when iterating up to nnz - Allow and test for m = n = 0 in coo2crs --- sparse/src/KokkosSparse_coo2crs.hpp | 24 +++++++++++++++--------- sparse/src/KokkosSparse_crs2coo.hpp | 2 +- sparse/unit_test/Test_Sparse_coo2crs.hpp | 2 ++ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index 6030747098..b3e8baaa04 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -64,21 +64,23 @@ class Coo2Crs { using copyTp1MemberType = typename copyTp1Pt::member_type; private: - using BmapViewType = Kokkos::View; - using CrsRowMapView = Kokkos::View; using CrsRowMapAtomicView = Kokkos::View>; using CrsValuesView = Kokkos::View; using CrsColIdsView = Kokkos::View; + // Needed since Kokkos::Bitset cannot be accessed on the host + using BmapViewType = Kokkos::View; + using Bitset = Kokkos::Bitset; + CrsRowMapView m_crs_row_map; CrsRowMapAtomicView m_crs_row_map_tmp; CrsValuesView m_crs_vals; CrsColIdsView m_crs_col_ids; UmapType *m_umaps; BmapViewType m_capacity_bmap; - BmapViewType m_tuple_bmap; + Bitset m_tuple_bmap; UmapOpType m_insert_op; CrsOT m_nrows; CrsOT m_ncols; @@ -94,7 +96,7 @@ class Coo2Crs { void operator()(const coo2crsRp1 &, const int &idx) const { auto i = m_row(idx); auto j = m_col(idx); - auto is_inserted = m_tuple_bmap(idx); + auto is_inserted = m_tuple_bmap.test(idx); if (i >= m_nrows || j >= m_ncols) { Kokkos::abort("tuple is out of bounds"); @@ -102,7 +104,7 @@ class Coo2Crs { if (m_umaps[i].insert(j, m_data(idx), m_insert_op).failed()) { m_capacity_bmap(i) = true; // hmap at index i reached capacity } else { - m_tuple_bmap(idx) = 
true; // checklist of inserted tuples + m_tuple_bmap.set(idx); // checklist of inserted tuples } } } @@ -139,7 +141,7 @@ class Coo2Crs { auto cpy_len = cpy_end - cpy_beg; Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, cpy_len), - [&](const int &i) { + [&](const CrsOT &i) { auto offset = i + cpy_beg; m_crs_vals(offset) = m_umaps[i].value_at(i); m_crs_col_ids(offset) = m_umaps[i].key_at(i); @@ -155,15 +157,19 @@ class Coo2Crs { m_col = col; m_data = data; - typename UmapType::size_type arg_capacity_hint = m_n_tuples / m_nrows / 4; + typename UmapType::size_type arg_capacity_hint = + m_nrows > 0 ? (m_n_tuples / m_nrows / 4) : 16; typename UmapType::hasher_type arg_hasher; typename UmapType::equal_to_type arg_equal_to; arg_capacity_hint = arg_capacity_hint < 16 ? 16 : arg_capacity_hint; + // Record of whether capacity was reached in any unordered map m_capacity_bmap = BmapViewType("m_capacity_bmap", m_nrows); typename BmapViewType::HostMirror m_capacity_bmap_mirror = Kokkos::create_mirror_view(m_capacity_bmap); - m_tuple_bmap = BmapViewType("m_tuple_bmap", m_n_tuples); + + // Track which tuples have been processed + m_tuple_bmap = Bitset(m_n_tuples); m_crs_row_map = CrsRowMapView( Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map"), @@ -314,7 +320,7 @@ auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, if (row.extent(0) != col.extent(0) || row.extent(0) != data.extent(0)) Kokkos::abort("row.extent(0) = col.extent(0) = data.extent(0) required."); - if (m <= 0 || n <= 0) Kokkos::abort("m > 0 and n > 0 required."); + if (m < 0 || n < 0) Kokkos::abort("m >= 0 and n >= 0 required."); using Coo2crsType = Impl::Coo2Crs; diff --git a/sparse/src/KokkosSparse_crs2coo.hpp b/sparse/src/KokkosSparse_crs2coo.hpp index a19c8a164a..8292b26250 100644 --- a/sparse/src/KokkosSparse_crs2coo.hpp +++ b/sparse/src/KokkosSparse_crs2coo.hpp @@ -99,7 +99,7 @@ class Crs2Coo { auto row_end = row_start + row_len; 
Kokkos::parallel_for(Kokkos::TeamVectorRange(member, row_start, row_end), - [&](const int &id) { + [&](const size_type &id) { m_data(id) = m_vals(id); m_col(id) = m_col_ids(id); m_row(id) = i; diff --git a/sparse/unit_test/Test_Sparse_coo2crs.hpp b/sparse/unit_test/Test_Sparse_coo2crs.hpp index 9910dd876b..e185512d93 100644 --- a/sparse/unit_test/Test_Sparse_coo2crs.hpp +++ b/sparse/unit_test/Test_Sparse_coo2crs.hpp @@ -242,6 +242,8 @@ TEST_F(TestCategory, sparse_coo2crs) { UINT32_MAX; std::srand(ticks); + doAllCoo2Crs(0, 0); + // Square cases for (size_t i = 1; i < 256; i *= 4) { size_t dim = (std::rand() % 511) + 1; From 8a35f819ac93139a30f8675108f2be103135c709 Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:43:24 -0600 Subject: [PATCH 265/442] Update docs/developer/contrib.rst --- docs/developer/contrib.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst index 2e39f21732..d9b7d31256 100644 --- a/docs/developer/contrib.rst +++ b/docs/developer/contrib.rst @@ -54,7 +54,7 @@ In general, we prefer that the prototype has the doxygen style comment rather th **NOTE:** To have vscode generate the "\\\\\\" style stubs: -1. install the C/C++ IntelliSense, debugging, and code browsing extention. +1. install the C/C++ IntelliSense, debugging, and code browsing extension. 2. go to Settings, Extensions, C/C++, Doxygen Documentation Generator Settings, and ensure the setting for Doxdocgen is "\\\\\\". 
From 4ad4962c5cc7504668c920fb5584f5f459be9ffc Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:48:06 -0600 Subject: [PATCH 266/442] Update docs/developer/apidocs/sparse.rst --- docs/developer/apidocs/sparse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index bee4b2d89c..8347afcf34 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -65,7 +65,7 @@ block_gauss_seidel par_ilut -------- .. doxygenfunction:: par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, URowMapType& U_rowmap) -.. doxygenfunction:: par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, UEntriesType& U_entries, UValuesType& U_values, bool deterministic) +.. doxygenfunction:: par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, UEntriesType& U_entries, UValuesType& U_values) .. 
doxygenclass:: KokkosSparse::PAR_ILUTHandle :members: From 2d3c2c4f4884e5ed46dc46410d02824cdb5a29ad Mon Sep 17 00:00:00 2001 From: Evan Harvey <57234914+e10harvey@users.noreply.github.com> Date: Fri, 21 Apr 2023 12:51:18 -0600 Subject: [PATCH 267/442] Update sparse/src/KokkosSparse_par_ilut.hpp --- sparse/src/KokkosSparse_par_ilut.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sparse/src/KokkosSparse_par_ilut.hpp b/sparse/src/KokkosSparse_par_ilut.hpp index d42b38e5e1..42cfc1e7d2 100644 --- a/sparse/src/KokkosSparse_par_ilut.hpp +++ b/sparse/src/KokkosSparse_par_ilut.hpp @@ -215,7 +215,6 @@ void par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, /// @param U_rowmap The row map (row nnz offsets) for the U CSR (Input/Output) /// @param U_entries The entries (column ids) for the U CSR (Output) /// @param U_values The values (non-zero matrix values) for the U CSR (Output) -/// @param deterministic Please ignore. This parameter will be removed soon. template Date: Fri, 21 Apr 2023 13:03:04 -0600 Subject: [PATCH 268/442] sparse: coo2crs add RandomAccess to BmapViewType --- sparse/src/KokkosSparse_coo2crs.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index b3e8baaa04..2574ae0e7c 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -71,8 +71,9 @@ class Coo2Crs { using CrsColIdsView = Kokkos::View; // Needed since Kokkos::Bitset cannot be accessed on the host - using BmapViewType = Kokkos::View; - using Bitset = Kokkos::Bitset; + using BmapViewType = + Kokkos::View>; + using Bitset = Kokkos::Bitset; CrsRowMapView m_crs_row_map; CrsRowMapAtomicView m_crs_row_map_tmp; From 2949394c035fe52deb89f8f8bed4bfc5c65ad872 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 21 Apr 2023 14:45:06 -0600 Subject: [PATCH 269/442] BLAS1: apply clang-format --- blas/impl/KokkosBlas1_abs_spec.hpp | 13 ++- 
blas/impl/KokkosBlas1_axpby_impl.hpp | 8 +- blas/impl/KokkosBlas1_axpby_mv_impl.hpp | 44 +++++--- blas/impl/KokkosBlas1_axpby_spec.hpp | 79 +++++++------ blas/impl/KokkosBlas1_mult_impl.hpp | 9 +- blas/impl/KokkosBlas1_mult_spec.hpp | 43 ++++--- blas/impl/KokkosBlas1_reciprocal_impl.hpp | 6 +- blas/impl/KokkosBlas1_reciprocal_spec.hpp | 21 ++-- blas/impl/KokkosBlas1_scal_impl.hpp | 4 +- blas/impl/KokkosBlas1_scal_mv_impl.hpp | 106 +++++++++++++----- blas/impl/KokkosBlas1_scal_spec.hpp | 57 ++++++---- blas/impl/KokkosBlas1_update_impl.hpp | 7 +- blas/impl/KokkosBlas1_update_spec.hpp | 52 +++++---- blas/src/KokkosBlas1_abs.hpp | 18 ++- blas/src/KokkosBlas1_axpby.hpp | 51 +++++---- blas/src/KokkosBlas1_fill.hpp | 3 +- blas/src/KokkosBlas1_mult.hpp | 28 +++-- blas/src/KokkosBlas1_reciprocal.hpp | 20 ++-- blas/src/KokkosBlas1_scal.hpp | 43 ++++--- blas/src/KokkosBlas1_update.hpp | 28 +++-- .../tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp | 9 +- blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp | 3 +- blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 59 +++++----- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 48 ++++---- .../KokkosBlas1_update_tpl_spec_avail.hpp | 3 +- 25 files changed, 462 insertions(+), 300 deletions(-) diff --git a/blas/impl/KokkosBlas1_abs_spec.hpp b/blas/impl/KokkosBlas1_abs_spec.hpp index ec14678816..76555aec5a 100644 --- a/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_abs_spec.hpp @@ -85,9 +85,10 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = abs_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = RMV::rank, + bool tpl_spec_avail = abs_tpl_spec_avail::value, + bool eti_spec_avail = abs_eti_spec_avail::value> struct Abs { static void abs(const execution_space& space, const RMV& R, const XMV& X); }; @@ -95,7 +96,8 @@ struct Abs { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! 
Full specialization of Abs for single vectors (1-D Views). template -struct Abs { +struct Abs { using size_type = typename XMV::size_type; static void abs(const execution_space& space, const RMV& R, const XMV& X) { @@ -137,7 +139,8 @@ struct Abs -struct Abs { +struct Abs { using size_type = typename XMV::size_type; static void abs(const execution_space& space, const RMV& R, const XMV& X) { diff --git a/blas/impl/KokkosBlas1_axpby_impl.hpp b/blas/impl/KokkosBlas1_axpby_impl.hpp index 0403ef24ff..4e468b0e56 100644 --- a/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -302,9 +302,11 @@ struct Axpby_Functor -void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, const BV& bv, const YV& y, - const SizeType startingColumn, int a = 2, int b = 2) { +template +void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, + const BV& bv, const YV& y, const SizeType startingColumn, + int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: X is not a Kokkos::View."); diff --git a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 84b2856cac..32653b9cce 100644 --- a/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -947,8 +947,10 @@ struct Axpby_MV_Unroll_Functor -void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, +template +void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, const SizeType startingColumn, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -1101,9 +1103,10 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. 
-template -void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, - int a = 2, int b = 2) { +template +void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Generic: X is not a Kokkos::View."); @@ -1239,10 +1242,11 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Left { - static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, - int a = 2, int b = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Left: X is not a Kokkos::View."); @@ -1274,8 +1278,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. - Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, - a, b); + Axpby_MV_Unrolled( + space, av, X_cur, bv, Y_cur, j, a, b); } for (; j + 4 <= numCols; j += 4) { XMV X_cur = Kokkos::subview(x, Kokkos::ALL(), std::make_pair(j, j + 4)); @@ -1284,8 +1288,8 @@ struct Axpby_MV_Invoke_Left { // Passing in the starting column index lets the functor take // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. 
- Axpby_MV_Unrolled(space, av, X_cur, bv, Y_cur, j, - a, b); + Axpby_MV_Unrolled( + space, av, X_cur, bv, Y_cur, j, a, b); } for (; j < numCols; ++j) { auto x_cur = Kokkos::subview(x, Kokkos::ALL(), j); @@ -1296,7 +1300,8 @@ struct Axpby_MV_Invoke_Left { // the functor doesn't have to do anything to them. typedef decltype(x_cur) XV; typedef decltype(y_cur) YV; - Axpby_Generic(space, av, x_cur, bv, y_cur, j, a, b); + Axpby_Generic( + space, av, x_cur, bv, y_cur, j, a, b); } } }; @@ -1320,10 +1325,11 @@ struct Axpby_MV_Invoke_Left { // coefficients in av and bv vectors, if they are used. // // Either av and bv are both 1-D Views, or av and bv are both scalars. -template +template struct Axpby_MV_Invoke_Right { - static void run(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, - int a = 2, int b = 2) { + static void run(const execution_space& space, const AV& av, const XMV& x, + const BV& bv, const YMV& y, int a = 2, int b = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_MV_Invoke_Right: X is not a Kokkos::View."); @@ -1348,9 +1354,11 @@ struct Axpby_MV_Invoke_Right { auto y_0 = Kokkos::subview(y, Kokkos::ALL(), 0); typedef decltype(x_0) XV; typedef decltype(y_0) YV; - Axpby_Generic(space, av, x_0, bv, y_0, 0, a, b); + Axpby_Generic( + space, av, x_0, bv, y_0, 0, a, b); } else { - Axpby_MV_Generic(space, av, x, bv, y, a, b); + Axpby_MV_Generic( + space, av, x, bv, y, a, b); } } }; diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index f5d7c96061..187ea04c2e 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -28,7 +28,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_eti_spec_avail { enum : bool { value = false }; }; @@ -46,8 +47,7 @@ struct axpby_eti_spec_avail { MEM_SPACE) \ template <> \ struct axpby_eti_spec_avail< \ - 
EXEC_SPACE, \ - SCALAR, \ + EXEC_SPACE, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -69,8 +69,7 @@ struct axpby_eti_spec_avail { MEM_SPACE) \ template <> \ struct axpby_eti_spec_avail< \ - EXEC_SPACE, \ - SCALAR, \ + EXEC_SPACE, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -130,16 +129,21 @@ namespace Impl { /// Any scalar coefficient of zero has BLAS semantics of /// ignoring the corresponding (multi)vector entry. This does NOT /// apply to coefficients in av and bv vectors, if they are used. -template ::value, - bool eti_spec_avail = axpby_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + axpby_eti_spec_avail::value> struct Axpby { - static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y); + static void axpby(const execution_space& space, const AV& av, const XMV& X, + const BV& bv, const YMV& Y); }; template struct Axpby { - static void axpby(const execution_space& /*space*/, const AV& /* av */, const XMV& /* X */, const BV& /* bv */, + static void axpby(const execution_space& /*space*/, const AV& /* av */, + const XMV& /* X */, const BV& /* bv */, const YMV& /* Y */) { static_assert(YMV::rank == 0, "Oh My God"); } @@ -148,10 +152,12 @@ struct Axpby { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Full specialization for XMV and YMV rank-2 Views. 
template -struct Axpby { +struct Axpby { typedef typename YMV::size_type size_type; - static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y) { + static void axpby(const execution_space& space, const AV& av, const XMV& X, + const BV& bv, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); @@ -202,17 +208,17 @@ struct Axpby::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } else { typedef typename XMV::size_type index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); } Kokkos::Profiling::popRegion(); @@ -231,8 +237,8 @@ struct Axpby ATA; typedef Kokkos::ArithTraits ATB; - static void axpby(const execution_space& space, const AV& alpha, const XMV& X, const BV& beta, - const YMV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XMV& X, + const BV& beta, const YMV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::Axpby::axpby (MV): " "X is not a Kokkos::View."); @@ -304,17 +310,17 @@ struct Axpby::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } else { typedef typename XMV::size_type index_type; typedef typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type - Axpby_MV_Invoke_Layout; + Axpby_MV_Invoke_Right, + Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; 
Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); } Kokkos::Profiling::popRegion(); @@ -333,7 +339,8 @@ struct Axpby ATA; typedef Kokkos::ArithTraits ATB; - static void axpby(const execution_space& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { + static void axpby(const execution_space& space, const AV& alpha, const XV& X, + const BV& beta, const YV& Y) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby::axpby: X is not a Kokkos::View."); @@ -398,11 +405,13 @@ struct Axpby(INT_MAX)) { typedef int index_type; Axpby_Generic(space, alpha, X, beta, Y, 0, a, b); + typename YV::non_const_value_type, YV, index_type>( + space, alpha, X, beta, Y, 0, a, b); } else { typedef typename XV::size_type index_type; Axpby_Generic(space, alpha, X, beta, Y, 0, a, b); + typename YV::non_const_value_type, YV, index_type>( + space, alpha, X, beta, Y, 0, a, b); } Kokkos::Profiling::popRegion(); } @@ -423,8 +432,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ @@ -435,8 +443,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ @@ -456,8 +463,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ @@ -483,8 +489,7 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ diff --git a/blas/impl/KokkosBlas1_mult_impl.hpp b/blas/impl/KokkosBlas1_mult_impl.hpp index 2a70e0caab..048db395b0 100644 --- a/blas/impl/KokkosBlas1_mult_impl.hpp +++ b/blas/impl/KokkosBlas1_mult_impl.hpp @@ -145,7 +145,8 @@ struct V_MultFunctor { /// C(i) = c * C(i) + ab * A(i) * B(i), subject to the usual BLAS /// update rules. template -void V_Mult_Generic(const execution_space& space, typename CV::const_value_type& c, const CV& C, +void V_Mult_Generic(const execution_space& space, + typename CV::const_value_type& c, const CV& C, typename AV::const_value_type& ab, const AV& A, const BV& B) { using Kokkos::ALL; @@ -191,7 +192,8 @@ void V_Mult_Generic(const execution_space& space, typename CV::const_value_type& /// C(i,j) = c * C(i,j) + ab * A(i) * B(i,j), subject to the usual /// BLAS update rules. 
template -void MV_Mult_Generic(const execution_space& space, typename CMV::const_value_type& c, const CMV& C, +void MV_Mult_Generic(const execution_space& space, + typename CMV::const_value_type& c, const CMV& C, typename AV::const_value_type& ab, const AV& A, const BMV& B) { typedef Kokkos::ArithTraits ATA; @@ -203,7 +205,8 @@ void MV_Mult_Generic(const execution_space& space, typename CMV::const_value_typ typedef decltype(C_0) CV; typedef decltype(B_0) BV; - V_Mult_Generic(space, c, C_0, ab, A, B_0); + V_Mult_Generic(space, c, C_0, ab, A, + B_0); return; } diff --git a/blas/impl/KokkosBlas1_mult_spec.hpp b/blas/impl/KokkosBlas1_mult_spec.hpp index 25aeba86c2..4a38c347f5 100644 --- a/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_mult_spec.hpp @@ -27,7 +27,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_eti_spec_avail { enum : bool { value = false }; }; @@ -100,11 +101,15 @@ namespace Impl { /// Y(i,j) = alpha*A(i,j)*X(i,j) + gamma*Y(i,j) /// /// with special cases for alpha, or gamma = 0. -template ::value, - bool eti_spec_avail = mult_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + mult_eti_spec_avail::value> struct Mult { - static void mult(const execution_space& space, const typename YMV::non_const_value_type& gamma, + static void mult(const execution_space& space, + const typename YMV::non_const_value_type& gamma, const YMV& Y, const typename XMV::non_const_value_type& alpha, const AV& A, const XMV& X); @@ -113,13 +118,15 @@ struct Mult { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for YMV, AV, and XMV rank-2 Views. 
template -struct Mult { +struct Mult { typedef typename YMV::size_type size_type; typedef typename YMV::non_const_value_type YMV_scalar; typedef typename XMV::non_const_value_type XMV_scalar; - static void mult(const execution_space& space, const YMV_scalar& gamma, const YMV& Y, - const XMV_scalar& alpha, const AV& A, const XMV& X) { + static void mult(const execution_space& space, const YMV_scalar& gamma, + const YMV& Y, const XMV_scalar& alpha, const AV& A, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Mult::mult: Y is not a Kokkos::View."); @@ -162,9 +169,11 @@ struct Mult(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Mult_Generic(space, gamma, Y, alpha, A, X); + MV_Mult_Generic(space, gamma, Y, + alpha, A, X); } else { - MV_Mult_Generic(space, gamma, Y, alpha, A, X); + MV_Mult_Generic(space, gamma, Y, + alpha, A, X); } Kokkos::Profiling::popRegion(); } @@ -172,13 +181,15 @@ struct Mult -struct Mult { +struct Mult { typedef typename YV::size_type size_type; typedef typename YV::non_const_value_type YV_scalar; typedef typename XV::non_const_value_type XV_scalar; - static void mult(const execution_space& space, const YV_scalar& gamma, const YV& Y, const XV_scalar& alpha, - const AV& A, const XV& X) { + static void mult(const execution_space& space, const YV_scalar& gamma, + const YV& Y, const XV_scalar& alpha, const AV& A, + const XV& X) { // YV, AV, and XV must be Kokkos::View specializations. 
static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -214,9 +225,11 @@ struct Mult(INT_MAX)) { - V_Mult_Generic(space, gamma, Y, alpha, A, X); + V_Mult_Generic(space, gamma, Y, alpha, + A, X); } else { - V_Mult_Generic(space, gamma, Y, alpha, A, X); + V_Mult_Generic(space, gamma, Y, + alpha, A, X); } Kokkos::Profiling::popRegion(); } diff --git a/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/blas/impl/KokkosBlas1_reciprocal_impl.hpp index 4c3a28815b..21f736ac4f 100644 --- a/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -148,7 +148,8 @@ struct V_ReciprocalSelf_Functor { // Invoke the "generic" (not unrolled) multivector functor that // computes entry-wise reciprocalolute value. template -void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, const XMV& X) { +void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "MV_Reciprocal_Generic: RMV is not a Kokkos::View."); @@ -176,7 +177,8 @@ void MV_Reciprocal_Generic(const execution_space& space, const RMV& R, const XMV // Variant of MV_Reciprocal_Generic for single vectors (1-D Views) R and X. 
template -void V_Reciprocal_Generic(const execution_space& space, const RV& R, const XV& X) { +void V_Reciprocal_Generic(const execution_space& space, const RV& R, + const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "V_Reciprocal_Generic: RV is not a Kokkos::View."); diff --git a/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/blas/impl/KokkosBlas1_reciprocal_spec.hpp index ea937c167b..f758acae2f 100644 --- a/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -87,19 +87,24 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = reciprocal_eti_spec_avail::value> + bool tpl_spec_avail = + reciprocal_tpl_spec_avail::value, + bool eti_spec_avail = + reciprocal_eti_spec_avail::value> struct Reciprocal { - static void reciprocal(const execution_space& space, const RMV& R, const XMV& X); + static void reciprocal(const execution_space& space, const RMV& R, + const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Reciprocal for single vectors (1-D Views). 
template -struct Reciprocal { +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<1-D>: RMV is not a Kokkos::View."); @@ -139,10 +144,12 @@ struct Reciprocal -struct Reciprocal { +struct Reciprocal { typedef typename XMV::size_type size_type; - static void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { + static void reciprocal(const execution_space& space, const RMV& R, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Reciprocal<2-D>: RMV is not a Kokkos::View."); diff --git a/blas/impl/KokkosBlas1_scal_impl.hpp b/blas/impl/KokkosBlas1_scal_impl.hpp index 3cbf44af7d..541d9a4934 100644 --- a/blas/impl/KokkosBlas1_scal_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_impl.hpp @@ -133,8 +133,8 @@ struct V_Scal_Functor -void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, const XV& x, - const SizeType startingColumn, int a = 2) { +void V_Scal_Generic(const execution_space& space, const RV& r, const AV& av, + const XV& x, const SizeType startingColumn, int a = 2) { static_assert(Kokkos::is_view::value, "V_Scal_Generic: RV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, diff --git a/blas/impl/KokkosBlas1_scal_mv_impl.hpp b/blas/impl/KokkosBlas1_scal_mv_impl.hpp index 58c4091fed..da4d7a5149 100644 --- a/blas/impl/KokkosBlas1_scal_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_scal_mv_impl.hpp @@ -319,10 +319,11 @@ struct MV_Scal_Unroll_Functor -void MV_Scal_Unrolled(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, +template +void MV_Scal_Unrolled(const execution_space& space, const RMV& r, + const aVector& av, const XMV& x, const SizeType startingColumn, int a = 2) { - if (a == 0) { MV_Scal_Unroll_Functor op( r, x, 
av, startingColumn); @@ -370,8 +371,10 @@ void MV_Scal_Unrolled(const execution_space& space, const RMV& r, const aVector& // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Generic(const execution_space& space, const RVector& r, const aVector& av, const XVector& x, +template +void MV_Scal_Generic(const execution_space& space, const RVector& r, + const aVector& av, const XVector& x, const SizeType startingColumn, int a = 2) { const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); @@ -416,7 +419,8 @@ void MV_Scal_Generic(const execution_space& space, const RVector& r, const aVect // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. template -void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& av, const XMV& x, int a = 2) { +void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, + const AV& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); #if KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL <= 2 @@ -433,7 +437,8 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& a typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); + MV_Scal_Unrolled( + space, R_cur, av, X_cur, j, a); } for (; j + 4 <= numCols; j += 4) { const std::pair rng(j, j + 4); @@ -442,7 +447,8 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& a typedef decltype(X_cur) XMV2D; typedef decltype(R_cur) RMV2D; - MV_Scal_Unrolled(space, R_cur, av, X_cur, j, a); + MV_Scal_Unrolled( + space, R_cur, av, X_cur, j, a); } for (; j < numCols; ++j) { // RMV and XMV need to turn 1-D. 
@@ -451,7 +457,8 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& a typedef decltype(r_cur) RV; typedef decltype(x_cur) XV; - V_Scal_Generic(space, r_cur, av, x_cur, j, a); + V_Scal_Generic(space, r_cur, av, + x_cur, j, a); } #else // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL > 2 @@ -463,39 +470,73 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& a typedef decltype(r_0) RV; typedef decltype(x_0) XV; - V_Scal_Generic(space, r_0, av, x_0, 0, a); + V_Scal_Generic(space, r_0, av, x_0, + 0, a); break; } - case 2: MV_Scal_Unrolled(space, r, av, x, 0, a); break; - case 3: MV_Scal_Unrolled(space, r, av, x, 0, a); break; - case 4: MV_Scal_Unrolled(space, r, av, x, 0, a); break; - case 5: MV_Scal_Unrolled(space, r, av, x, 0, a); break; - case 6: MV_Scal_Unrolled(space, r, av, x, 0, a); break; - case 7: MV_Scal_Unrolled(space, r, av, x, 0, a); break; - case 8: MV_Scal_Unrolled(space, r, av, x, 0, a); break; - case 9: MV_Scal_Unrolled(space, r, av, x, 0, a); break; + case 2: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 3: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 4: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 5: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 6: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 7: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 8: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; + case 9: + MV_Scal_Unrolled(space, r, av, + x, 0, a); + break; case 10: - MV_Scal_Unrolled(space, r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 11: - MV_Scal_Unrolled(space, r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 12: - MV_Scal_Unrolled(space, r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 13: - MV_Scal_Unrolled(space, r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 14: - MV_Scal_Unrolled(space, r, 
av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 15: - MV_Scal_Unrolled(space, r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; case 16: - MV_Scal_Unrolled(space, r, av, x, 0, a); + MV_Scal_Unrolled( + space, r, av, x, 0, a); break; - default: MV_Scal_Generic(space, r, av, x, 0, a); + default: + MV_Scal_Generic(space, r, av, x, + 0, a); } #endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_SCAL @@ -515,9 +556,10 @@ void MV_Scal_Invoke_Left(const execution_space& space, const RMV& r, const AV& a // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to // coefficient(s) in av, if used. -template -void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, const aVector& av, const XMV& x, - int a = 2) { +template +void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, + const aVector& av, const XMV& x, int a = 2) { const SizeType numCols = x.extent(1); if (numCols == 1) { @@ -530,9 +572,11 @@ void MV_Scal_Invoke_Right(const execution_space& space, const RMV& r, const aVec RV r_0 = Kokkos::subview(r, Kokkos::ALL(), 0); XV x_0 = Kokkos::subview(x, Kokkos::ALL(), 0); - V_Scal_Generic(space, r_0, av, x_0, a); + V_Scal_Generic(space, r_0, + av, x_0, a); } else { - MV_Scal_Generic(space, r, av, x, a); + MV_Scal_Generic(space, r, av, + x, a); } } diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index bb80f30451..a7e6ef1f11 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -29,7 +29,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_eti_spec_avail { enum : bool { value = false }; }; @@ -102,23 +103,28 @@ namespace KokkosBlas { namespace Impl { // Unification layer - template ::value, - bool eti_spec_avail = scal_eti_spec_avail::value> +template ::value, + bool 
eti_spec_avail = + scal_eti_spec_avail::value> struct Scal { - static void scal(const execution_space& space, const RV& R, const AV& A, const XV& X); + static void scal(const execution_space& space, const RV& R, const AV& A, + const XV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Scal for single vectors (1-D Views). template -struct Scal { +struct Scal { typedef typename XV::non_const_value_type AV; typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RV& R, const AV& alpha, const XV& X) { + static void scal(const execution_space& space, const RV& R, const AV& alpha, + const XV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<1-D>: RV is not a Kokkos::View."); @@ -157,10 +163,12 @@ struct Scal(INT_MAX)) { typedef int index_type; - V_Scal_Generic(space, R, alpha, X, a); + V_Scal_Generic(space, R, alpha, + X, a); } else { typedef typename XV::size_type index_type; - V_Scal_Generic(space, R, alpha, X, a); + V_Scal_Generic(space, R, alpha, + X, a); } Kokkos::Profiling::popRegion(); } @@ -173,11 +181,13 @@ struct Scal -struct Scal { +struct Scal { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RMV& R, const AV& av, const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& av, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D>: RMV is not a Kokkos::View."); @@ -215,10 +225,12 @@ struct Scal(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(space, R, av, X, a); + MV_Scal_Invoke_Left(space, R, + av, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(space, R, av, X, a); + MV_Scal_Invoke_Left(space, R, + av, X, a); } Kokkos::Profiling::popRegion(); } @@ -231,13 +243,14 @@ struct 
Scal -struct Scal { +struct Scal { typedef typename XMV::non_const_value_type AV; typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; - static void scal(const execution_space& space, const RMV& R, const AV& alpha, const XMV& X) { + static void scal(const execution_space& space, const RMV& R, const AV& alpha, + const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Scal<2-D, AV=scalar>: RMV is not a Kokkos::View."); @@ -278,12 +291,14 @@ struct Scal(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Scal_Invoke_Left(space, R, alpha, X, a); + MV_Scal_Invoke_Left( + space, R, alpha, X, a); } else { typedef typename XMV::size_type index_type; - MV_Scal_Invoke_Left(space, R, alpha, X, a); + MV_Scal_Invoke_Left( + space, R, alpha, X, a); } Kokkos::Profiling::popRegion(); } diff --git a/blas/impl/KokkosBlas1_update_impl.hpp b/blas/impl/KokkosBlas1_update_impl.hpp index d4abcc0ba0..96aca5c70e 100644 --- a/blas/impl/KokkosBlas1_update_impl.hpp +++ b/blas/impl/KokkosBlas1_update_impl.hpp @@ -314,9 +314,10 @@ struct V_Update_Functor { // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding multivector entry. -template +template void MV_Update_Generic(const execution_space& space, - const typename XMV::non_const_value_type& alpha, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -417,7 +418,7 @@ void MV_Update_Generic(const execution_space& space, // corresponding vector entry. 
template void V_Update_Generic(const execution_space& space, - const typename XV::non_const_value_type& alpha, + const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, diff --git a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index aa573b4058..d1e8692c8a 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ b/blas/impl/KokkosBlas1_update_spec.hpp @@ -27,7 +27,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_eti_spec_avail { enum : bool { value = false }; }; @@ -102,12 +103,15 @@ namespace Impl { /// Z(i,j) = alpha*X(i,j) + beta*Y(i,j) + gamma*Z(i,j), /// /// with special cases for alpha, beta, or gamma = 0. -template ::value, - bool eti_spec_avail = update_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + update_eti_spec_avail::value> struct Update { static void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -118,14 +122,15 @@ struct Update { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY // Partial specialization for XMV, YMV, and ZMV rank-2 Views. 
template -struct Update { +struct Update { typedef typename XMV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; static void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, @@ -198,24 +203,24 @@ struct Update(INT_MAX)) { typedef int index_type; - V_Update_Generic(space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, - c); + V_Update_Generic(space, alpha, X_0, beta, + Y_0, gamma, Z_0, a, b, c); } else { typedef typename XMV::size_type index_type; - V_Update_Generic(space, alpha, X_0, beta, Y_0, gamma, Z_0, a, b, - c); + V_Update_Generic(space, alpha, X_0, beta, + Y_0, gamma, Z_0, a, b, c); } } else { if (numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - MV_Update_Generic(space, alpha, X, beta, Y, gamma, - Z, a, b, c); + MV_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, b, c); } else { typedef typename XMV::size_type index_type; - MV_Update_Generic(space, alpha, X, beta, Y, gamma, - Z, a, b, c); + MV_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, b, c); } } Kokkos::Profiling::popRegion(); @@ -224,14 +229,15 @@ struct Update -struct Update { +struct Update { typedef typename XV::size_type size_type; typedef Kokkos::ArithTraits ATA; typedef Kokkos::ArithTraits ATB; typedef Kokkos::ArithTraits ATC; static void update(const execution_space& space, - const typename XV::non_const_value_type& alpha, + const typename XV::non_const_value_type& alpha, const XV& X, const typename YV::non_const_value_type& beta, const YV& Y, const typename ZV::non_const_value_type& gamma, @@ -296,12 +302,12 @@ struct Update(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { typedef int index_type; - V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, - b, c); + V_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, 
b, c); } else { typedef typename XV::size_type index_type; - V_Update_Generic(space, alpha, X, beta, Y, gamma, Z, a, - b, c); + V_Update_Generic( + space, alpha, X, beta, Y, gamma, Z, a, b, c); } Kokkos::Profiling::popRegion(); } diff --git a/blas/src/KokkosBlas1_abs.hpp b/blas/src/KokkosBlas1_abs.hpp index 15afb8929f..7d2915af31 100644 --- a/blas/src/KokkosBlas1_abs.hpp +++ b/blas/src/KokkosBlas1_abs.hpp @@ -36,17 +36,22 @@ namespace KokkosBlas { template void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::abs: execution_space must be a valid Kokkos execution space."); + "KokkosBlas::abs: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "R is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::abs: RMV must be accessible from execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: RMV must be accessible from execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::abs: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::abs: XMV must be accessible from execution space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::abs: XMV must be accessible from execution space"); static_assert(std::is_same::value, "KokkosBlas::abs: R is const. 
" @@ -85,7 +90,8 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Abs::abs(space, R_internal, X_internal); + Impl::Abs::abs(space, R_internal, + X_internal); } /// \brief R(i,j) = abs(X(i,j)) diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 825ab34d1f..9c83e8ace5 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -31,19 +31,25 @@ namespace KokkosBlas { template -void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, const YMV& Y) { +void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, + const YMV& Y) { static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::axpby: execution_space must be a valid Kokkos execution space."); + "KokkosBlas::axpby: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::axpby: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::axpby: XMV must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::axpby: XMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::axpby: " "Y is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::axpby: XMV must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::axpby: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::axpby: Y is const. It must be nonconst, " @@ -74,23 +80,27 @@ void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, // Create unmanaged versions of the input Views. XMV and YMV may be // rank 1 or rank 2. AV and BV may be either rank-1 Views, or // scalar values. 
- using XMV_Internal = Kokkos::View >; - using YMV_Internal = Kokkos::View >; - using AV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType< - AV, XMV_Internal, true>::type; - using BV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType< - BV, YMV_Internal, true>::type; + using XMV_Internal = Kokkos::View >; + using YMV_Internal = Kokkos::View >; + using AV_Internal = + typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; + using BV_Internal = + typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; AV_Internal a_internal = a; XMV_Internal X_internal = X; BV_Internal b_internal = b; YMV_Internal Y_internal = Y; - Impl::Axpby::axpby(space, a_internal, X_internal, b_internal, Y_internal); + Impl::Axpby::axpby(space, a_internal, X_internal, b_internal, + Y_internal); } template @@ -99,9 +109,10 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { } template -void axpy(const execution_space& space, const AV& a, const XMV& X, const YMV& Y) { - axpby(space, a, X, Kokkos::ArithTraits::one(), - Y); +void axpy(const execution_space& space, const AV& a, const XMV& X, + const YMV& Y) { + axpby(space, a, X, + Kokkos::ArithTraits::one(), Y); } template diff --git a/blas/src/KokkosBlas1_fill.hpp b/blas/src/KokkosBlas1_fill.hpp index a7186c466a..a3fe4e4cd2 100644 --- a/blas/src/KokkosBlas1_fill.hpp +++ b/blas/src/KokkosBlas1_fill.hpp @@ -28,7 +28,8 @@ namespace KokkosBlas { /// \param X [out] Output View (1-D or 2-D). /// \param val [in] Value with which to fill the entries of X. 
template -void fill(const execution_space& space, const XMV& X, const typename XMV::non_const_value_type& val) { +void fill(const execution_space& space, const XMV& X, + const typename XMV::non_const_value_type& val) { Kokkos::Profiling::pushRegion("KokkosBlas::fill"); Kokkos::deep_copy(space, X, val); Kokkos::Profiling::popRegion(); diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index a13a2ebd17..1d346bf06f 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -38,25 +38,33 @@ namespace KokkosBlas { /// /// \return Y = gamma * Y + alpha * A * X. template -void mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, - typename AV::const_value_type& alpha, const AV& A, const XMV& X) { +void mult(const execution_space& space, typename YMV::const_value_type& gamma, + const YMV& Y, typename AV::const_value_type& alpha, const AV& A, + const XMV& X) { static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::mult: execution_space must be a valid Kokkos execution space."); + "KokkosBlas::mult: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "Y is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: YMV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: YMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "A is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: AV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::mult: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::mult: 
AV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::mult: AV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::mult: Y is const. " @@ -107,7 +115,7 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, c XMV_Internal X_internal = X; Impl::Mult::mult( - space, gamma, Y_internal, alpha, A_internal, X_internal); + space, gamma, Y_internal, alpha, A_internal, X_internal); } /// \brief Element wise multiplication of two vectors: diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index 62780faaa8..e42b6fec54 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -35,17 +35,22 @@ namespace KokkosBlas { template void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::reciprocal: execution_space must be a valid Kokkos execition space."); + "KokkosBlas::reciprocal: execution_space must be a valid " + "Kokkos execition space."); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "R is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: RMV must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::reciprocal: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::reciprocal: XMV must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::reciprocal: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::reciprocal: R is const. 
" @@ -86,9 +91,8 @@ void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { RMV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Reciprocal::reciprocal(space, - R_internal, - X_internal); + Impl::Reciprocal::reciprocal( + space, R_internal, X_internal); } /// \brief R(i,j) = reciprocal(X(i,j)) diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 996a78f252..2e44b135e6 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -30,21 +30,29 @@ namespace KokkosBlas { template -void scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) { +void scal(const execution_space& space, const RMV& R, const AV& a, + const XMV& X) { static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::scal: execution_space must be a valid Kokkos execution space"); + "KokkosBlas::scal: execution_space must be a valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "R is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::scal: RMV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: RMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::scal: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::scal: XMV must be accessible from execution_space"); - static_assert(Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::scal: XMV must be assignable to RMV"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::scal: XMV must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::scal: XMV must be assignable to RMV"); static_assert(std::is_same::value, "KokkosBlas::scal: R is const. 
" @@ -75,21 +83,22 @@ void scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) // Create unmanaged versions of the input Views. RMV and XMV may be // rank 1 or rank 2. AV may be either a rank-1 View, or a scalar // value. - using RMV_Internal = Kokkos::View >; - using XMV_Internal = Kokkos::View >; - using AV_Internal = typename KokkosKernels::Impl::GetUnifiedScalarViewType< - AV, XMV_Internal, true>::type; + using RMV_Internal = Kokkos::View >; + using XMV_Internal = Kokkos::View >; + using AV_Internal = + typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; RMV_Internal R_internal = R; AV_Internal a_internal = a; XMV_Internal X_internal = X; Impl::Scal::scal( - space, R_internal, a_internal, X_internal); + space, R_internal, a_internal, X_internal); } template diff --git a/blas/src/KokkosBlas1_update.hpp b/blas/src/KokkosBlas1_update.hpp index 5a37482fb2..17f6680eaf 100644 --- a/blas/src/KokkosBlas1_update.hpp +++ b/blas/src/KokkosBlas1_update.hpp @@ -35,11 +35,12 @@ namespace KokkosBlas { /// ZMV. 
template void update(const execution_space& space, - const typename XMV::non_const_value_type& alpha, const XMV& X, + const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) { static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::update: execution_space must be a valid Kokkos execution space."); + "KokkosBlas::update: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "X is not a Kokkos::View."); @@ -49,12 +50,18 @@ void update(const execution_space& space, static_assert(Kokkos::is_view::value, "KokkosBlas::update: " "Z is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: XMV must be accessible from execution_space."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: YMV must be accessible from execution_space."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::update: ZMV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: XMV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: YMV must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::update: ZMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::update: Z is const. " @@ -117,8 +124,9 @@ void update(const execution_space& space, << endl; #endif // KOKKOSKERNELS_PRINT_DEMANGLED_TYPE_INFO - Impl::Update::update( - space, alpha, X_internal, beta, Y_internal, gamma, Z_internal); + Impl::Update::update(space, alpha, X_internal, beta, Y_internal, + gamma, Z_internal); } /// \brief Compute Z := alpha*X + beta*Y + gamma*Z. 
diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 3cedf5fc3f..e2b04e300d 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct axpby_tpl_spec_avail { enum : bool { value = false }; }; @@ -36,8 +37,7 @@ namespace Impl { #define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct axpby_tpl_spec_avail< \ - ExecSpace, \ - SCALAR, \ + ExecSpace, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ @@ -64,8 +64,7 @@ KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, #define KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct axpby_tpl_spec_avail< \ - ExecSpace, \ - SCALAR, \ + ExecSpace, SCALAR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ diff --git a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp index 3924e0da21..8d3fc0f4d2 100644 --- a/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_mult_tpl_spec_avail.hpp @@ -20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct mult_tpl_spec_avail { enum : bool { value = false }; }; diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index 571682b5b5..2aeef2b40d 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct scal_tpl_spec_avail { enum : bool { value = false }; }; @@ -59,9 +60,9 @@ KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, 
Kokkos::LayoutLeft, // cuBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) // double -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ - template <> \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct scal_tpl_spec_avail< \ EXECSPACE, \ Kokkos::View, \ @@ -73,32 +74,36 @@ KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + 
Kokkos::CudaUVMSpace) #endif // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE,\ - MEMSPACE) \ - template <> \ +#define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct scal_tpl_spec_avail< \ EXECSPACE, \ Kokkos::View, \ @@ -110,16 +115,16 @@ KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, enum : bool { value = true }; \ }; -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIP, Kokkos::HIPSpace) -KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, - Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIP, Kokkos::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::HIP, Kokkos::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 0050923a47..ec7f095fa6 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -248,9 +248,9 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, MEMSPACE \ - ETI_SPEC_AVAIL) \ - template <> \ + SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ + MEMSPACE ETI_SPEC_AVAIL) \ + template <> \ struct Scal< \ EXECSPACE, \ Kokkos::View, \ @@ -273,7 +273,7 @@ namespace Impl { typedef typename XV::size_type size_type; \ \ static void scal(const execution_space& space, const RV& R, \ - const 
AS& alpha, const XV& X) { \ + const AS& alpha, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ const size_type numElems = X.extent(0); \ @@ -284,18 +284,18 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ s.handle, N, reinterpret_cast(&alpha), \ reinterpret_cast(R.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_pointer_mode(s.handle, pointer_mode)); \ } else { \ Scal::scal(R, alpha, X); \ } \ @@ -303,25 +303,25 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ - MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, \ - LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) +#define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(double, double, rocblas_dscal, \ + LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, 
EXECSPACE, \ + MEMSPACE ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ + MEMSPACE ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ + MEMSPACE ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) diff --git a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp index 55e1383ed7..88a60e6d19 100644 --- a/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_update_tpl_spec_avail.hpp @@ -20,7 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct update_tpl_spec_avail { enum : bool { value = false }; }; From b3d73f1d09603e42a2a7a3cb48c7b537bee9d4da Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 21 Apr 2023 17:47:24 -0600 Subject: [PATCH 270/442] Add doxygen for user-facing Gauss-Seidel functions --- docs/developer/apidocs/sparse.rst | 17 +- sparse/src/KokkosSparse_gauss_seidel.hpp | 433 +++++++++++++---------- 2 files changed, 255 insertions(+), 195 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 8347afcf34..e56ad79345 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -24,11 +24,11 @@ ccs2crs spmv ---- -.. 
doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char[], const AlphaType&, const AMatrix&, const XVector&, const BetaType&, const YVector&) -.. doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) -.. doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) -.. doxygenfunctions:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) -.. doxygenfunctions:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char[], const AlphaType&, const AMatrix&, const XVector&, const BetaType&, const YVector&) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) +.. 
doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) trsv @@ -49,9 +49,10 @@ block_spgemm gauss_seidel ------------ .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) -.. doxygenfunctions:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) -.. doxygenfunctions:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. 
doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel @@ -71,4 +72,4 @@ par_ilut gmres ----- -.. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) \ No newline at end of file +.. 
doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index a2c6e89e82..9f1b9d8cb1 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -26,17 +26,21 @@ namespace KokkosSparse { namespace Experimental { /// -/// @brief +/// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity +/// pattern only) /// -/// @tparam KernelHandle -/// @tparam lno_row_view_t_ -/// @tparam lno_nnz_view_t_ -/// @param handle -/// @param num_rows -/// @param num_cols -/// @param row_map -/// @param entries -/// @param is_graph_symmetric +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) 
has been called previously /// template @@ -99,19 +103,22 @@ void gauss_seidel_symbolic(KernelHandle *handle, } /// -/// @brief -/// -/// @tparam KernelHandle -/// @tparam lno_row_view_t_ -/// @tparam lno_nnz_view_t_ -/// @param handle -/// @param num_rows -/// @param num_cols -/// @param block_size -/// @param row_map -/// @param entries -/// @param is_graph_symmetric +/// @brief Block Gauss-Seidel preconditioner setup (first phase, based on +/// sparsity pattern only) /// +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param block_size The number of degrees of freedom per block +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) 
has been called previously template void block_gauss_seidel_symbolic( @@ -132,20 +139,23 @@ void block_gauss_seidel_symbolic( } /// -/// @brief +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values) /// -/// @tparam format -/// @tparam KernelHandle -/// @tparam lno_row_view_t_ -/// @tparam lno_nnz_view_t_ -/// @tparam scalar_nnz_view_t_ -/// @param handle -/// @param num_rows -/// @param num_cols -/// @param row_map -/// @param entries -/// @param values -/// @param is_graph_symmetric +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric /// template num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. 
template num_rows x +/// num_rows submatrix of A is structurally symmetric /// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// template x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// template Date: Fri, 21 Apr 2023 22:40:15 -0600 Subject: [PATCH 271/442] BLAS1: fix some Host BLAS TPL issue with execution space overload --- blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 34 +++++++++++-------- blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 1 + blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 5 +-- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index f69fc618a0..561ce8b6ae 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -43,6 +43,7 @@ namespace Impl { #define KOKKOSBLAS1_DAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ + ExecSpace, \ double, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -60,7 +61,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const AV& alpha, const 
XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ if ((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ @@ -69,8 +70,8 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -78,6 +79,7 @@ namespace Impl { #define KOKKOSBLAS1_SAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ + ExecSpace, \ float, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -95,7 +97,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ @@ -104,15 +106,17 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; #define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Axpby, \ + struct Axpby< \ + ExecSpace, \ + Kokkos::complex, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ @@ -132,7 +136,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::axpby[TPL_BLAS,complex]"); \ @@ -146,15 +150,17 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else \ - Axpby::axpby( 
\ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; #define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Axpby, \ + struct Axpby< \ + ExecSpace, \ + Kokkos::complex, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ @@ -174,7 +180,7 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::axpby[TPL_BLAS,complex]"); \ @@ -188,8 +194,8 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index 2aeef2b40d..f4ca77ba69 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -37,6 +37,7 @@ namespace Impl { #define KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct scal_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index ec7f095fa6..bcea9b9a57 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -42,6 +42,7 @@ namespace Impl { LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Scal< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR_TYPE, \ @@ -60,7 +61,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void scal(const RV& R, const AS& alpha, const XV& X) { \ + static void scal(const ExecSpace& space, const RV& R, 
const AS& alpha, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS," #SCALAR_TYPE \ "]"); \ const size_type numElems = X.extent(0); \ @@ -73,7 +74,7 @@ namespace Impl { HostBlas::scal( \ N, alpha_b, reinterpret_cast(R.data()), one); \ } else { \ - Scal::scal(R, alpha, X); \ + Scal::scal(space, R, alpha, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ From ffefb538630f6350f50dfc08683c65515ede694c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 21 Apr 2023 22:42:22 -0600 Subject: [PATCH 272/442] BLAS1: applying clang format --- blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 186 +++++++++--------- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 6 +- 2 files changed, 94 insertions(+), 98 deletions(-) diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 561ce8b6ae..1ffd75fe41 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -43,8 +43,7 @@ namespace Impl { #define KOKKOSBLAS1_DAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ - ExecSpace, \ - double, \ + ExecSpace, double, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ double, \ @@ -61,8 +60,8 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,double]"); \ if ((X.extent(0) < INT_MAX) && (beta == 1.0)) { \ axpby_print_specialization(); \ @@ -70,8 +69,8 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -79,8 +78,7 @@ namespace Impl { #define KOKKOSBLAS1_SAXPBY_BLAS(LAYOUT, 
MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Axpby< \ - ExecSpace, \ - float, \ + ExecSpace, float, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ float, \ @@ -97,8 +95,8 @@ namespace Impl { Kokkos::MemoryTraits > \ YV; \ \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_BLAS,float]"); \ if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ axpby_print_specialization(); \ @@ -106,98 +104,94 @@ namespace Impl { int one = 1; \ HostBlas::axpy(N, alpha, X.data(), one, Y.data(), one); \ } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy( \ + N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby< \ - ExecSpace, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_BLAS,complex]"); \ - if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ 
- axpby_print_specialization(); \ - int N = X.extent(0); \ - int one = 1; \ - const std::complex alpha_val = alpha; \ - HostBlas >::axpy( \ - N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_BLAS,complex]"); \ + if ((X.extent(0) < INT_MAX) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + int N = X.extent(0); \ + int one = 1; \ + const std::complex alpha_val = alpha; \ + HostBlas >::axpy( \ + N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index bcea9b9a57..8265195884 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -61,7 +61,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ + static void scal(const ExecSpace& 
space, const RV& R, const AS& alpha, \ + const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_BLAS," #SCALAR_TYPE \ "]"); \ const size_type numElems = X.extent(0); \ @@ -74,7 +75,8 @@ namespace Impl { HostBlas::scal( \ N, alpha_b, reinterpret_cast(R.data()), one); \ } else { \ - Scal::scal(space, R, alpha, X); \ + Scal::scal(space, R, \ + alpha, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ From fa03d4884144b6573dd3c02c0d951b1d2d57e97b Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 24 Apr 2023 09:04:05 -0600 Subject: [PATCH 273/442] Update blas1.rst --- docs/developer/apidocs/blas1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst index 1a68066271..d6956f34b3 100644 --- a/docs/developer/apidocs/blas1.rst +++ b/docs/developer/apidocs/blas1.rst @@ -51,7 +51,7 @@ sum .. doxygenfunction:: KokkosBlas::sum(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) swap ---- +---- .. doxygenfunction:: KokkosBlas::swap(execution_space const& space, XVector const& X, YVector const& Y) .. 
doxygenfunction:: KokkosBlas::swap(XVector const& X, YVector const& Y) From bf09ba19b0383901273f06eabc8c863106700115 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 24 Apr 2023 09:19:58 -0600 Subject: [PATCH 274/442] BLAS1: fix CUBLAS TPL layer for axpby and scal --- blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 70 ++++++++++++------- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 10 ++- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 1ffd75fe41..b650263dbd 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -224,7 +224,8 @@ namespace Impl { #define KOKKOSBLAS1_DAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Axpby< \ + struct Axpby< \ + ExecSpace, \ double, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -243,7 +244,7 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ @@ -253,17 +254,22 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; #define KOKKOSBLAS1_SAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) 
\ template \ - struct Axpby< \ + struct Axpby< \ + ExecSpace, \ float, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -282,7 +288,7 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ @@ -292,17 +298,22 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; #define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Axpby, \ + struct Axpby, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ @@ -323,7 +334,7 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ @@ -334,20 +345,25 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZaxpy(s.handle, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + 
cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy(s.handle, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; #define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Axpby, \ + struct Axpby, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ @@ -368,7 +384,7 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const AV& alpha, const XV& X, const BV& beta, \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ const YV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ @@ -379,12 +395,16 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - alpha, X, beta, Y); \ + Axpby::axpby( \ + space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 8265195884..f760d8c0ff 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -137,6 +137,7 @@ namespace Impl { 
ETI_SPEC_AVAIL) \ template \ struct Scal< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR_TYPE, \ @@ -155,7 +156,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void scal(const RV& R, const AS& alpha, const XV& X) { \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ + const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ const size_type numElems = X.extent(0); \ @@ -166,11 +168,15 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ s.handle, N, reinterpret_cast(&alpha), \ reinterpret_cast(R.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, NULL)); \ } else { \ - Scal::scal(R, alpha, X); \ + Scal::scal(space, R, alpha, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ From bb0e2fef32bcfd0e3122146d5653315211392ef9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 24 Apr 2023 09:29:01 -0600 Subject: [PATCH 275/442] BLAS1: fix documentation for fill and mult and apply clang-format --- blas/src/KokkosBlas1_fill.hpp | 3 + blas/src/KokkosBlas1_mult.hpp | 3 + blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp | 233 +++++++++--------- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 96 ++++---- 4 files changed, 168 insertions(+), 167 deletions(-) diff --git a/blas/src/KokkosBlas1_fill.hpp b/blas/src/KokkosBlas1_fill.hpp index a3fe4e4cd2..1507341b76 100644 --- a/blas/src/KokkosBlas1_fill.hpp +++ b/blas/src/KokkosBlas1_fill.hpp @@ -23,8 +23,11 @@ namespace KokkosBlas { /// \brief Fill the multivector or single vector X with the given value. 
/// +/// \tparam execution_space a Kokkos execution space /// \tparam XMV 1-D or 2-D output View /// +/// \param space [in] A Kokkos instance of execution_space on which the +/// kernel will run. /// \param X [out] Output View (1-D or 2-D). /// \param val [in] Value with which to fill the entries of X. template diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 1d346bf06f..8c72896312 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -26,10 +26,13 @@ namespace KokkosBlas { /// \brief Element wise multiplication of two vectors: /// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] /// +/// \tparam execution_type a Kokkos execution space type. /// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. /// \tparam AV Type of the second vector A; a 1-D Kokkos::View. /// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. /// +/// \param space [in] An instance of execution_space on which the kernel +/// will run (it may specify an execution stream/queue). /// \param gamma [in] The scalar to apply to Y. /// \param Y [in/out] The Y vector. /// \param alpha [in] The scalar to apply to A. 
diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index b650263dbd..65154b9985 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -224,9 +224,8 @@ namespace Impl { #define KOKKOSBLAS1_DAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Axpby< \ - ExecSpace, \ - double, \ + struct Axpby< \ + ExecSpace, double, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ double, \ @@ -244,8 +243,8 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast(INT_MAX)) && (beta == 1.0)) { \ @@ -254,23 +253,22 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, NULL)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; #define KOKKOSBLAS1_SAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Axpby< \ - ExecSpace, \ - float, \ + struct Axpby< \ + ExecSpace, float, \ Kokkos::View, \ 
Kokkos::MemoryTraits >, \ float, \ @@ -288,8 +286,8 @@ namespace Impl { YV; \ typedef typename XV::size_type size_type; \ \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ @@ -298,115 +296,112 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, NULL)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ + Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, 
const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy(s.handle, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + 
cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy( \ + s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, \ - const YV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby( \ - space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct 
Axpby, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, \ + const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy( \ + s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index f760d8c0ff..9fb67e726b 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -132,54 +132,54 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ - CUBLAS_FN, LAYOUT, MEMSPACE, \ - 
ETI_SPEC_AVAIL) \ - template \ - struct Scal< \ - ExecSpace, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ - const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && \ - (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ - s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, NULL)); \ - } else { \ - Scal::scal(space, R, alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, \ + CUBLAS_FN, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Scal< \ + ExecSpace, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, \ + const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const 
size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && \ + (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Scal::scal(space, R, \ + alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ From 5f89a772f08e68238b0b24613db9dfcbc6e2ae86 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 24 Apr 2023 09:37:07 -0600 Subject: [PATCH 276/442] sparse: Fix intel build error --- sparse/src/KokkosSparse_coo2crs.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index 2574ae0e7c..ef3f57fd36 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -321,7 +321,9 @@ auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, if (row.extent(0) != col.extent(0) || row.extent(0) != data.extent(0)) Kokkos::abort("row.extent(0) = col.extent(0) = data.extent(0) required."); - if (m < 0 || n < 0) Kokkos::abort("m >= 0 and n >= 0 required."); + if constexpr (std::is_signed_v) { + if (m < 0 || n < 0) Kokkos::abort("m >= 0 and n >= 0 required."); + } using Coo2crsType = Impl::Coo2Crs; From daf1edce68b7e9a1068b1a9ea85d864e366cc484 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 24 Apr 2023 10:17:51 -0600 Subject: [PATCH 277/442] BLAS1: updating documentation for changes in PR #1803 This documents the addition of the execution_space template parameter and the associated 
space parameter to allow execution of the BLAS kernels in streams/queues when on device. --- blas/src/KokkosBlas1_abs.hpp | 7 +++++ blas/src/KokkosBlas1_axpby.hpp | 48 +++++++++++++++++++++++++++++ blas/src/KokkosBlas1_reciprocal.hpp | 5 +++ blas/src/KokkosBlas1_scal.hpp | 22 +++++++++++++ blas/src/KokkosBlas1_update.hpp | 15 +++++++++ docs/developer/apidocs/blas1.rst | 23 ++++++++++---- 6 files changed, 114 insertions(+), 6 deletions(-) diff --git a/blas/src/KokkosBlas1_abs.hpp b/blas/src/KokkosBlas1_abs.hpp index 7d2915af31..4137941de5 100644 --- a/blas/src/KokkosBlas1_abs.hpp +++ b/blas/src/KokkosBlas1_abs.hpp @@ -33,6 +33,10 @@ namespace KokkosBlas { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param space [in] an execution_space instance where the kernel will run. +/// \param R [out] view of type RMV that contains the absolute value X on output. +/// \param X [in] view of type XMV. template void abs(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_execution_space_v, @@ -103,6 +107,9 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param R [out] view of type RMV that contains the absolute value X on output. +/// \param X [in] view of type XMV. template void abs(const RMV& R, const XMV& X) { abs(typename RMV::execution_space{}, R, X); diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index 9c83e8ace5..cfd38f3cd4 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -30,6 +30,20 @@ namespace KokkosBlas { +/// \brief Computes Y := a*X + b*Y +/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. 
+/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam BV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param space [in] the execution space instance on which the kernel will run. +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +/// \param b [in] view of type BV, scaling parameter for Y. +/// \param Y [in/out] view of type YMV in which the results will be stored. template void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, const YMV& Y) { @@ -103,11 +117,35 @@ void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, Y_internal); } +/// \brief Computes Y := a*X + b*Y +/// +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam BV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +/// \param b [in] view of type BV, scaling parameter for Y. +/// \param Y [in/out] view of type YMV in which the results will be stored. template void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { axpby(typename XMV::execution_space{}, a, X, b, Y); } +/// \brief Computes Y := a*X + Y +/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param space [in] the execution space instance on which the kernel will run. +/// \param a [in] view of type AV, scaling parameter for X. 
+/// \param X [in] input view of type XMV. +/// \param Y [in/out] view of type YMV in which the results will be stored. template void axpy(const execution_space& space, const AV& a, const XMV& X, const YMV& Y) { @@ -115,6 +153,16 @@ void axpy(const execution_space& space, const AV& a, const XMV& X, Kokkos::ArithTraits::one(), Y); } +/// \brief Computes Y := a*X + Y +/// +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as XMV. +/// +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. +/// \param Y [in/out] view of type YMV in which the results will be stored. template void axpy(const AV& a, const XMV& X, const YMV& Y) { axpy(typename XMV::execution_space{}, a, X, Y); diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index e42b6fec54..75f0159cdc 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -28,10 +28,15 @@ namespace KokkosBlas { /// Replace each entry in R with the absolute value (magnitude), of the /// reciprocal of the corresponding entry in X. /// +/// \tparam execution_space a Kokkos execution space /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param space [in] an instance of execution space where the kernel will run +/// \param R [out] a view of type RMV that contains the inverse of the values in X. +/// \param X [in] a view of type XMV that contains the values to invert. 
template void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_execution_space_v, diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 2e44b135e6..47c8be3ffb 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -29,6 +29,18 @@ namespace KokkosBlas { +/// \brief Computes R := alpha*X +/// +/// \tparam execution_space a Kokkos execution space where the kernel will run. +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV. +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// +/// \param space [in] the execution space instance on which the kernel will run. +/// \param R [in/out] view of type RMV in which the results will be stored. +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. template void scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) { @@ -101,6 +113,16 @@ void scal(const execution_space& space, const RMV& R, const AV& a, space, R_internal, a_internal, X_internal); } +/// \brief Computes R := alpha*X +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV. +/// \tparam AV 1-D or 2-D Kokkos::View specialization. +/// +/// \param R [in/out] view of type RMV in which the results will be stored. +/// \param a [in] view of type AV, scaling parameter for X. +/// \param X [in] input view of type XMV. 
template void scal(const RMV& R, const AV& a, const XMV& X) { scal(typename RMV::execution_space{}, R, a, X); diff --git a/blas/src/KokkosBlas1_update.hpp b/blas/src/KokkosBlas1_update.hpp index 17f6680eaf..f77777884d 100644 --- a/blas/src/KokkosBlas1_update.hpp +++ b/blas/src/KokkosBlas1_update.hpp @@ -33,6 +33,14 @@ namespace KokkosBlas { /// the same rank as XMV and YMV, and it must make sense to add up /// the entries of XMV and YMV and assign them to the entries of /// ZMV. +/// +/// \param space [in] the execution space instance on which the kernel will run. +/// \param alpha [in] scaling parameter for X +/// \param X [in] input view of type XMV +/// \param beta [in] scaling parameter for Y +/// \param Y [in] input view of type YMV +/// \param gamma [in] scaling parameter for Z +/// \param Z [in/out] view of type ZMV in which the results will be stored. template void update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, @@ -138,6 +146,13 @@ void update(const execution_space& space, /// the same rank as XMV and YMV, and it must make sense to add up /// the entries of XMV and YMV and assign them to the entries of /// ZMV. +/// +/// \param alpha [in] scaling parameter for X +/// \param X [in] input view of type XMV +/// \param beta [in] scaling parameter for Y +/// \param Y [in] input view of type YMV +/// \param gamma [in] scaling parameter for Z +/// \param Z [in/out] view of type ZMV in which the results will be stored. template void update(const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst index d6956f34b3..f800f6e5ce 100644 --- a/docs/developer/apidocs/blas1.rst +++ b/docs/developer/apidocs/blas1.rst @@ -1,9 +1,15 @@ BLAS1 -- KokkosKernels blas1 interfaces ======================================= +abs +___ +.. 
doxygenfunction:: KokkosBlas::abs(const execution_space& space, const RMV& R, const XMV& X) +.. doxygenfunction:: KokkosBlas::abs(const RMV& R, const XMV& X) + axpby ----- -.. doxygenfunction:: KokkosBlas::axpby +.. doxygenfunction:: KokkosBlas::axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, const YMV& Y) +.. doxygenfunction:: KokkosBlas::axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) dot --- @@ -12,11 +18,13 @@ dot fill ---- -.. doxygenfunction:: KokkosBlas::fill +.. doxygenfunction:: KokkosBlas::fill(const execution_space& space, const XMV& X, const typename XMV::non_const_value_type& val) +.. doxygenfunction:: KokkosBlas::fill(const XMV& X, const typename XMV::non_const_value_type& val) mult ---- -.. doxygenfunction:: KokkosBlas::mult +.. doxygenfunction:: KokkosBlas::mult(const execution_space& space, typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) +.. doxygenfunction:: KokkosBlas::mult(typename YMV::const_value_type& gamma, const YMV& Y, typename AV::const_value_type& alpha, const AV& A, const XMV& X) nrm1 ---- @@ -40,11 +48,13 @@ nrminf reciprocal ---------- -.. doxygenfunction:: KokkosBlas::reciprocal +.. doxygenfunction:: KokkosBlas::reciprocal(const execution_space& space, const RMV& R, const XMV& X) +.. doxygenfunction:: KokkosBlas::reciprocal(const RMV& R, const XMV& X) scal ---- -.. doxygenfunction:: KokkosBlas::scal +.. doxygenfunction:: KokkosBlas::scal(const execution_space& space, const RMV& R, const AV& a, const XMV& X) +.. doxygenfunction:: KokkosBlas::scal(const RMV& R, const AV& a, const XMV& X) sum --- @@ -57,4 +67,5 @@ swap update ------ -.. doxygenfunction:: KokkosBlas::update +.. 
doxygenfunction:: KokkosBlas::update(const execution_space& space, const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) +.. doxygenfunction:: KokkosBlas::update(const typename XMV::non_const_value_type& alpha, const XMV& X, const typename YMV::non_const_value_type& beta, const YMV& Y, const typename ZMV::non_const_value_type& gamma, const ZMV& Z) From 6606dde035051d22411699854ed965cce48fd14b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 24 Apr 2023 18:04:24 -0600 Subject: [PATCH 278/442] BLAS1: documentation adding default space info and non-block statement --- blas/src/KokkosBlas1_abs.hpp | 7 ++++--- blas/src/KokkosBlas1_axpby.hpp | 12 ++++++++++++ blas/src/KokkosBlas1_fill.hpp | 6 ++++++ blas/src/KokkosBlas1_mult.hpp | 6 ++++++ blas/src/KokkosBlas1_reciprocal.hpp | 4 ++++ blas/src/KokkosBlas1_scal.hpp | 6 ++++++ blas/src/KokkosBlas1_update.hpp | 6 ++++++ 7 files changed, 44 insertions(+), 3 deletions(-) diff --git a/blas/src/KokkosBlas1_abs.hpp b/blas/src/KokkosBlas1_abs.hpp index 4137941de5..2c65ba26dc 100644 --- a/blas/src/KokkosBlas1_abs.hpp +++ b/blas/src/KokkosBlas1_abs.hpp @@ -25,7 +25,7 @@ namespace KokkosBlas { /// \brief R(i,j) = abs(X(i,j)) /// -/// Replace each entry in R with the absolute value (magnitude) of the +/// Non-blocking function to replace each entry in R with the absolute value (magnitude) of the /// corresponding entry in X. /// /// \tparam execution_space a Kokkos execution space to run the kernels on. @@ -100,8 +100,9 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { /// \brief R(i,j) = abs(X(i,j)) /// -/// Replace each entry in R with the absolute value (magnitude) of the -/// corresponding entry in X. +/// Non-blocking function to replace each entry in R with the absolute value +/// (magnitude) of the corresponding entry in X. 
The kernel is executed in the +/// default stream/queue associated with the execution space of RMV. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have diff --git a/blas/src/KokkosBlas1_axpby.hpp b/blas/src/KokkosBlas1_axpby.hpp index cfd38f3cd4..2f59cb4cce 100644 --- a/blas/src/KokkosBlas1_axpby.hpp +++ b/blas/src/KokkosBlas1_axpby.hpp @@ -32,6 +32,8 @@ namespace KokkosBlas { /// \brief Computes Y := a*X + b*Y /// +/// This function is non-blocking and thread safe. +/// /// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam AV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. @@ -119,6 +121,10 @@ void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, /// \brief Computes Y := a*X + b*Y /// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of XMV. +/// /// \tparam AV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. /// \tparam BV 1-D or 2-D Kokkos::View specialization. @@ -136,6 +142,8 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { /// \brief Computes Y := a*X + Y /// +/// This function is non-blocking and thread-safe +/// /// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam AV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. @@ -155,6 +163,10 @@ void axpy(const execution_space& space, const AV& a, const XMV& X, /// \brief Computes Y := a*X + Y /// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of XMV. +/// /// \tparam AV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. 
/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have diff --git a/blas/src/KokkosBlas1_fill.hpp b/blas/src/KokkosBlas1_fill.hpp index 1507341b76..403411f7b8 100644 --- a/blas/src/KokkosBlas1_fill.hpp +++ b/blas/src/KokkosBlas1_fill.hpp @@ -23,6 +23,8 @@ namespace KokkosBlas { /// \brief Fill the multivector or single vector X with the given value. /// +/// This function is non-blocking and thread-safe +/// /// \tparam execution_space a Kokkos execution space /// \tparam XMV 1-D or 2-D output View /// @@ -40,6 +42,10 @@ void fill(const execution_space& space, const XMV& X, /// \brief Fill the multivector or single vector X with the given value. /// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of XMV. +/// /// \tparam XMV 1-D or 2-D output View /// /// \param X [out] Output View (1-D or 2-D). diff --git a/blas/src/KokkosBlas1_mult.hpp b/blas/src/KokkosBlas1_mult.hpp index 8c72896312..47fa1f536f 100644 --- a/blas/src/KokkosBlas1_mult.hpp +++ b/blas/src/KokkosBlas1_mult.hpp @@ -26,6 +26,8 @@ namespace KokkosBlas { /// \brief Element wise multiplication of two vectors: /// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] /// +/// This function is non-blocking and thread-safe +/// /// \tparam execution_type a Kokkos execution space type. /// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. /// \tparam AV Type of the second vector A; a 1-D Kokkos::View. @@ -124,6 +126,10 @@ void mult(const execution_space& space, typename YMV::const_value_type& gamma, /// \brief Element wise multiplication of two vectors: /// Y[i] = gamma * Y[i] + alpha * A[i] * X[i] /// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of YMV. +/// /// \tparam YMV Type of the first vector Y; a 1-D or 2-D Kokkos::View. /// \tparam AV Type of the second vector A; a 1-D Kokkos::View. 
/// \tparam XMV Type of the third vector X; a 1-D or 2-D Kokkos::View. diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index 75f0159cdc..f5f55e54cf 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -27,6 +27,7 @@ namespace KokkosBlas { /// /// Replace each entry in R with the absolute value (magnitude), of the /// reciprocal of the corresponding entry in X. +/// This function is non-blocking and thread-safe /// /// \tparam execution_space a Kokkos execution space /// \tparam RMV 1-D or 2-D Kokkos::View specialization. @@ -104,6 +105,9 @@ void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { /// /// Replace each entry in R with the absolute value (magnitude), of the /// reciprocal of the corresponding entry in X. +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of RMV. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have diff --git a/blas/src/KokkosBlas1_scal.hpp b/blas/src/KokkosBlas1_scal.hpp index 47c8be3ffb..39c197f352 100644 --- a/blas/src/KokkosBlas1_scal.hpp +++ b/blas/src/KokkosBlas1_scal.hpp @@ -31,6 +31,8 @@ namespace KokkosBlas { /// \brief Computes R := alpha*X /// +/// This function is non-blocking and thread-safe +/// /// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have @@ -115,6 +117,10 @@ void scal(const execution_space& space, const RMV& R, const AV& a, /// \brief Computes R := alpha*X /// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of YMV. +/// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. 
/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV. diff --git a/blas/src/KokkosBlas1_update.hpp b/blas/src/KokkosBlas1_update.hpp index f77777884d..889f9ede32 100644 --- a/blas/src/KokkosBlas1_update.hpp +++ b/blas/src/KokkosBlas1_update.hpp @@ -25,6 +25,8 @@ namespace KokkosBlas { /// \brief Compute Z := alpha*X + beta*Y + gamma*Z. /// +/// This function is non-blocking and thread-safe +/// /// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. /// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have @@ -139,6 +141,10 @@ void update(const execution_space& space, /// \brief Compute Z := alpha*X + beta*Y + gamma*Z. /// +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue +/// associated with the execution space of ZMV. +/// /// \tparam XMV 1-D or 2-D Kokkos::View specialization. /// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as XMV. From 03d67872477887f4e697bb91f98851e384dcbe30 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 25 Apr 2023 06:37:48 -0600 Subject: [PATCH 279/442] BLAS1: clang-format for documentation... : ( --- blas/src/KokkosBlas1_abs.hpp | 10 ++++++---- blas/src/KokkosBlas1_reciprocal.hpp | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/blas/src/KokkosBlas1_abs.hpp b/blas/src/KokkosBlas1_abs.hpp index 2c65ba26dc..bd63ccedf1 100644 --- a/blas/src/KokkosBlas1_abs.hpp +++ b/blas/src/KokkosBlas1_abs.hpp @@ -25,8 +25,8 @@ namespace KokkosBlas { /// \brief R(i,j) = abs(X(i,j)) /// -/// Non-blocking function to replace each entry in R with the absolute value (magnitude) of the -/// corresponding entry in X. +/// Non-blocking function to replace each entry in R with the absolute value +/// (magnitude) of the corresponding entry in X. 
/// /// \tparam execution_space a Kokkos execution space to run the kernels on. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. @@ -35,7 +35,8 @@ namespace KokkosBlas { /// those of RMV. /// /// \param space [in] an execution_space instance where the kernel will run. -/// \param R [out] view of type RMV that contains the absolute value X on output. +/// \param R [out] view of type RMV that contains the absolute value X on +/// output. /// \param X [in] view of type XMV. template void abs(const execution_space& space, const RMV& R, const XMV& X) { @@ -109,7 +110,8 @@ void abs(const execution_space& space, const RMV& R, const XMV& X) { /// the same rank as RMV, and its entries must be assignable to /// those of RMV. /// -/// \param R [out] view of type RMV that contains the absolute value X on output. +/// \param R [out] view of type RMV that contains the absolute value X on +/// output. /// \param X [in] view of type XMV. template void abs(const RMV& R, const XMV& X) { diff --git a/blas/src/KokkosBlas1_reciprocal.hpp b/blas/src/KokkosBlas1_reciprocal.hpp index f5f55e54cf..ef73d26828 100644 --- a/blas/src/KokkosBlas1_reciprocal.hpp +++ b/blas/src/KokkosBlas1_reciprocal.hpp @@ -36,7 +36,8 @@ namespace KokkosBlas { /// those of RMV. /// /// \param space [in] an instance of execution space where the kernel will run -/// \param R [out] a view of type RMV that contains the inverse of the values in X. +/// \param R [out] a view of type RMV that contains the inverse of the values in +/// X. /// \param X [in] a view of type XMV that contains the values to invert. template void reciprocal(const execution_space& space, const RMV& R, const XMV& X) { From 4538fc44692749677b7d47506e53865f92ec1b73 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 12 Apr 2023 15:42:07 -0600 Subject: [PATCH 280/442] Blas1: updating nrm1 interface to accept execution space instance This basically will allow users to run the kernels on a stream or queue. 
Also adding rocBLAS support while making changes... --- blas/impl/KokkosBlas1_nrm1_impl.hpp | 49 +++-- blas/impl/KokkosBlas1_nrm1_spec.hpp | 48 +++-- blas/src/KokkosBlas1_nrm1.hpp | 48 ++++- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 29 ++- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 194 ++++++++++++++++++ 5 files changed, 314 insertions(+), 54 deletions(-) diff --git a/blas/impl/KokkosBlas1_nrm1_impl.hpp b/blas/impl/KokkosBlas1_nrm1_impl.hpp index 433ce580df..6fd71f0871 100644 --- a/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -35,12 +35,11 @@ namespace Impl { /// \tparam SizeType Index type. Use int (32 bits) if possible. template struct V_Nrm1_Functor { - typedef typename XV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XV::non_const_value_type xvalue_type; - typedef Kokkos::ArithTraits XAT; - typedef typename XAT::mag_type value_type; - typedef Kokkos::ArithTraits MAT; + using size_type = SizeType; + using xvalue_type = typename XV::non_const_value_type; + using XAT = Kokkos::ArithTraits; + using value_type = typename XAT::mag_type; + using MAT = Kokkos::ArithTraits; typename XV::const_type m_x; @@ -70,11 +69,11 @@ struct V_Nrm1_Functor { template struct Nrm1_MV_Functor { - typedef typename RV::non_const_value_type rvalue_type; - typedef typename XV::non_const_value_type xvalue_type; - typedef Kokkos::ArithTraits XAT; - typedef typename XAT::mag_type value_type; - typedef Kokkos::ArithTraits MAT; + using rvalue_type = typename RV::non_const_value_type; + using xvalue_type = typename XV::non_const_value_type; + using XAT = Kokkos::ArithTraits; + using value_type = typename XAT::mag_type; + using MAT = Kokkos::ArithTraits; using TeamMem = typename Kokkos::TeamPolicy::member_type; @@ -112,11 +111,10 @@ struct Nrm1_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. 
-template -void V_Nrm1_Invoke(const RV& r, const XV& X) { - typedef typename XV::execution_space execution_space; +template +void V_Nrm1_Invoke(const execution_space& space, const RV& r, const XV& X) { const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_Nrm1_Functor functor_type; functor_type op(X); @@ -127,13 +125,12 @@ void V_Nrm1_Invoke(const RV& r, const XV& X) { /// multivector (2-D View) X, and store result(s) in the 1-D View r. // Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Nrm1_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm1 (rank-2): result vector has wrong length (" @@ -142,14 +139,14 @@ void MV_Nrm1_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for( "KokkosBlas1::Nrm1::S1", pol, Nrm1_MV_Functor(r, x, teamsPerVec)); @@ -157,18 +154,18 @@ void MV_Nrm1_Invoke( // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Nrm1_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); - MV_Nrm1_Invoke(tempResult, x); - 
Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + MV_Nrm1_Invoke(tempResult, x); + Kokkos::deep_copy(space, r, tempResult); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm1_spec.hpp b/blas/impl/KokkosBlas1_nrm1_spec.hpp index dbaface96e..f94ad3a2f5 100644 --- a/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm1_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct nrm1_eti_spec_avail { #define KOKKOSBLAS1_NRM1_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct nrm1_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -68,6 +69,7 @@ struct nrm1_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm1_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -90,20 +92,20 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrm1_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrm1_eti_spec_avail::value> struct Nrm1 { - static void nrm1(const RMV& R, const XMV& X); + static void nrm1(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm1 for single vectors (1-D Views). 
-template -struct Nrm1 { - typedef typename XMV::size_type size_type; +template +struct Nrm1 { + using size_type = typename XMV::size_type; - static void nrm1(const RMV& R, const XMV& X) { + static void nrm1(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm1<1-D>: RMV is not a Kokkos::View."); @@ -131,20 +133,20 @@ struct Nrm1 { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Nrm1_Invoke(R, X); + V_Nrm1_Invoke(space, R, X); } else { - typedef std::int64_t index_type; - V_Nrm1_Invoke(R, X); + using index_type = std::int64_t; + V_Nrm1_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Nrm1 { - typedef typename XMV::size_type size_type; +template +struct Nrm1 { + using size_type = typename XMV::size_type; - static void nrm1(const RV& R, const XMV& X) { + static void nrm1(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm1<2-D>: RV is not a Kokkos::View."); @@ -176,18 +178,18 @@ struct Nrm1 { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm1_Invoke(R0, X0); + V_Nrm1_Invoke(space, R0, X0); } else { typedef std::int64_t index_type; - V_Nrm1_Invoke(R0, X0); + V_Nrm1_Invoke(space, R0, X0); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm1_Invoke(R, X); + MV_Nrm1_Invoke(space, R, X); } else { - typedef std::int64_t index_type; - MV_Nrm1_Invoke(R, X); + using index_type = std::int64_t; + MV_Nrm1_Invoke(space, R, X); } } Kokkos::Profiling::popRegion(); @@ -207,6 +209,7 @@ struct Nrm1 { // #define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -223,6 +226,7 @@ struct 
Nrm1 { // #define KOKKOSBLAS1_NRM1_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -242,6 +246,7 @@ struct Nrm1 { #define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -261,6 +266,7 @@ struct Nrm1 { #define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Nrm1< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index 62f373d7b8..481eb83346 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -30,10 +30,13 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm1 product result; a single value. -template +template ::value, int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm1(const XVector& x) { +nrm1(const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, @@ -55,11 +58,25 @@ nrm1(const XVector& x) { RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm1::nrm1(R, X); + Impl::Nrm1::nrm1(space, R, X); Kokkos::fence(); return result; } +/// \brief Return the nrm1 of the vector x. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrm1 product result; a single value. 
+template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm1(const XVector& x) { + return nrm1(typename XVector::execution_space{}, x); +} + /// \brief R(j) = nrm1(X(i,j)) /// /// Replace each entry in R with the nrm1olute value (magnitude) of the @@ -69,8 +86,8 @@ nrm1(const XVector& x) { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void nrm1(const RV& R, const XMV& X, +template +void nrm1(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: " @@ -87,6 +104,10 @@ void nrm1(const RV& R, const XMV& X, ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm1: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm1: execution_space cannot access data in XMV"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm1: execution_space cannot access data in RV"); typedef typename Kokkos::Details::InnerProductSpaceTraits< typename XMV::non_const_value_type>::mag_type mag_type; @@ -128,7 +149,22 @@ void nrm1(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm1::nrm1(R_internal, X_internal); + Impl::Nrm1::nrm1(space, R_internal, X_internal); +} + +/// \brief R(j) = nrm1(X(i,j)) +/// +/// Replace each entry in R with the nrm1olute value (magnitude) of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. 
+template +void nrm1(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrm1(typename XMV::execution_space{}, R, X); } /// \brief Return the nrm1 of the vector x via asum (the actual blas name). diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 529952c10c..add1b8db2a 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm1_tpl_spec_avail { enum : bool { value = false }; }; @@ -84,6 +84,33 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #endif +//rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail< \ + Kokkos::View< \ + typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCUBLAS(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) + + +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 559615d105..5b9bc2035f 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -374,4 +374,198 @@ KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, 
#endif +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1< \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dzasum(s.handle, N, \ + reinterpret_cast(X.data()), \ + one, R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ 
+ \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_scasum(s.handle, N, \ + reinterpret_cast(X.data()), \ + one, R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + true) +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif + #endif From a760a1d60630915f23954971e6fa953f58789dd9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 13 Apr 2023 16:15:35 -0600 Subject: [PATCH 281/442] BLAS nrm1: fixing issues with TPLs --- blas/impl/KokkosBlas1_nrm1_impl.hpp | 2 +- blas/src/KokkosBlas1_nrm1.hpp | 2 - blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 3 + blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 102 ++++++++++-------- 4 files changed, 
60 insertions(+), 49 deletions(-) diff --git a/blas/impl/KokkosBlas1_nrm1_impl.hpp b/blas/impl/KokkosBlas1_nrm1_impl.hpp index 6fd71f0871..fb1e616934 100644 --- a/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -164,7 +164,7 @@ void MV_Nrm1_Invoke( tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); - MV_Nrm1_Invoke(tempResult, x); + MV_Nrm1_Invoke(space, tempResult, x); Kokkos::deep_copy(space, r, tempResult); } diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index 481eb83346..94463c30fd 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -106,8 +106,6 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); static_assert(Kokkos::SpaceAccessibility::accessible, "KokkosBlas::nrm1: execution_space cannot access data in XMV"); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm1: execution_space cannot access data in RV"); typedef typename Kokkos::Details::InnerProductSpaceTraits< typename XMV::non_const_value_type>::mag_type mag_type; diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index add1b8db2a..da9cc9edca 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -36,6 +36,7 @@ namespace Impl { #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm1_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -63,6 +64,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm1_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ 
LAYOUT, Kokkos::HostSpace, \ @@ -89,6 +91,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm1_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 5b9bc2035f..4d8d8c89ba 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -203,25 +203,27 @@ KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Nrm1< \ - Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -232,31 +234,33 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ cublasDasum(s.handle, N, X.data(), one, R.data()); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Nrm1< \ + EXECSPACE, \ Kokkos::View >, \ 
- Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -267,31 +271,34 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ cublasSasum(s.handle, N, X.data(), one, R.data()); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 \ + struct Nrm1< \ + EXECSPACE, \ + Kokkos::View >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -305,31 +312,34 @@ namespace Impl { reinterpret_cast(X.data()), one, \ R.data()); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ - 
struct Nrm1 \ + struct Nrm1< \ + EXECSPACE, \ + Kokkos::View >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -343,31 +353,31 @@ namespace Impl { reinterpret_cast(X.data()), one, \ R.data()); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) - -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + 
+KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) + +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas From be556c08adb80eaa435217540861480908f2b7e1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 13 Apr 2023 16:16:54 -0600 Subject: [PATCH 282/442] BLAS1 nrminf: adding execution space overload --- blas/impl/KokkosBlas1_nrminf_impl.hpp | 13 +++-- blas/impl/KokkosBlas1_nrminf_spec.hpp | 36 +++++++------ blas/src/KokkosBlas1_nrminf.hpp | 53 ++++++++++++++++--- .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 5 +- 4 files changed, 77 insertions(+), 30 deletions(-) diff --git a/blas/impl/KokkosBlas1_nrminf_impl.hpp b/blas/impl/KokkosBlas1_nrminf_impl.hpp index adbe5feb82..8710454531 100644 --- a/blas/impl/KokkosBlas1_nrminf_impl.hpp +++ b/blas/impl/KokkosBlas1_nrminf_impl.hpp @@ -69,9 +69,8 @@ struct V_NrmInf_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. 
-template -void V_NrmInf_Invoke(const RV& r, const XV& X) { - typedef typename XV::execution_space execution_space; +template +void V_NrmInf_Invoke(const execution_space& space, const RV& r, const XV& X) { typedef Kokkos::ArithTraits AT; const SizeType numRows = static_cast(X.extent(0)); @@ -82,7 +81,7 @@ void V_NrmInf_Invoke(const RV& r, const XV& X) { return; } - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_NrmInf_Functor functor_type; functor_type op(X); @@ -92,12 +91,12 @@ void V_NrmInf_Invoke(const RV& r, const XV& X) { /// \brief Compute the 2-norms (or their square) of the columns of the /// multivector (2-D View) X, and store result(s) in the 1-D View r. -template -void MV_NrmInf_Invoke(const RV& r, const XMV& X) { +template +void MV_NrmInf_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_NrmInf_Invoke(ri, Xi); + V_NrmInf_Invoke(space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_nrminf_spec.hpp b/blas/impl/KokkosBlas1_nrminf_spec.hpp index 69bc0eeb47..5f6c27f9a1 100644 --- a/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrminf_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct nrminf_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrminf_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -69,6 +70,7 @@ struct nrminf_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrminf_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -91,20 +93,20 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool 
eti_spec_avail = nrminf_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrminf_eti_spec_avail::value> struct NrmInf { - static void nrminf(const RMV& R, const XMV& X); + static void nrminf(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of NrmInf for single vectors (1-D Views). -template -struct NrmInf { +template +struct NrmInf { typedef typename XMV::size_type size_type; - static void nrminf(const RMV& R, const XMV& X) { + static void nrminf(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "NrmInf<1-D>: RMV is not a Kokkos::View."); @@ -132,20 +134,20 @@ struct NrmInf { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_NrmInf_Invoke(R, X); + V_NrmInf_Invoke(space, R, X); } else { typedef std::int64_t index_type; - V_NrmInf_Invoke(R, X); + V_NrmInf_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct NrmInf { +template +struct NrmInf { typedef typename XMV::size_type size_type; - static void nrminf(const RV& R, const XMV& X) { + static void nrminf(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "NrmInf<2-D>: RV is not a Kokkos::View."); @@ -175,10 +177,10 @@ struct NrmInf { const size_type numCols = X.extent(1); if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_NrmInf_Invoke(R, X); + MV_NrmInf_Invoke(space, R, X); } else { typedef std::int64_t index_type; - MV_NrmInf_Invoke(R, X); + MV_NrmInf_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -198,6 +200,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct NrmInf< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, 
Kokkos::HostSpace, \ @@ -215,6 +218,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct NrmInf< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -234,6 +238,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct NrmInf< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -253,6 +258,7 @@ struct NrmInf { #define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct NrmInf< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ diff --git a/blas/src/KokkosBlas1_nrminf.hpp b/blas/src/KokkosBlas1_nrminf.hpp index d0f4d25eab..5f4a293d5e 100644 --- a/blas/src/KokkosBlas1_nrminf.hpp +++ b/blas/src/KokkosBlas1_nrminf.hpp @@ -25,15 +25,19 @@ namespace KokkosBlas { /// \brief Return the nrminf of the vector x. /// +/// \tparam execution_space The execution space in which the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance that can specify computing +/// resources to be used, for instance a stream or queue. /// \param x [in] Input 1-D View. /// /// \return The nrminf product result; a single value. 
-template +template ::value, int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrminf(const XVector& x) { +nrminf(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, @@ -58,30 +62,51 @@ nrminf(const XVector& x) { RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::NrmInf::nrminf(R, X); + Impl::NrmInf::nrminf(space, R, X); Kokkos::fence(); return result; } +/// \brief Return the nrminf of the vector x. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrminf product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrminf(const XVector& x) { + return nrminf(typename XVector::execution_space{}, x); +} + /// \brief R(j) = nrminf(X(i,j)) /// /// Replace each entry in R with the nrminfolute value (magnitude) of the /// corresponding entry in X. /// +/// \tparam execution_space, the execution space in which the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. 
-template +template void nrminf( - const RV& R, const XMV& X, + const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrminf: space is not an execution space instance"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "R is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrminf: R is not accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrminf: X is not accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrminf: R is const. " @@ -132,7 +157,23 @@ void nrminf( RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::NrmInf::nrminf(R_internal, X_internal); + Impl::NrmInf::nrminf(space, R_internal, X_internal); +} + +/// \brief R(j) = nrminf(X(i,j)) +/// +/// Replace each entry in R with the nrminfolute value (magnitude) of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. 
+template +void nrminf( + const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrminf(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 54a74cfcf7..c61802a68f 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrminf_tpl_spec_avail { enum : bool { value = false }; }; @@ -34,8 +34,9 @@ namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double #define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ + template \ struct nrminf_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ From f0088ab94679f7a31ebfe4719964f6c0c364682c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 14 Apr 2023 14:49:37 -0600 Subject: [PATCH 283/442] BLAS1: nrminf fix in the TPL layer for execution space overload --- blas/src/KokkosBlas1_nrminf.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/blas/src/KokkosBlas1_nrminf.hpp b/blas/src/KokkosBlas1_nrminf.hpp index 5f4a293d5e..00ef0df0fd 100644 --- a/blas/src/KokkosBlas1_nrminf.hpp +++ b/blas/src/KokkosBlas1_nrminf.hpp @@ -100,8 +100,6 @@ void nrminf( static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "R is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrminf: R is not accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "X is not a Kokkos::View."); From a0d52184df26f54f16d4ce1bc6a23bc566ac432b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 14 Apr 2023 14:57:46 -0600 Subject: [PATCH 284/442] BLAS1: nrm2(_squared) 
updated to have executions_space overload --- blas/impl/KokkosBlas1_nrm2_impl.hpp | 31 +++++------ blas/impl/KokkosBlas1_nrm2_spec.hpp | 40 ++++++++------ blas/src/KokkosBlas1_nrm2.hpp | 53 +++++++++++++++--- blas/src/KokkosBlas1_nrm2_squared.hpp | 54 ++++++++++++++++--- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 6 ++- 5 files changed, 136 insertions(+), 48 deletions(-) diff --git a/blas/impl/KokkosBlas1_nrm2_impl.hpp b/blas/impl/KokkosBlas1_nrm2_impl.hpp index c852447f7b..0805138e9a 100644 --- a/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -35,7 +35,6 @@ namespace Impl { /// \tparam SizeType Index type. Use int (32 bits) if possible. template struct V_Nrm2_Functor { - typedef typename XV::execution_space execution_space; typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; @@ -136,11 +135,10 @@ struct Nrm2_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. -template -void V_Nrm2_Invoke(const RV& r, const XV& X, const bool& take_sqrt) { - typedef typename XV::execution_space execution_space; +template +void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_Nrm2_Functor functor_type; functor_type op(X, take_sqrt); @@ -151,13 +149,12 @@ void V_Nrm2_Invoke(const RV& r, const XV& X, const bool& take_sqrt) { /// multivector (2-D View) X, and store result(s) in the 1-D View r. 
// Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Nrm2_Invoke( - const RV& r, const XV& x, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm2 (rank-2): result vector has wrong length (" @@ -166,38 +163,38 @@ void MV_Nrm2_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for( "KokkosBlas1::Nrm2::S1", pol, Nrm2_MV_Functor(r, x, teamsPerVec)); if (take_sqrt) { Kokkos::parallel_for("KokkosBlas1::Nrm2::Sqrt", - Kokkos::RangePolicy(0, r.extent(0)), + Kokkos::RangePolicy(space, 0, r.extent(0)), TakeSqrtFunctor(r)); } } // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Nrm2_Invoke( - const RV& r, const XV& x, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); - MV_Nrm2_Invoke(tempResult, x, take_sqrt); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + MV_Nrm2_Invoke(space, tempResult, x, take_sqrt); + Kokkos::deep_copy(space, r, tempResult); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm2_spec.hpp b/blas/impl/KokkosBlas1_nrm2_spec.hpp index a8fd6eee5d..eba6e1c05a 100644 --- 
a/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct nrm2_eti_spec_avail { #define KOKKOSBLAS1_NRM2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct nrm2_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -68,6 +69,7 @@ struct nrm2_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm2_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -90,20 +92,20 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrm2_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrm2_eti_spec_avail::value> struct Nrm2 { - static void nrm2(const RMV& R, const XMV& X, const bool& take_sqrt); + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2 for single vectors (1-D Views). 
-template -struct Nrm2 { +template +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const RMV& R, const XMV& X, const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<1-D>: RMV is not a Kokkos::View."); @@ -131,20 +133,20 @@ struct Nrm2 { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2_Invoke(R, X, take_sqrt); + V_Nrm2_Invoke(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(R, X, take_sqrt); + V_Nrm2_Invoke(space, R, X, take_sqrt); } Kokkos::Profiling::popRegion(); } }; -template -struct Nrm2 { +template +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const RV& R, const XMV& X, const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RV& R, const XMV& X, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<2-D>: RV is not a Kokkos::View."); @@ -176,19 +178,19 @@ struct Nrm2 { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2_Invoke(R0, X0, take_sqrt); + V_Nrm2_Invoke(space, R0, X0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(R0, X0, + V_Nrm2_Invoke(space, R0, X0, take_sqrt); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2_Invoke(R, X, take_sqrt); + MV_Nrm2_Invoke(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2_Invoke(R, X, take_sqrt); + MV_Nrm2_Invoke(space, R, X, take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -208,6 +210,7 @@ struct Nrm2 { // #define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, 
Kokkos::HostSpace, \ @@ -224,6 +227,7 @@ struct Nrm2 { // #define KOKKOSBLAS1_NRM2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -243,6 +247,7 @@ struct Nrm2 { #define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ @@ -262,6 +267,7 @@ struct Nrm2 { #define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Nrm2< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, \ diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index b8e12c4f5f..72c0a3f5da 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -31,12 +31,18 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. -template +template ::value, int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2(const XVector& x) { +nrm2(const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2: " "XVector must have rank 1."); @@ -59,11 +65,25 @@ nrm2(const XVector& x) { RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Nrm2::nrm2(R, X, true); + Impl::Nrm2::nrm2(space, R, X, true); Kokkos::fence(); return result; } +/// \brief Return the nrm2 of the vector x. 
+/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrm2 product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2(const XVector& x) { + return nrm2(typename XVector::execution_space{}, x); +} + /// \brief R(i,j) = nrm2(X(i,j)) /// /// Replace each entry in R with the nrm2olute value (magnitude) of the @@ -73,15 +93,21 @@ nrm2(const XVector& x) { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void nrm2(const RV& R, const XMV& X, +template +void nrm2(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2: space is not a Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "R is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: R cannot be accessed from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: X cannot be accessed from execution_space."); static_assert(std::is_same::value, "KokkosBlas::nrm2: R is const. " @@ -128,7 +154,22 @@ void nrm2(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2(R_internal, X_internal, true); + Impl::Nrm2::nrm2(space, R_internal, X_internal, true); +} + +/// \brief R(i,j) = nrm2(X(i,j)) +/// +/// Replace each entry in R with the nrm2olute value (magnitude) of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. 
It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +template +void nrm2(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrm2(typename XMV::execution_space{}, R, X); } /// diff --git a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index 3a584c8a99..bb6fa512b5 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -30,12 +30,19 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. -template +template ::value, int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2_squared(const XVector& x) { +nrm2_squared(const execution_space& space, const XVector& x) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2_squared: execution_space must be a valid" + " Kokkos execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2_squared: " "Both Vector inputs must have rank 1."); @@ -57,11 +64,25 @@ nrm2_squared(const XVector& x) { RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm2::nrm2(R, X, false); + Impl::Nrm2::nrm2(space, R, X, false); Kokkos::fence(); return result; } +/// \brief Return the nrm2 of the vector x. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The nrm2 product result; a single value. 
+template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2_squared(const XVector& x) { + return nrm2_squared(typename XVector::execution_space{}, x); +} + /// \brief R(i,j) = nrm2(X(i,j)) /// /// Replace each entry in R with the nrm2olute value (magnitude) of the @@ -71,16 +92,22 @@ nrm2_squared(const XVector& x) { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template +template void nrm2_squared( - const RV& R, const XMV& X, + const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2_squared: execution_space must be a valid" + " Kokkos execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrm2_squared: R is const. " @@ -126,7 +153,22 @@ void nrm2_squared( RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2(R_internal, X_internal, false); + Impl::Nrm2::nrm2(space, R_internal, X_internal, false); +} + +/// \brief R(i,j) = nrm2(X(i,j)) +/// +/// Replace each entry in R with the nrm2olute value (magnitude) of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. 
+template +void nrm2_squared(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + nrm2_squared(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 0680a72d99..8e0dd05948 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2_tpl_spec_avail { enum : bool { value = false }; }; @@ -35,6 +35,7 @@ namespace Impl { #define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm2_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -60,8 +61,9 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS // double #define KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ + template <> \ struct nrm2_tpl_spec_avail< \ + Kokkos::Cuda, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ From 20463f2a4786c5d831117995472d37c8294cc0e6 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 14 Apr 2023 17:58:06 -0600 Subject: [PATCH 285/442] BLAS1: nrm1/nrm2 update CUBLAS calls Add return code catching and stream execution support --- blas/src/KokkosBlas1_nrm2.hpp | 2 - blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 24 +++- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 118 +++++++++++------- 4 files changed, 92 insertions(+), 54 deletions(-) diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index 72c0a3f5da..722e2fc440 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ 
b/blas/src/KokkosBlas1_nrm2.hpp @@ -101,8 +101,6 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "R is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2: R cannot be accessed from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "X is not a Kokkos::View."); diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 4d8d8c89ba..f6469d29f4 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -232,7 +232,11 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDasum(s.handle, N, X.data(), one, R.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Nrm1::nrm1(space, R, X); \ } \ @@ -269,7 +273,11 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSasum(s.handle, N, X.data(), one, R.data()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Nrm1::nrm1(space, R, X); \ } \ @@ -308,9 +316,13 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasDzasum(s.handle, N, \ reinterpret_cast(X.data()), one, 
\ - R.data()); \ + R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Nrm1::nrm1(space, R, X); \ } \ @@ -349,9 +361,13 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasScasum(s.handle, N, \ reinterpret_cast(X.data()), one, \ - R.data()); \ + R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Nrm1::nrm1(space, R, X); \ } \ diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 8e0dd05948..1d894ad1da 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -68,7 +68,7 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 11a8894ca6..d38e8680f0 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -209,25 +209,25 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Nrm2< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - 
typedef typename XV::size_type size_type; \ + using execution_space = EXECSPACE; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -236,34 +236,40 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDnrm2(s.handle, N, X.data(), int_one, &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDnrm2(s.handle, N, X.data(), int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Nrm2< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { 
\ @@ -272,34 +278,41 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSnrm2(s.handle, N, X.data(), int_one, &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSnrm2(s.handle, N, X.data(), int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 \ + struct Nrm2< \ + EXECSPACE, \ + Kokkos::View >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -309,36 +322,43 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasDznrm2(s.handle, N, \ reinterpret_cast(X.data()), \ - int_one, &R()); \ + int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - 
Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ - struct Nrm2 \ + struct Nrm2< \ + EXECSPACE, \ + Kokkos::View >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -348,35 +368,39 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasScnrm2(s.handle, N, \ - reinterpret_cast(X.data()), int_one, \ - &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasScnrm2(s.handle, N, \ + reinterpret_cast(X.data()), int_one, \ + &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, 
Kokkos::CudaSpace, false) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, false) } // namespace Impl From e36c50e4b2d8d1813eabbacbf1bd2c5fa6e7b355 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 17 Apr 2023 10:17:04 -0600 Subject: [PATCH 286/442] BLAS1: nrm2w adding support for execution space overload --- blas/impl/KokkosBlas1_nrm2w_impl.hpp | 34 ++++---- blas/impl/KokkosBlas1_nrm2w_spec.hpp | 43 +++++----- blas/src/KokkosBlas1_nrm2w.hpp | 74 +++++++++++++--- blas/src/KokkosBlas1_nrm2w_squared.hpp | 85 ++++++++++++++----- .../tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp | 2 +- 5 files changed, 164 insertions(+), 74 deletions(-) diff --git a/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 770846599f..eeafb67e97 100644 --- a/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -134,12 +134,11 @@ struct Nrm2w_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// 
View) X, and store the result in the 0-D View r. -template -void V_Nrm2w_Invoke(const RV& r, const XV& X, const XV& W, +template +void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, const XV& W, const bool& take_sqrt) { - typedef typename XV::execution_space execution_space; const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_Nrm2w_Functor functor_type; functor_type op(X, W, take_sqrt); @@ -150,13 +149,12 @@ void V_Nrm2w_Invoke(const RV& r, const XV& X, const XV& W, /// multivector (2-D View) X, and store result(s) in the 1-D View r. // Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Nrm2w_Invoke( - const RV& r, const XV& x, const XV& w, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; + execution_space, + typename XV::memory_space>::accessible>::type* = nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm2w (rank-2): result vector has wrong length (" @@ -165,39 +163,39 @@ void MV_Nrm2w_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("KokkosBlas1::Nrm2w::S1", pol, Nrm2w_MV_Functor( r, x, w, teamsPerVec)); if (take_sqrt) { Kokkos::parallel_for("KokkosBlas1::Nrm2w::Sqrt", - Kokkos::RangePolicy(0, r.extent(0)), + Kokkos::RangePolicy(space, 0, r.extent(0)), TakeSqrtFunctor(r)); } } // Version for when a 
temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Nrm2w_Invoke( - const RV& r, const XV& x, const XV& w, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, + typename XV::memory_space>::accessible>::type* = nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2w temp result"), r.extent(0)); - MV_Nrm2w_Invoke(tempResult, x, w, + MV_Nrm2w_Invoke(space, tempResult, x, w, take_sqrt); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + Kokkos::deep_copy(space, r, tempResult); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/blas/impl/KokkosBlas1_nrm2w_spec.hpp index b25199c1f5..c24d6d31fa 100644 --- a/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2w_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct nrm2w_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm2w_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -68,6 +69,7 @@ struct nrm2w_eti_spec_avail { MEM_SPACE) \ template <> \ struct nrm2w_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, Kokkos::Device, \ @@ -88,21 +90,21 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrm2w_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = nrm2w_eti_spec_avail::value> struct Nrm2w { - static void nrm2w(const RMV& R, const XMV& X, const XMV& W, + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, const bool& take_sqrt); }; #if 
!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2w for single vectors (1-D Views). -template -struct Nrm2w { - typedef typename XMV::size_type size_type; +template +struct Nrm2w { + using size_type = typename XMV::size_type; - static void nrm2w(const RMV& R, const XMV& X, const XMV& W, + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -131,20 +133,20 @@ struct Nrm2w { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2w_Invoke(R, X, W, take_sqrt); + V_Nrm2w_Invoke(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(R, X, W, take_sqrt); + V_Nrm2w_Invoke(space, R, X, W, take_sqrt); } Kokkos::Profiling::popRegion(); } }; -template -struct Nrm2w { - typedef typename XMV::size_type size_type; +template +struct Nrm2w { + using size_type = typename XMV::size_type; - static void nrm2w(const RV& R, const XMV& X, const XMV& W, + static void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -178,19 +180,19 @@ struct Nrm2w { auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); auto W0 = Kokkos::subview(W, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Nrm2w_Invoke(R0, X0, W0, take_sqrt); + V_Nrm2w_Invoke(space, R0, X0, W0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(R0, X0, W0, + V_Nrm2w_Invoke(space, R0, X0, W0, take_sqrt); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); } } Kokkos::Profiling::popRegion(); @@ -226,6 +228,7 @@ 
struct Nrm2w { // #define KOKKOSBLAS1_NRM2W_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Nrm2w< \ + EXEC_SPACE, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -245,6 +248,7 @@ struct Nrm2w { #define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Nrm2w< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, Kokkos::Device, \ @@ -262,6 +266,7 @@ struct Nrm2w { #define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Nrm2w< \ + EXEC_SPACE, \ Kokkos::View::mag_type*, \ LAYOUT, Kokkos::Device, \ diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index 6a78a49bd2..42d138974f 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -25,46 +25,69 @@ namespace KokkosBlas { /// \brief Return the nrm2w of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the computation +/// will be launched /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance that may specify +/// a stream or queue for the compute kernel execution. /// \param x [in] Input 1-D View. /// \param w [in] /// /// \return The nrm2w product result; a single value. 
-template +template typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2w(const XVector& x, const XVector& w) { +nrm2w(const execution_space& space, const XVector& x, const XVector& w, + typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type; - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > - XVector_Internal; + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > - RVector_Internal; + using RVector_Internal = Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w(R, X, W, true); + Impl::Nrm2w::nrm2w(space, R, X, W, true); Kokkos::fence(); return result; } +/// \brief Return the nrm2w of the vector x. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// \param w [in] +/// +/// \return The nrm2w product result; a single value. 
+template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2w(const XVector& x, const XVector& w) { + return nrm2w(typename XVector::execution_space{}, x, w); +} + /// \brief R(i,j) = nrm2w(X(i,j)) /// /// Replace each entry in R with the nrm2w, absolute value (magnitude), of the @@ -74,15 +97,20 @@ nrm2w(const XVector& x, const XVector& w) { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void nrm2w(const RV& R, const XMV& X, const XMV& W, +template +void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrm2w: R is const. " @@ -130,9 +158,27 @@ void nrm2w(const RV& R, const XMV& X, const XMV& W, XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w(R_internal, X_internal, + Impl::Nrm2w::nrm2w(space, R_internal, X_internal, W_internal, true); } + + + +/// \brief R(i,j) = nrm2w(X(i,j)) +/// +/// Replace each entry in R with the nrm2w, absolute value (magnitude), of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. 
+template +void nrm2w(const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { + nrm2w(typename XMV::execution_space{}, R, X, W); +} + } // namespace KokkosBlas #endif // KOKKOSBLAS1_NRM2W_HPP_ diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index 0a5f29011d..d777c27a2c 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -25,47 +25,69 @@ namespace KokkosBlas { /// \brief Return the nrm2w of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the computation +/// will be launched /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance that may specify /// \param x [in] Input 1-D View. /// \param w [in] Input weights (1-D View). /// /// \return The nrm2w product result; a single value. -template +template ::value, int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2w_squared(const XVector& x, const XVector& w) { +nrm2w_squared(const execution_space& space, const XVector& x, const XVector& w) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w_squared: execution_space must be a valid " + "Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: XVector must be a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w_squared: " "Both Vector inputs must have rank 1."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XVector::non_const_value_type>::mag_type mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type; - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< 
typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > - XVector_Internal; + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > - RVector_Internal; + using RVector_Internal = Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w(R, X, W, false); + Impl::Nrm2w::nrm2w(space, R, X, W, false); Kokkos::fence(); return result; } +/// \brief Return the nrm2w of the vector x. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// \param w [in] Input weights (1-D View). +/// +/// \return The nrm2w product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::mag_type +nrm2w_squared(const XVector& x, const XVector& w) { + return nrm2w_squared(typename XVector::execution_space(), x, w); +} + /// \brief R(i,j) = nrm2w(X(i,j)) /// /// Replace each entry in R with the nrm2wolute value (magnitude) of the @@ -75,16 +97,21 @@ nrm2w_squared(const XVector& x, const XVector& w) { /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. 
-template +template void nrm2w_squared( - const RV& R, const XMV& X, const XMV& W, + const execution_space& space, const RV& R, const XMV& X, const XMV& W, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space::value, + "KokkosBlas::nrm2w_squared: execution_space must be a valid " + "Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "X is not a Kokkos::View."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::nrm2w_squared: R is const. " @@ -94,8 +121,8 @@ void nrm2w_squared( ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm2w_squared: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename XMV::non_const_value_type>::mag_type mag_type; + using mag_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename XMV::non_const_value_type>::mag_type; static_assert(std::is_same::value, "KokkosBlas::nrm2w: R must have the magnitude type of" "the xvectors value_type it is an output argument " @@ -118,22 +145,36 @@ void nrm2w_squared( // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View > - RV_Internal; - typedef Kokkos::View >; + using XMV_Internal = Kokkos::View > - XMV_Internal; + Kokkos::MemoryTraits >; RV_Internal R_internal = R; XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w(R_internal, X_internal, + Impl::Nrm2w::nrm2w(space, R_internal, X_internal, W_internal, false); } + +/// \brief R(i,j) = nrm2w(X(i,j)) +/// +/// Replace each entry in R with the nrm2wolute value (magnitude) of the +/// corresponding entry in X. +/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. 
+/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +template +void nrm2w_squared( + const RV& R, const XMV& X, const XMV& W, + typename std::enable_if::value, int>::type = 0) { + nrm2w_squared(typename XMV::execution_space{}, R, X, W); +} } // namespace KokkosBlas #endif // KOKKOSBLAS1_NRM2W_HPP_ diff --git a/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp index 7613190645..8a45b46521 100644 --- a/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2w_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct nrm2w_tpl_spec_avail { enum : bool { value = false }; }; From f830165891ded64b67a1aca2efeda75110c834a2 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 17 Apr 2023 11:55:26 -0600 Subject: [PATCH 287/442] BLAS1: applying clang-format BLAS1: more clang-format BLAS1: fix issue with temp view running out of scope in nrm1 BLAS1: adding execution space support for iamax and sum kernels BLAS1: adding execution space support to the dot kernel BLAS1: fixing some of the synchronization in norms BLAS1: adding fix for iamax synchronization BLAS1: fixing issues in TPL layer for norms execution space overload BLAS1: improving documentation for reduction based BLAS1 functions. 
--- blas/impl/KokkosBlas1_dot_impl.hpp | 9 +- blas/impl/KokkosBlas1_dot_mv_impl.hpp | 29 +- blas/impl/KokkosBlas1_dot_spec.hpp | 104 +++-- blas/impl/KokkosBlas1_iamax_impl.hpp | 16 +- blas/impl/KokkosBlas1_iamax_spec.hpp | 46 +- blas/impl/KokkosBlas1_nrm1_impl.hpp | 32 +- blas/impl/KokkosBlas1_nrm1_spec.hpp | 19 +- blas/impl/KokkosBlas1_nrm2_impl.hpp | 27 +- blas/impl/KokkosBlas1_nrm2_spec.hpp | 35 +- blas/impl/KokkosBlas1_nrm2w_impl.hpp | 29 +- blas/impl/KokkosBlas1_nrm2w_spec.hpp | 40 +- blas/impl/KokkosBlas1_nrminf_impl.hpp | 3 +- blas/impl/KokkosBlas1_nrminf_spec.hpp | 16 +- blas/impl/KokkosBlas1_sum_impl.hpp | 35 +- blas/impl/KokkosBlas1_sum_spec.hpp | 47 +- blas/src/KokkosBlas1_dot.hpp | 120 +++++- blas/src/KokkosBlas1_iamax.hpp | 77 +++- blas/src/KokkosBlas1_nrm1.hpp | 40 +- blas/src/KokkosBlas1_nrm2.hpp | 52 ++- blas/src/KokkosBlas1_nrm2_squared.hpp | 57 ++- blas/src/KokkosBlas1_nrm2w.hpp | 61 ++- blas/src/KokkosBlas1_nrm2w_squared.hpp | 75 +++- blas/src/KokkosBlas1_nrminf.hpp | 22 +- blas/src/KokkosBlas1_sum.hpp | 89 +++- blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 25 +- blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp | 308 ++++++------- .../tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp | 83 ++-- blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp | 117 ++--- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 9 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 406 +++++++++--------- blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 3 +- blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp | 272 ++++++------ .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 2 +- .../tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp | 108 ++--- blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp | 2 +- 35 files changed, 1459 insertions(+), 956 deletions(-) diff --git a/blas/impl/KokkosBlas1_dot_impl.hpp b/blas/impl/KokkosBlas1_dot_impl.hpp index 56d964ca07..2003f7cc2c 100644 --- a/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_impl.hpp @@ -30,9 +30,9 @@ namespace Impl { /// \tparam YVector Type of 
the second vector y; 1-D View /// \tparam SizeType Type of the row index used in the dot product. /// For best performance, use int instead of size_t here. -template +template struct DotFunctor { - typedef typename XVector::execution_space execution_space; typedef SizeType size_type; typedef typename AV::non_const_value_type avalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; @@ -43,8 +43,9 @@ struct DotFunctor { DotFunctor(const XVector& x, const YVector& y) : m_x(x), m_y(y) {} - void run(const char* label, AV result) { - Kokkos::RangePolicy policy(0, m_x.extent(0)); + void run(const char* label, const execution_space& space, AV result) { + Kokkos::RangePolicy policy(space, 0, + m_x.extent(0)); Kokkos::parallel_reduce(label, policy, *this, result); } diff --git a/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/blas/impl/KokkosBlas1_dot_mv_impl.hpp index 9dda766b03..c1abcafcc6 100644 --- a/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -72,14 +72,13 @@ struct Dot_MV_Functor { // Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Dot_Invoke( - const RV& r, const XV& x, const YV& y, + const execution_space& space, const RV& r, const XV& x, const YV& y, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; - size_type numDots = std::max(x.extent(1), y.extent(1)); + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { + size_type numDots = std::max(x.extent(1), y.extent(1)); if (x.extent(0) != y.extent(0)) { std::ostringstream oss; oss << "KokkosBlas::dot (rank-2): x and y have different lengths (" @@ -103,14 +102,13 @@ void MV_Dot_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, - Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerDot; KokkosBlas::Impl::multipleReductionWorkDistribution( 
x.extent(0), numDots, teamsPerDot); size_type numTeams = numDots * teamsPerDot; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("Dot_MV", pol, Dot_MV_Functor( r, x, y, teamsPerDot)); @@ -118,18 +116,19 @@ void MV_Dot_Invoke( // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Dot_Invoke( - const RV& r, const XV& x, const YV& y, + const execution_space& space, const RV& r, const XV& x, const YV& y, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), r.extent(0)); - MV_Dot_Invoke(tempResult, x, y); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + MV_Dot_Invoke( + space, tempResult, x, y); + Kokkos::deep_copy(space, r, tempResult); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 430f357a36..e56e6d61ef 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -61,8 +61,8 @@ struct HasSpecialAccumulator { }; // Specialization struct which defines whether a specialization exists -template +template struct dot_eti_spec_avail { enum : bool { value = false }; }; @@ -78,6 +78,7 @@ struct dot_eti_spec_avail { #define KOKKOSBLAS1_DOT_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View>, \ Kokkos::View \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -129,6 +132,7 @@ struct dot_eti_spec_avail { }; \ template <> \ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -144,6 +148,7 @@ struct dot_eti_spec_avail { }; \ template <> 
\ struct dot_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -167,17 +172,21 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = dot_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + dot_eti_spec_avail::value> struct Dot { - static void dot(const RV&, const XV& R, const YV& X); + static void dot(const execution_space& space, const RV&, const XV& R, + const YV& X); }; // This version never has TPL support, but it does use the same ETI system -template ::value> +template ::value> struct DotSpecialAccumulator { // Note: not doing the static_asserts to validate RV, XV, YV since those // errors would have already arisen when building the library. @@ -191,15 +200,17 @@ struct DotSpecialAccumulator { typename RV::device_type, Kokkos::MemoryTraits>; - static void dot(const RV_Result& R, const XV& X, const YV& Y); + static void dot(const execution_space& space, const RV_Result& R, const XV& X, + const YV& Y); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Dot for single vectors (1-D Views). // The rank-1 case is currently the only one that may use a different // accumulator type than InnerProductSpaceTraits::dot_type. -template -struct Dot { +template +struct Dot { // Check some things about the template parameters at compile time to get nice // error messages, before using them under the assumption they are valid. static_assert(Kokkos::is_view::value, @@ -237,7 +248,8 @@ struct Dot { Kokkos::MemoryTraits> RV_Result; - static void dot(const RV& R, const XV& X, const YV& Y) { + static void dot(const execution_space& space, const RV& R, const XV& X, + const YV& Y) { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::dot[ETI]" : "KokkosBlas::dot[noETI]"); @@ -254,12 +266,12 @@ struct Dot { if (numElems < static_cast(INT_MAX)) { typedef int index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } else { typedef int64_t index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } Kokkos::Profiling::popRegion(); } @@ -269,8 +281,9 @@ struct Dot { // uses DotAccumulatingScalar for the result view. // // Is never supported by TPLs, but uses the same dot_eti_spec_avail::value. -template -struct DotSpecialAccumulator { +template +struct DotSpecialAccumulator { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: XV is not a Kokkos::View."); @@ -306,7 +319,8 @@ struct DotSpecialAccumulator { typename RV::device_type, Kokkos::MemoryTraits>; - static void dot(const RV_Result& R, const XV& X, const YV& Y) { + static void dot(const execution_space& space, const RV_Result& R, const XV& X, + const YV& Y) { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::dot[ETI]" : "KokkosBlas::dot[noETI]"); @@ -323,19 +337,20 @@ struct DotSpecialAccumulator { if (numElems < static_cast(INT_MAX)) { typedef int index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } else { typedef int64_t index_type; - DotFunctor f(X, Y); - f.run("KokkosBlas::dot<1D>", R); + DotFunctor f(X, Y); + f.run("KokkosBlas::dot<1D>", space, R); } Kokkos::Profiling::popRegion(); } }; -template -struct Dot +struct Dot { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" @@ -363,7 +378,8 @@ struct Dot(INT_MAX)) { typedef int index_type; - DotFunctor f(X0, - Y0); - f.run("KokkosBlas::dot<1D>", R0); + DotFunctor + f(X0, Y0); + f.run("KokkosBlas::dot<1D>", space, R0); } else { typedef int64_t index_type; - DotFunctor f(X0, - Y0); - f.run("KokkosBlas::dot<1D>", R0); + DotFunctor + f(X0, Y0); + f.run("KokkosBlas::dot<1D>", space, R0); } } else { if (numRows < static_cast(INT_MAX) && numRows * numDots < static_cast(INT_MAX)) { typedef int index_type; - MV_Dot_Invoke(R, X, Y); + MV_Dot_Invoke(space, R, X, Y); } else { typedef std::int64_t index_type; - MV_Dot_Invoke(R, X, Y); + MV_Dot_Invoke(space, R, X, Y); } } Kokkos::Profiling::popRegion(); @@ -421,6 +439,7 @@ struct Dot>, \ Kokkos::View>, \ 1, 1, false, true>; \ extern template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ 1, 1, false, true>; \ extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ true>; \ extern template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View>, \ Kokkos::View; #define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Dot>, \ Kokkos::View, \ @@ -472,6 +495,7 @@ struct Dot>, \ 1, 1, false, true>; \ template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ 1, 1, false, true>; \ template struct 
DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View>, \ Kokkos::View>, \ true>; \ template struct DotSpecialAccumulator< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ @@ -523,6 +550,7 @@ struct Dot>, \ 2, 2, false, true>; \ extern template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -535,6 +563,7 @@ struct Dot>, \ 2, 1, false, true>; \ extern template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -550,6 +579,7 @@ struct Dot, \ @@ -562,6 +592,7 @@ struct Dot>, \ 2, 2, false, true>; \ template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -574,6 +605,7 @@ struct Dot>, \ 2, 1, false, true>; \ template struct Dot< \ + EXEC_SPACE, \ Kokkos::View, \ diff --git a/blas/impl/KokkosBlas1_iamax_impl.hpp b/blas/impl/KokkosBlas1_iamax_impl.hpp index 369084aacb..4c7a3fcc0c 100644 --- a/blas/impl/KokkosBlas1_iamax_impl.hpp +++ b/blas/impl/KokkosBlas1_iamax_impl.hpp @@ -79,9 +79,8 @@ struct V_Iamax_Functor { /// \brief Find the index of the element with the maximum magnitude of the /// single vector (1-D /// View) X, and store the result in the 0-D View r. -template -void V_Iamax_Invoke(const RV& r, const XV& X) { - using execution_space = typename XV::execution_space; +template +void V_Iamax_Invoke(const execution_space& space, const RV& r, const XV& X) { using AT = Kokkos::ArithTraits; using mag_type = typename AT::mag_type; @@ -89,11 +88,11 @@ void V_Iamax_Invoke(const RV& r, const XV& X) { // Avoid MaxLoc Reduction if this is a zero length view if (numRows == 0) { - Kokkos::deep_copy(r, 0); + Kokkos::deep_copy(space, r, 0); return; } - Kokkos::RangePolicy policy(1, numRows + 1); + Kokkos::RangePolicy policy(space, 1, numRows + 1); using functor_type = V_Iamax_Functor; functor_type op(X); @@ -103,12 +102,13 @@ void V_Iamax_Invoke(const RV& r, const XV& X) { /// \brief Find the index of the element with the maximum magnitude of the /// columns of the /// multivector (2-D View) X, and store result(s) in the 1-D View r. 
-template -void MV_Iamax_Invoke(const RV& r, const XMV& X) { +template +void MV_Iamax_Invoke(const execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_Iamax_Invoke(ri, Xi); + V_Iamax_Invoke( + space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_iamax_spec.hpp b/blas/impl/KokkosBlas1_iamax_spec.hpp index 57d0056e92..461625df67 100644 --- a/blas/impl/KokkosBlas1_iamax_spec.hpp +++ b/blas/impl/KokkosBlas1_iamax_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct iamax_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct iamax_eti_spec_avail { EXEC_SPACE, MEM_SPACE) \ template <> \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View \ struct iamax_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View::value, - bool eti_spec_avail = iamax_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + iamax_eti_spec_avail::value> struct Iamax { - static void iamax(const RMV& R, const XMV& X); + static void iamax(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Iamax for single vectors (1-D Views). 
-template -struct Iamax { +template +struct Iamax { typedef typename XMV::size_type size_type; - static void iamax(const RMV& R, const XMV& X) { + static void iamax(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Iamax<1-D>: RMV is not a Kokkos::View."); @@ -164,20 +171,21 @@ struct Iamax { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Iamax_Invoke(R, X); + V_Iamax_Invoke(space, R, X); } else { typedef std::int64_t index_type; - V_Iamax_Invoke(R, X); + V_Iamax_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Iamax { +template +struct Iamax { typedef typename XMV::size_type size_type; - static void iamax(const RV& R, const XMV& X) { + static void iamax(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Iamax<2-D>: RV is not a Kokkos::View."); @@ -207,10 +215,10 @@ struct Iamax { const size_type numCols = X.extent(1); if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Iamax_Invoke(R, X); + MV_Iamax_Invoke(space, R, X); } else { typedef std::int64_t index_type; - MV_Iamax_Invoke(R, X); + MV_Iamax_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } @@ -230,6 +238,7 @@ struct Iamax { #define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 1, false, true>; \ extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_IAMAX_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 1, false, true>; \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define 
KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 2, false, true>; \ extern template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View { #define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_INST_INDEX(INDEX_TYPE, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { Kokkos::MemoryTraits >, \ 2, false, true>; \ template struct Iamax< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View struct V_Nrm1_Functor { - using size_type = SizeType; + using size_type = SizeType; using xvalue_type = typename XV::non_const_value_type; - using XAT = Kokkos::ArithTraits; - using value_type = typename XAT::mag_type; - using MAT = Kokkos::ArithTraits; + using XAT = Kokkos::ArithTraits; + using value_type = typename XAT::mag_type; + using MAT = Kokkos::ArithTraits; typename XV::const_type m_x; @@ -71,9 +71,9 @@ template struct Nrm1_MV_Functor { using rvalue_type = typename RV::non_const_value_type; using xvalue_type = typename XV::non_const_value_type; - using XAT = Kokkos::ArithTraits; - using value_type = typename XAT::mag_type; - using MAT = Kokkos::ArithTraits; + using XAT = Kokkos::ArithTraits; + using value_type = typename XAT::mag_type; + using MAT = Kokkos::ArithTraits; using TeamMem = typename Kokkos::TeamPolicy::member_type; @@ -129,8 +129,8 @@ template void MV_Nrm1_Invoke( const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm1 (rank-2): result vector has wrong length (" @@ -139,8 +139,7 @@ void MV_Nrm1_Invoke( } // Zero out the result vector Kokkos::deep_copy( - space, r, - 
Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -158,14 +157,19 @@ template void MV_Nrm1_Invoke( const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); - MV_Nrm1_Invoke(space, tempResult, x); + MV_Nrm1_Invoke( + space, tempResult, x); Kokkos::deep_copy(space, r, tempResult); + // Fence needed to ensure that the deep_copy + // above finishes before we exit this function + // and tempResult runs out of scope... + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm1_spec.hpp b/blas/impl/KokkosBlas1_nrm1_spec.hpp index f94ad3a2f5..f35a341787 100644 --- a/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -92,9 +92,10 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrm1_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = XMV::rank, + bool tpl_spec_avail = nrm1_tpl_spec_avail::value, + bool eti_spec_avail = nrm1_eti_spec_avail::value> struct Nrm1 { static void nrm1(const execution_space& space, const RMV& R, const XMV& X); }; @@ -102,7 +103,8 @@ struct Nrm1 { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm1 for single vectors (1-D Views). 
template -struct Nrm1 { +struct Nrm1 { using size_type = typename XMV::size_type; static void nrm1(const execution_space& space, const RMV& R, const XMV& X) { @@ -143,7 +145,8 @@ struct Nrm1 -struct Nrm1 { +struct Nrm1 { using size_type = typename XMV::size_type; static void nrm1(const execution_space& space, const RV& R, const XMV& X) { @@ -178,10 +181,12 @@ struct Nrm1(INT_MAX)) { - V_Nrm1_Invoke(space, R0, X0); + V_Nrm1_Invoke(space, + R0, X0); } else { typedef std::int64_t index_type; - V_Nrm1_Invoke(space, R0, X0); + V_Nrm1_Invoke( + space, R0, X0); } } else { if (numRows < static_cast(INT_MAX) && diff --git a/blas/impl/KokkosBlas1_nrm2_impl.hpp b/blas/impl/KokkosBlas1_nrm2_impl.hpp index 0805138e9a..276023c171 100644 --- a/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -136,11 +136,12 @@ struct Nrm2_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. template -void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, const bool& take_sqrt) { +void V_Nrm2_Invoke(const execution_space& space, const RV& r, const XV& X, + const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); Kokkos::RangePolicy policy(space, 0, numRows); - typedef V_Nrm2_Functor functor_type; + using functor_type = V_Nrm2_Functor; functor_type op(X, take_sqrt); Kokkos::parallel_reduce("KokkosBlas::Nrm2::S0", policy, op, r); } @@ -153,8 +154,8 @@ template void MV_Nrm2_Invoke( const execution_space& space, const RV& r, const XV& x, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm2 (rank-2): result vector has wrong length (" @@ -163,8 +164,7 @@ void MV_Nrm2_Invoke( } // Zero out the result vector Kokkos::deep_copy( - space, r, - Kokkos::ArithTraits::zero()); + 
space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -175,9 +175,10 @@ void MV_Nrm2_Invoke( "KokkosBlas1::Nrm2::S1", pol, Nrm2_MV_Functor(r, x, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for("KokkosBlas1::Nrm2::Sqrt", - Kokkos::RangePolicy(space, 0, r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for( + "KokkosBlas1::Nrm2::Sqrt", + Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } @@ -187,14 +188,16 @@ template void MV_Nrm2_Invoke( const execution_space& space, const RV& r, const XV& x, bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); - MV_Nrm2_Invoke(space, tempResult, x, take_sqrt); + MV_Nrm2_Invoke( + space, tempResult, x, take_sqrt); Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm2_spec.hpp b/blas/impl/KokkosBlas1_nrm2_spec.hpp index eba6e1c05a..0a258e00f4 100644 --- a/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -92,20 +92,24 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = nrm2_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = XMV::rank, + bool tpl_spec_avail = nrm2_tpl_spec_avail::value, + bool eti_spec_avail = nrm2_eti_spec_avail::value> struct Nrm2 { - static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt); + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, + const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2 for single vectors (1-D Views). 
template -struct Nrm2 { +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const execution_space& space, const RMV& R, const XMV& X, const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RMV& R, const XMV& X, + const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<1-D>: RMV is not a Kokkos::View."); @@ -136,17 +140,20 @@ struct Nrm2(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(space, R, X, take_sqrt); + V_Nrm2_Invoke(space, R, X, + take_sqrt); } Kokkos::Profiling::popRegion(); } }; template -struct Nrm2 { +struct Nrm2 { typedef typename XMV::size_type size_type; - static void nrm2(const execution_space& space, const RV& R, const XMV& X, const bool& take_sqrt) { + static void nrm2(const execution_space& space, const RV& R, const XMV& X, + const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2<2-D>: RV is not a Kokkos::View."); @@ -178,11 +185,12 @@ struct Nrm2(INT_MAX)) { - V_Nrm2_Invoke(space, R0, X0, take_sqrt); + V_Nrm2_Invoke( + space, R0, X0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2_Invoke(space, R0, X0, - take_sqrt); + V_Nrm2_Invoke( + space, R0, X0, take_sqrt); } } else { if (numRows < static_cast(INT_MAX) && @@ -190,7 +198,8 @@ struct Nrm2(space, R, X, take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2_Invoke(space, R, X, take_sqrt); + MV_Nrm2_Invoke(space, R, X, + take_sqrt); } } Kokkos::Profiling::popRegion(); diff --git a/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/blas/impl/KokkosBlas1_nrm2w_impl.hpp index eeafb67e97..fb9b1f7858 100644 --- a/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -135,8 +135,8 @@ struct Nrm2w_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. 
template -void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, const XV& W, - const bool& take_sqrt) { +void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, + const XV& W, const bool& take_sqrt) { const SizeType numRows = static_cast(X.extent(0)); Kokkos::RangePolicy policy(space, 0, numRows); @@ -151,10 +151,11 @@ void V_Nrm2w_Invoke(const execution_space& space, const RV& r, const XV& X, cons // be computed in-place template void MV_Nrm2w_Invoke( - const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, const XV& w, + bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename XV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::nrm2w (rank-2): result vector has wrong length (" @@ -163,8 +164,7 @@ void MV_Nrm2w_Invoke( } // Zero out the result vector Kokkos::deep_copy( - space, r, - Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -175,9 +175,10 @@ void MV_Nrm2w_Invoke( Nrm2w_MV_Functor( r, x, w, teamsPerVec)); if (take_sqrt) { - Kokkos::parallel_for("KokkosBlas1::Nrm2w::Sqrt", - Kokkos::RangePolicy(space, 0, r.extent(0)), - TakeSqrtFunctor(r)); + Kokkos::parallel_for( + "KokkosBlas1::Nrm2w::Sqrt", + Kokkos::RangePolicy(space, 0, r.extent(0)), + TakeSqrtFunctor(r)); } } @@ -185,10 +186,11 @@ void MV_Nrm2w_Invoke( // the other version) template void MV_Nrm2w_Invoke( - const execution_space& space, const RV& r, const XV& x, const XV& w, bool take_sqrt, + const execution_space& space, const RV& r, const XV& x, const XV& w, + bool take_sqrt, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename XV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2w temp result"), @@ -196,6 +198,7 @@ void MV_Nrm2w_Invoke( MV_Nrm2w_Invoke(space, tempResult, x, w, take_sqrt); Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/blas/impl/KokkosBlas1_nrm2w_spec.hpp index c24d6d31fa..c26d8bf004 100644 --- a/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -91,21 +91,24 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = nrm2w_eti_spec_avail::value> + bool tpl_spec_avail = + nrm2w_tpl_spec_avail::value, + bool eti_spec_avail = + nrm2w_eti_spec_avail::value> struct Nrm2w { - static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, - const bool& take_sqrt); + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, + const XMV& W, const bool& take_sqrt); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of Nrm2w for single vectors (1-D Views). 
template -struct Nrm2w { +struct Nrm2w { using size_type = typename XMV::size_type; - static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, const XMV& W, - const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RMV& R, const XMV& X, + const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<1-D>: RMV is not a Kokkos::View."); @@ -136,18 +139,20 @@ struct Nrm2w(space, R, X, W, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(space, R, X, W, take_sqrt); + V_Nrm2w_Invoke(space, R, X, W, + take_sqrt); } Kokkos::Profiling::popRegion(); } }; template -struct Nrm2w { +struct Nrm2w { using size_type = typename XMV::size_type; - static void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W, - const bool& take_sqrt) { + static void nrm2w(const execution_space& space, const RV& R, const XMV& X, + const XMV& W, const bool& take_sqrt) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Nrm2w<2-D>: RV is not a Kokkos::View."); @@ -180,19 +185,22 @@ struct Nrm2w(INT_MAX)) { - V_Nrm2w_Invoke(space, R0, X0, W0, take_sqrt); + V_Nrm2w_Invoke( + space, R0, X0, W0, take_sqrt); } else { typedef std::int64_t index_type; - V_Nrm2w_Invoke(space, R0, X0, W0, - take_sqrt); + V_Nrm2w_Invoke( + space, R0, X0, W0, take_sqrt); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, + take_sqrt); } else { typedef std::int64_t index_type; - MV_Nrm2w_Invoke(space, R, X, W, take_sqrt); + MV_Nrm2w_Invoke(space, R, X, W, + take_sqrt); } } Kokkos::Profiling::popRegion(); diff --git a/blas/impl/KokkosBlas1_nrminf_impl.hpp b/blas/impl/KokkosBlas1_nrminf_impl.hpp index 8710454531..b8431ac8ea 100644 --- a/blas/impl/KokkosBlas1_nrminf_impl.hpp +++ b/blas/impl/KokkosBlas1_nrminf_impl.hpp @@ -96,7 +96,8 @@ void MV_NrmInf_Invoke(const 
execution_space& space, const RV& r, const XMV& X) { for (size_t i = 0; i < X.extent(1); i++) { auto ri = Kokkos::subview(r, i); auto Xi = Kokkos::subview(X, Kokkos::ALL(), i); - V_NrmInf_Invoke(space, ri, Xi); + V_NrmInf_Invoke( + space, ri, Xi); } } diff --git a/blas/impl/KokkosBlas1_nrminf_spec.hpp b/blas/impl/KokkosBlas1_nrminf_spec.hpp index 5f6c27f9a1..4b39408986 100644 --- a/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -94,8 +94,10 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = nrminf_eti_spec_avail::value> + bool tpl_spec_avail = + nrminf_tpl_spec_avail::value, + bool eti_spec_avail = + nrminf_eti_spec_avail::value> struct NrmInf { static void nrminf(const execution_space& space, const RMV& R, const XMV& X); }; @@ -103,7 +105,8 @@ struct NrmInf { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of NrmInf for single vectors (1-D Views). template -struct NrmInf { +struct NrmInf { typedef typename XMV::size_type size_type; static void nrminf(const execution_space& space, const RMV& R, const XMV& X) { @@ -144,7 +147,8 @@ struct NrmInf -struct NrmInf { +struct NrmInf { typedef typename XMV::size_type size_type; static void nrminf(const execution_space& space, const RV& R, const XMV& X) { @@ -238,7 +242,7 @@ struct NrmInf::mag_type*, \ LAYOUT, \ @@ -258,7 +262,7 @@ struct NrmInf::mag_type*, \ LAYOUT, \ diff --git a/blas/impl/KokkosBlas1_sum_impl.hpp b/blas/impl/KokkosBlas1_sum_impl.hpp index 20f88e6845..df08d42069 100644 --- a/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/blas/impl/KokkosBlas1_sum_impl.hpp @@ -36,7 +36,6 @@ namespace Impl { /// \tparam SizeType Index type. Use int (32 bits) if possible. 
template struct V_Sum_Functor { - typedef typename XV::execution_space execution_space; typedef SizeType size_type; typedef typename XV::non_const_value_type xvalue_type; typedef Kokkos::Details::InnerProductSpaceTraits IPT; @@ -104,11 +103,10 @@ struct Sum_MV_Functor { /// \brief Compute the 2-norm (or its square) of the single vector (1-D /// View) X, and store the result in the 0-D View r. -template -void V_Sum_Invoke(const RV& r, const XV& X) { - typedef typename XV::execution_space execution_space; +template +void V_Sum_Invoke(const execution_space& space, const RV& r, const XV& X) { const SizeType numRows = static_cast(X.extent(0)); - Kokkos::RangePolicy policy(0, numRows); + Kokkos::RangePolicy policy(space, 0, numRows); typedef V_Sum_Functor functor_type; functor_type op(X); @@ -119,13 +117,12 @@ void V_Sum_Invoke(const RV& r, const XV& X) { /// multivector (2-D View) X, and store result(s) in the 1-D View r. // Main version: the result view is accessible from execution space, so it can // be computed in-place -template +template void MV_Sum_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { - using execution_space = typename XV::execution_space; + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { if (r.extent(0) != x.extent(1)) { std::ostringstream oss; oss << "KokkosBlas::Sum (rank-2): result vector has wrong length (" @@ -134,14 +131,13 @@ void MV_Sum_Invoke( } // Zero out the result vector Kokkos::deep_copy( - execution_space(), r, - Kokkos::ArithTraits::zero()); + space, r, Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( x.extent(0), x.extent(1), teamsPerVec); size_type numTeams = x.extent(1) * teamsPerVec; - Kokkos::TeamPolicy pol(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy pol(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for( "KokkosBlas1::Sum::S1", pol, 
Sum_MV_Functor(r, x, teamsPerVec)); @@ -149,18 +145,19 @@ void MV_Sum_Invoke( // Version for when a temporary result view is needed (implemented in terms of // the other version) -template +template void MV_Sum_Invoke( - const RV& r, const XV& x, + const execution_space& space, const RV& r, const XV& x, typename std::enable_if::accessible>::type* = nullptr) { + execution_space, typename RV::memory_space>::accessible>::type* = + nullptr) { Kokkos::View tempResult( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), r.extent(0)); - MV_Sum_Invoke(tempResult, x); - Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); + MV_Sum_Invoke( + space, tempResult, x); + Kokkos::deep_copy(space, r, tempResult); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_sum_spec.hpp b/blas/impl/KokkosBlas1_sum_spec.hpp index db1771de8f..83fa6fc1d1 100644 --- a/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/blas/impl/KokkosBlas1_sum_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sum_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct sum_eti_spec_avail { #define KOKKOSBLAS1_SUM_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct sum_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View \ struct sum_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -86,20 +88,22 @@ namespace KokkosBlas { namespace Impl { // Unification layer -template ::value, - bool eti_spec_avail = sum_eti_spec_avail::value> +template < + class execution_space, class RMV, class XMV, int rank = XMV::rank, + bool tpl_spec_avail = sum_tpl_spec_avail::value, + bool eti_spec_avail = sum_eti_spec_avail::value> struct Sum { - static void sum(const RMV& R, const XMV& X); + static void sum(const execution_space& space, const RMV& R, const XMV& X); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY 
//! Full specialization of Sum for single vectors (1-D Views). -template -struct Sum { +template +struct Sum { typedef typename XMV::size_type size_type; - static void sum(const RMV& R, const XMV& X) { + static void sum(const execution_space& space, const RMV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Sum<1-D>: RMV is not a Kokkos::View."); @@ -128,20 +132,21 @@ struct Sum { const size_type numRows = X.extent(0); if (numRows < static_cast(INT_MAX)) { - V_Sum_Invoke(R, X); + V_Sum_Invoke(space, R, X); } else { typedef std::int64_t index_type; - V_Sum_Invoke(R, X); + V_Sum_Invoke(space, R, X); } Kokkos::Profiling::popRegion(); } }; -template -struct Sum { +template +struct Sum { typedef typename XMV::size_type size_type; - static void sum(const RV& R, const XMV& X) { + static void sum(const execution_space& space, const RV& R, const XMV& X) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Sum<2-D>: RV is not a Kokkos::View."); @@ -173,18 +178,20 @@ struct Sum { auto R0 = Kokkos::subview(R, 0); auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); if (numRows < static_cast(INT_MAX)) { - V_Sum_Invoke(R0, X0); + V_Sum_Invoke(space, + R0, X0); } else { typedef std::int64_t index_type; - V_Sum_Invoke(R0, X0); + V_Sum_Invoke( + space, R0, X0); } } else { if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - MV_Sum_Invoke(R, X); + MV_Sum_Invoke(space, R, X); } else { typedef std::int64_t index_type; - MV_Sum_Invoke(R, X); + MV_Sum_Invoke(space, R, X); } } Kokkos::Profiling::popRegion(); @@ -204,6 +211,7 @@ struct Sum { // #define KOKKOSBLAS1_SUM_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ extern template struct Sum< \ + EXEC_SPACE, \ Kokkos::View >, \ Kokkos::View { // use this macro in one or more .cpp files in this directory. 
// #define KOKKOSBLAS1_SUM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct Sum >, \ Kokkos::View, \ @@ -234,6 +243,7 @@ struct Sum { #define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ extern template struct Sum< \ + EXEC_SPACE, \ Kokkos::View, \ @@ -251,6 +261,7 @@ struct Sum { #define KOKKOSBLAS1_SUM_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template struct Sum< \ + EXEC_SPACE, \ Kokkos::View, \ diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index 4a5a18b976..414a6042f7 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -1,4 +1,4 @@ -//@HEADER +//@HEADERA // ************************************************************************ // // Kokkos v. 4.0 @@ -25,21 +25,38 @@ namespace KokkosBlas { /// \brief Return the dot product of the two vectors x and y. /// +/// \tparam execution_space the Kokkos execution space where the kernel +/// will be executed. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// \tparam YVector Type of the second vector y; a 1-D Kokkos::View. /// +/// \param space [in] an execution space instance that may specify +/// in which stream/queue the kernel will be executed. /// \param x [in] Input 1-D View. /// \param y [in] Input 1-D View. /// /// \return The dot product result; a single value. 
-template +template , + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::dot_type -dot(const XVector& x, const YVector& y) { +dot(const execution_space& space, const XVector& x, const YVector& y) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::dot: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XVector must be accessible from execution_space"); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: YVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: YVector must be accessible from execution_space"); static_assert((int)XVector::rank == (int)YVector::rank, "KokkosBlas::dot: Vector ranks do not match."); static_assert(XVector::rank == 1, @@ -55,16 +72,14 @@ dot(const XVector& x, const YVector& y) { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits> - XVector_Internal; - typedef Kokkos::View< + typename XVector::device_type, Kokkos::MemoryTraits>; + using YVector_Internal = Kokkos::View< typename YVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits> - YVector_Internal; + typename YVector::device_type, Kokkos::MemoryTraits>; using dot_type = typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::dot_type; @@ -91,9 +106,10 @@ dot(const XVector& x, const YVector& y) { // 32-bit precision). 
Impl::Dot needs to support both cases, and it's easier // to do this with overloading than by extending the ETI to deal with two // different scalar types. - Impl::DotSpecialAccumulator::dot(R, X, Y); - Kokkos::fence(); + Impl::DotSpecialAccumulator::dot(space, R, + X, Y); + space.fence(); // mfh 22 Jan 2020: We need the line below because // Kokkos::complex lacks a constructor that takes a // Kokkos::complex with U != T. @@ -101,12 +117,37 @@ dot(const XVector& x, const YVector& y) { result); } +/// \brief Return the dot product of the two vectors x and y. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// \tparam YVector Type of the second vector y; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// \param y [in] Input 1-D View. +/// +/// \return The dot product result; a single value. +template +typename Kokkos::Details::InnerProductSpaceTraits< + typename XVector::non_const_value_type>::dot_type +dot(const XVector& x, const YVector& y) { + return dot(typename XVector::execution_space{}, x, y); +} + /// \brief Compute the column-wise dot products of two multivectors. /// +/// This function is non-blocking and thread-safe. +/// +/// \tparam execution_space the Kokkos execution space where the kernel +/// will be executed. /// \tparam RV 0-D resp. 1-D output View /// \tparam XMV 1-D resp. 2-D input View /// \tparam YMV 1-D resp. 2-D input View /// +/// \param space [in] an execution space instance that may specify +/// in which stream/queue the kernel will be executed. /// \param R [out] Output 1-D or 0-D View to which to write results. /// \param X [in] Input 2-D or 1-D View. /// \param Y [in] Input 2-D or 1-D View. 
@@ -127,18 +168,29 @@ dot(const XVector& x, const YVector& y) { /// \note To implementers: We use enable_if here so that the compiler /// doesn't confuse this version of dot() with the three-argument /// version of dot() in Kokkos_Blas1.hpp. -template -void dot(const RV& R, const XMV& X, const YMV& Y, +template +void dot(const execution_space& space, const RV& R, const XMV& X, const YMV& Y, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::dot: excution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::dot: " "Y is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::dot: XMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::dot: R is const. " @@ -215,8 +267,44 @@ void dot(const RV& R, const XMV& X, const YMV& Y, XMV_Internal X_internal = X; YMV_Internal Y_internal = Y; - Impl::Dot::dot( - R_internal, X_internal, Y_internal); + Impl::Dot::dot( + space, R_internal, X_internal, Y_internal); +} + +/// \brief Compute the column-wise dot products of two multivectors. +/// +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. +/// +/// \tparam RV 0-D resp. 1-D output View +/// \tparam XMV 1-D resp. 2-D input View +/// \tparam YMV 1-D resp. 2-D input View +/// +/// \param R [out] Output 1-D or 0-D View to which to write results. +/// \param X [in] Input 2-D or 1-D View. +/// \param Y [in] Input 2-D or 1-D View. +/// +/// This function implements a few different use cases: +///
    +///
  • If X and Y are both 1-D, then this is a single dot product. +/// R must be 0-D (a View of a single value).
  • +///
  • If X and Y are both 2-D, then this function computes their +/// dot products columnwise. R must be 1-D.
  • +///
  • If X is 2-D and Y is 1-D, then this function computes the dot +/// product of each column of X, with Y, in turn. R must be +/// 1-D.
  • +///
  • If X is 1-D and Y is 2-D, then this function computes the dot +/// product X with each column of Y, in turn. R must be 1-D.
  • +///
+/// +/// \note To implementers: We use enable_if here so that the compiler +/// doesn't confuse this version of dot() with the three-argument +/// version of dot() in Kokkos_Blas1.hpp. +template +void dot(const RV& R, const XMV& X, const YMV& Y, + typename std::enable_if::value, int>::type = 0) { + dot(typename XMV::execution_space{}, R, X, Y); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_iamax.hpp b/blas/src/KokkosBlas1_iamax.hpp index 11ae267855..cfaaaeed63 100644 --- a/blas/src/KokkosBlas1_iamax.hpp +++ b/blas/src/KokkosBlas1_iamax.hpp @@ -26,17 +26,29 @@ namespace KokkosBlas { /// \brief Return the (smallest) index of the element of the maximum magnitude /// of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] execution space instance where the kernel will run. /// \param x [in] Input 1-D View. /// /// \return The (smallest) index of the element of the maximum magnitude; a /// single value. /// Note: Returned index is 1-based for compatibility with Fortran. 
-template -typename XVector::size_type iamax(const XVector& x) { +template , + int>::type = 0> +typename XVector::size_type iamax(const execution_space& space, + const XVector& x) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::iamax: execution_space must be a valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::iamax: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::iamax: " "Both Vector inputs must have rank 1."); @@ -59,30 +71,61 @@ typename XVector::size_type iamax(const XVector& x) { RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Iamax::iamax(R, X); - Kokkos::fence(); + Impl::Iamax::iamax(space, + R, X); + space.fence(); return result; } +/// \brief Return the (smallest) index of the element of the maximum magnitude +/// of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The (smallest) index of the element of the maximum magnitude; a +/// single value. +/// Note: Returned index is 1-based for compatibility with Fortran. +template +typename XVector::size_type iamax(const XVector& x) { + return iamax(typename XVector::execution_space{}, x); +} + /// \brief R(j) = iamax(X(i,j)) /// /// Replace each entry in R with the (smallest) index of the element of the /// maximum magnitude of the corresponding entry in X. +/// This function is non-blocking and thread-safe. /// /// \tparam RMV 0-D or 1-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. /// +/// \param space [in] execution space instance where the kernel will run. 
+/// \param R [out] Output View (rank 0 or 1) containing the results. +/// \param X [in] Input View (rank 1 or 2). +/// /// Note for TPL cuBLAS: When TPL cuBLAS iamax is used and returns result to a /// view, RMV must be 0-D view and XMV must be 1-D view. -template -void iamax(const RV& R, const XMV& X, +template +void iamax(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::iamax: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::iamax: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::iamax: XMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::iamax: R is const. " @@ -136,7 +179,27 @@ void iamax(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Iamax::iamax(R_internal, X_internal); + Impl::Iamax::iamax( + space, R_internal, X_internal); +} + +/// \brief R(j) = iamax(X(i,j)) +/// +/// Replace each entry in R with the (smallest) index of the element of the +/// maximum magnitude of the corresponding entry in X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam RMV 0-D or 1-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. +/// +/// Note for TPL cuBLAS: When TPL cuBLAS iamax is used and returns result to a +/// view, RMV must be 0-D view and XMV must be 1-D view. 
+template +void iamax(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + iamax(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index 94463c30fd..e9b26e6177 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -25,18 +25,24 @@ namespace KokkosBlas { /// \brief Return the nrm1 of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. /// \param x [in] Input 1-D View. /// /// \return The nrm1 product result; a single value. -template ::value, int>::type = 0> +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type nrm1(const execution_space& space, const XVector& x) { - static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); + static_assert( + Kokkos::is_execution_space::value, + "KokkosBlas::nrm1: execution_space must be a Kokkos::execution_space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm1: XVector must be a Kokkos::View."); static_assert(XVector::rank == 1, @@ -58,8 +64,9 @@ nrm1(const execution_space& space, const XVector& x) { RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm1::nrm1(space, R, X); - Kokkos::fence(); + Impl::Nrm1::nrm1(space, + R, X); + space.fence(); return result; } @@ -81,11 +88,18 @@ nrm1(const XVector& x) { /// /// Replace each entry in R with the nrm1olute value (magnitude) of the /// corresponding entry in X. 
+/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output 1-D View containing the result +/// \param X [in] Input 1-D View. template void nrm1(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { @@ -104,8 +118,10 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, ((RV::rank == 1) && (XMV::rank == 2)), "KokkosBlas::nrm1: " "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm1: execution_space cannot access data in XMV"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm1: execution_space cannot access data in XMV"); typedef typename Kokkos::Details::InnerProductSpaceTraits< typename XMV::non_const_value_type>::mag_type mag_type; @@ -147,18 +163,24 @@ void nrm1(const execution_space& space, const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm1::nrm1(space, R_internal, X_internal); + Impl::Nrm1::nrm1( + space, R_internal, X_internal); } /// \brief R(j) = nrm1(X(i,j)) /// /// Replace each entry in R with the nrm1olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe. The kernel is executed in the +/// default stream/queue associated with the execution space of XMV. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. 
It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param R [out] Output 1-D View containing the result +/// \param X [in] Input 1-D View. template void nrm1(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { diff --git a/blas/src/KokkosBlas1_nrm2.hpp b/blas/src/KokkosBlas1_nrm2.hpp index 722e2fc440..67cdde17fa 100644 --- a/blas/src/KokkosBlas1_nrm2.hpp +++ b/blas/src/KokkosBlas1_nrm2.hpp @@ -26,23 +26,30 @@ namespace KokkosBlas { /// \brief Return the nrm2 of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. -template ::value, int>::type = 0> +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type nrm2(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2: execution_space must be a valid" - " Kokkos execution space."); + "KokkosBlas::nrm2: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: XVector must be a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2: XVector must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2: " "XVector must have rank 1."); @@ -65,13 +72,17 @@ nrm2(const execution_space& space, const XVector& x) { RVector_Internal R = 
RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Nrm2::nrm2(space, R, X, true); - Kokkos::fence(); + Impl::Nrm2::nrm2( + space, R, X, true); + space.fence(); return result; } /// \brief Return the nrm2 of the vector x. /// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// /// \param x [in] Input 1-D View. @@ -88,24 +99,33 @@ nrm2(const XVector& x) { /// /// Replace each entry in R with the nrm2olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). template void nrm2(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2: space is not a Kokkos execution space."); + "KokkosBlas::nrm2: space is not a Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2: X cannot be accessed from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2: X cannot be accessed from execution_space."); static_assert(std::is_same::value, "KokkosBlas::nrm2: R is const. 
" @@ -152,18 +172,26 @@ void nrm2(const execution_space& space, const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2(space, R_internal, X_internal, true); + Impl::Nrm2::nrm2( + space, R_internal, X_internal, true); } /// \brief R(i,j) = nrm2(X(i,j)) /// /// Replace each entry in R with the nrm2olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// where the kernel will be executed. +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). template void nrm2(const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { diff --git a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index bb6fa512b5..c065efb290 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -25,24 +25,31 @@ namespace KokkosBlas { /// \brief Return the nrm2 of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. /// \param x [in] Input 1-D View. /// /// \return The nrm2 product result; a single value. 
-template ::value, int>::type = 0> +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type nrm2_squared(const execution_space& space, const XVector& x) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2_squared: execution_space must be a valid" - " Kokkos execution space"); + "KokkosBlas::nrm2_squared: execution_space must be a valid" + " Kokkos execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: XVector must be a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2_squared: XVector must be accessible" - " from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2_squared: " "Both Vector inputs must have rank 1."); @@ -64,13 +71,17 @@ nrm2_squared(const execution_space& space, const XVector& x) { RVector_Internal R = RVector_Internal(&result); XVector_Internal X = x; - Impl::Nrm2::nrm2(space, R, X, false); - Kokkos::fence(); + Impl::Nrm2::nrm2( + space, R, X, false); + space.fence(); return result; } /// \brief Return the nrm2 of the vector x. /// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// /// \param x [in] Input 1-D View. @@ -87,27 +98,36 @@ nrm2_squared(const XVector& x) { /// /// Replace each entry in R with the nrm2olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization.
It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output View (rank 0 or 1) that holds the result. +/// \param X [in] Input View (rank 1 or 2). template void nrm2_squared( const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2_squared: execution_space must be a valid" - " Kokkos execution space"); + "KokkosBlas::nrm2_squared: execution_space must be a valid" + " Kokkos execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2_squared: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2_squared: XVector must be accessible" - " from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2_squared: XVector must be accessible" + " from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrm2_squared: R is const. " @@ -153,20 +173,25 @@ void nrm2_squared( RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Nrm2::nrm2(space, R_internal, X_internal, false); + Impl::Nrm2::nrm2( + space, R_internal, X_internal, false); } /// \brief R(i,j) = nrm2(X(i,j)) /// /// Replace each entry in R with the nrm2olute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV.
template -void nrm2_squared(const RV& R, const XMV& X, +void nrm2_squared( + const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { nrm2_squared(typename XMV::execution_space{}, R, X); } diff --git a/blas/src/KokkosBlas1_nrm2w.hpp b/blas/src/KokkosBlas1_nrm2w.hpp index 42d138974f..c5eaa0621b 100644 --- a/blas/src/KokkosBlas1_nrm2w.hpp +++ b/blas/src/KokkosBlas1_nrm2w.hpp @@ -39,14 +39,17 @@ template typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type nrm2w(const execution_space& space, const XVector& x, const XVector& w, - typename std::enable_if::value, int>::type = 0) { + typename std::enable_if< + Kokkos::is_execution_space::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2w: execution_space must be a valid" - " Kokkos execution space."); + "KokkosBlas::nrm2w: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: XVector must be a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w: XVector must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XVector must be accessible from execution_space"); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w: " "Both Vector inputs must have rank 1."); @@ -60,21 +63,26 @@ nrm2w(const execution_space& space, const XVector& x, const XVector& w, using layout_t = typename XVector_Internal::array_layout; - using RVector_Internal = Kokkos::View >; + using RVector_Internal = + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w(space, R, X, W, true); - Kokkos::fence(); + Impl::Nrm2w::nrm2w( + space, R, X, W, true); + space.fence(); return result; } /// \brief Return the nrm2w of the vector x. 
/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// /// \param x [in] Input 1-D View. @@ -92,25 +100,36 @@ nrm2w(const XVector& x, const XVector& w) { /// /// Replace each entry in R with the nrm2w, absolute value (magnitude), of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2). template -void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W, +void nrm2w(const execution_space& space, const RV& R, const XMV& X, + const XMV& W, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2w: execution_space must be a valid" - " Kokkos execution space."); + "KokkosBlas::nrm2w: execution_space must be a valid" + " Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w: XMV must be accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrm2w: R is const. 
" @@ -158,21 +177,27 @@ void nrm2w(const execution_space& space, const RV& R, const XMV& X, const XMV& W XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w(space, R_internal, X_internal, - W_internal, true); + Impl::Nrm2w::nrm2w( + space, R_internal, X_internal, W_internal, true); } - - /// \brief R(i,j) = nrm2w(X(i,j)) /// /// Replace each entry in R with the nrm2w, absolute value (magnitude), of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVM. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2). template void nrm2w(const RV& R, const XMV& X, const XMV& W, typename std::enable_if::value, int>::type = 0) { diff --git a/blas/src/KokkosBlas1_nrm2w_squared.hpp b/blas/src/KokkosBlas1_nrm2w_squared.hpp index d777c27a2c..a1fe10bf1e 100644 --- a/blas/src/KokkosBlas1_nrm2w_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2w_squared.hpp @@ -34,18 +34,24 @@ namespace KokkosBlas { /// \param w [in] Input weights (1-D View). /// /// \return The nrm2w product result; a single value. 
-template ::value, int>::type = 0> +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type -nrm2w_squared(const execution_space& space, const XVector& x, const XVector& w) { +nrm2w_squared(const execution_space& space, const XVector& x, + const XVector& w) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2w_squared: execution_space must be a valid " - "Kokkos execution space."); + "KokkosBlas::nrm2w_squared: execution_space must be a valid " + "Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: XVector must be a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w_squared: XVector must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::nrm2w_squared: " "Both Vector inputs must have rank 1."); @@ -60,21 +66,26 @@ nrm2w_squared(const execution_space& space, const XVector& x, const XVector& w) using layout_t = typename XVector_Internal::array_layout; - using RVector_Internal = Kokkos::View >; + using RVector_Internal = + Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; XVector_Internal W = w; - Impl::Nrm2w::nrm2w(space, R, X, W, false); - Kokkos::fence(); + Impl::Nrm2w::nrm2w( + space, R, X, W, false); + space.fence(); return result; } /// \brief Return the nrm2w of the vector x. /// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// /// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// /// \param x [in] Input 1-D View. 
@@ -92,26 +103,37 @@ nrm2w_squared(const XVector& x, const XVector& w) { /// /// Replace each entry in R with the nrm2wolute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param space [in] the execution space instance, possibly containing a +/// stream/queue where the kernel will be executed. +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2). template void nrm2w_squared( const execution_space& space, const RV& R, const XMV& X, const XMV& W, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrm2w_squared: execution_space must be a valid " - "Kokkos execution space."); + "KokkosBlas::nrm2w_squared: execution_space must be a valid " + "Kokkos execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrm2w_squared: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrm2w_squared: XVector must be accessible from execution_space."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrm2w_squared: XVector must be accessible from " + "execution_space."); static_assert(std::is_same::value, "KokkosBlas::nrm2w_squared: R is const. " @@ -145,30 +167,37 @@ void nrm2w_squared( // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. 
- using RV_Internal = Kokkos::View >; - using XMV_Internal = Kokkos::View >; + using RV_Internal = Kokkos::View >; + using XMV_Internal = Kokkos::View >; RV_Internal R_internal = R; XMV_Internal X_internal = X; XMV_Internal W_internal = W; - Impl::Nrm2w::nrm2w(space, R_internal, X_internal, - W_internal, false); + Impl::Nrm2w::nrm2w( + space, R_internal, X_internal, W_internal, false); } /// \brief R(i,j) = nrm2w(X(i,j)) /// /// Replace each entry in R with the nrm2wolute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XMV. /// /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. +/// +/// \param R [out] Output View containing results (rank 0 or 1). +/// \param X [in] Input View (rank 1 or 2). +/// \param W [in] Input View (rank 1 or 2). template void nrm2w_squared( const RV& R, const XMV& X, const XMV& W, diff --git a/blas/src/KokkosBlas1_nrminf.hpp b/blas/src/KokkosBlas1_nrminf.hpp index 00ef0df0fd..c6f923aefe 100644 --- a/blas/src/KokkosBlas1_nrminf.hpp +++ b/blas/src/KokkosBlas1_nrminf.hpp @@ -33,8 +33,10 @@ namespace KokkosBlas { /// \param x [in] Input 1-D View. /// /// \return The nrminf product result; a single value. 
-template ::value, int>::type = 0> +template < + class execution_space, class XVector, + typename std::enable_if::value, + int>::type = 0> typename Kokkos::Details::InnerProductSpaceTraits< typename XVector::non_const_value_type>::mag_type nrminf(const execution_space& space, const XVector& x) { @@ -62,8 +64,9 @@ nrminf(const execution_space& space, const XVector& x) { RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::NrmInf::nrminf(space, R, X); - Kokkos::fence(); + Impl::NrmInf::nrminf( + space, R, X); + space.fence(); return result; } @@ -96,15 +99,17 @@ void nrminf( const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { static_assert(Kokkos::is_execution_space::value, - "KokkosBlas::nrminf: space is not an execution space instance"); + "KokkosBlas::nrminf: space is not an execution space instance"); static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::nrminf: " "X is not a Kokkos::View."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::nrminf: X is not accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::nrminf: X is not accessible from execution_space"); static_assert(std::is_same::value, "KokkosBlas::nrminf: R is const. " @@ -155,7 +160,8 @@ void nrminf( RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::NrmInf::nrminf(space, R_internal, X_internal); + Impl::NrmInf::nrminf( + space, R_internal, X_internal); } /// \brief R(j) = nrminf(X(i,j)) diff --git a/blas/src/KokkosBlas1_sum.hpp b/blas/src/KokkosBlas1_sum.hpp index 6db51d7f54..88c7b10021 100644 --- a/blas/src/KokkosBlas1_sum.hpp +++ b/blas/src/KokkosBlas1_sum.hpp @@ -25,59 +25,98 @@ namespace KokkosBlas { /// \brief Return the sum of the vector x. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. 
/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. /// +/// \param space [in] execution space instance where the kernel will run. /// \param x [in] Input 1-D View. /// /// \return The sum product result; a single value. -template -typename XVector::non_const_value_type sum(const XVector& x) { +template , + int>::type = 0> +typename XVector::non_const_value_type sum(const execution_space& space, + const XVector& x) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::sum: execution_space must be a valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::sum: XVector must be a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XVector must be accessible from execution_space."); static_assert(XVector::rank == 1, "KokkosBlas::sum: " "Both Vector inputs must have rank 1."); - typedef Kokkos::View< + using XVector_Internal = Kokkos::View< typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits > - XVector_Internal; + typename XVector::device_type, Kokkos::MemoryTraits >; using layout_t = typename XVector_Internal::array_layout; - typedef Kokkos::View > - RVector_Internal; + using RVector_Internal = + Kokkos::View >; typename XVector::non_const_value_type result; RVector_Internal R = RVector_Internal(&result, layout_t()); XVector_Internal X = x; - Impl::Sum::sum(R, X); - Kokkos::fence(); + Impl::Sum::sum(space, R, + X); + space.fence(); return result; } +/// \brief Return the sum of the vector x. +/// +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVector. +/// +/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. +/// +/// \param x [in] Input 1-D View. +/// +/// \return The sum product result; a single value. 
+template +typename XVector::non_const_value_type sum(const XVector& x) { + return sum(typename XVector::execution_space{}, x); +} + /// \brief R(j) = sum(X(i,j)) /// /// Replace each entry in R with the sumolute value (magnitude) of the /// corresponding entry in X. +/// This function is non-blocking and thread-safe. /// +/// \tparam execution_space a Kokkos execution space where the kernel will run. /// \tparam RMV 1-D or 2-D Kokkos::View specialization. /// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have /// the same rank as RMV, and its entries must be assignable to /// those of RMV. -template -void sum(const RV& R, const XMV& X, +/// +/// \param space [in] execution space instance where the kernel will run. +/// \param R [out] Output View (rank 0 or 1) containing the results. +/// \param X [in] Input View (rank 1 or 2). +template +void sum(const execution_space& space, const RV& R, const XMV& X, typename std::enable_if::value, int>::type = 0) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::sum: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::sum: " "R is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::sum: " "X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::sum: XMV must be accessible from execution_space."); static_assert(std::is_same::value, "KokkosBlas::sum: R is const. " @@ -117,7 +156,29 @@ void sum(const RV& R, const XMV& X, RV_Internal R_internal = R; XMV_Internal X_internal = X; - Impl::Sum::sum(R_internal, X_internal); + Impl::Sum::sum(space, R_internal, + X_internal); +} + +/// \brief R(j) = sum(X(i,j)) +/// +/// Replace each entry in R with the sumolute value (magnitude) of the +/// corresponding entry in X. +/// This function is non-blocking and thread-safe. +/// The kernel is executed in the default stream/queue associated +/// with the execution space of XVM. 
+/// +/// \tparam RMV 1-D or 2-D Kokkos::View specialization. +/// \tparam XMV 1-D or 2-D Kokkos::View specialization. It must have +/// the same rank as RMV, and its entries must be assignable to +/// those of RMV. +/// +/// \param R [out] Output View (rank 0 or 1) containing the results. +/// \param X [in] Input View (rank 1 or 2). +template +void sum(const RV& R, const XMV& X, + typename std::enable_if::value, int>::type = 0) { + sum(typename XMV::execution_space{}, R, X); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 500bd5f239..ca2139980d 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -20,8 +20,8 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct dot_tpl_spec_avail { enum : bool { value = false }; }; @@ -37,6 +37,7 @@ namespace Impl { #define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct dot_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -61,27 +62,31 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS // double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct dot_tpl_spec_avail< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1> { \ enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, 
+KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) #endif diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index e7054b1113..718e32f14c 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -43,6 +43,7 @@ namespace Impl { #define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Dot< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -59,7 +60,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -68,49 +69,53 @@ namespace Impl { int one = 1; \ R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ - const size_type numElems = 
X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ - } else { \ - Dot::dot(R, X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Dot< \ + ExecSpace, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + dot_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ + } else { \ + Dot::dot(space, R, \ + X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -128,7 +133,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::dot[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -140,7 +145,8 @@ namespace Impl { N, reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -148,7 +154,8 @@ namespace Impl { 
#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -166,7 +173,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::dot[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -178,7 +185,8 @@ namespace Impl { N, reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(Y.data()), one); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -212,27 +220,28 @@ KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ + template <> \ struct Dot< \ + EXECSPACE, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -243,113 +252,119 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ cublasDdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, 
Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Dot< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ - } else { \ - Dot::dot(R, X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot< \ + EXECSPACE, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + } else { \ + Dot::dot(space, R, \ + 
X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZdotc(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ - } else { \ - Dot::dot(R, X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion( \ + 
"KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + dot_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + cublasZdotc(s.handle, N, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(&R())); \ + } else { \ + Dot::dot(space, R, \ + X, Y); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, \ +#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ + template <> \ + struct Dot, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits > \ RV; \ typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(RV& R, const XV& X, const XV& Y) { \ + static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -363,31 +378,32 @@ namespace Impl { one, reinterpret_cast(Y.data()), one, \ reinterpret_cast(&R())); \ } else { \ - Dot::dot(R, X, Y); \ + Dot::dot(space, R, \ + X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) 
+KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 71d7c664aa..616c26c87a 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct iamax_tpl_spec_avail { enum : bool { value = false }; }; @@ -37,6 +37,7 @@ namespace Impl { MEMSPACE) \ template \ struct iamax_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -61,20 +62,24 @@ 
KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS(unsigned long, Kokkos::complex, // double #define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ MEMSPACE) \ - template \ + template <> \ struct iamax_tpl_spec_avail< \ + Kokkos::Cuda, \ Kokkos::View >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ }; \ - template \ + template <> \ struct iamax_tpl_spec_avail< \ - Kokkos::View, \ + Kokkos::Cuda, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ @@ -126,51 +131,47 @@ KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS(unsigned int, Kokkos::complex, // rocBLAS #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) -#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ - MEMSPACE) \ - template \ - struct iamax_tpl_spec_avail< \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ - }; \ - template \ - struct iamax_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1> { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(INDEX_TYPE, SCALAR, LAYOUT, \ + MEMSPACE) \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ + }; \ + template <> \ + struct iamax_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, double, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, double, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) 
KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, float, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, float, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned long, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_ROCBLAS(unsigned int, Kokkos::complex, - Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace) + Kokkos::LayoutLeft, Kokkos::HIPSpace) #endif diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp index 958bc4d218..913ec5a151 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp @@ -49,7 +49,8 @@ namespace Impl { #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_BLAS( \ SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Iamax >, \ Kokkos::View, \ @@ -64,7 +65,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void iamax(RV& R, const XV& X) { \ + static void iamax(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_BLAS," #SCALAR_TYPE \ "]"); \ const size_type numElems = X.extent(0); \ @@ -81,7 +82,8 @@ namespace Impl { N, reinterpret_cast(X.data()), LDX); \ R() = static_cast(idx); \ } else { \ - Iamax::iamax(R, X); \ + Iamax::iamax(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -148,25 +150,27 
@@ using CUBLASUVM_DEVICE_TYPE = #endif #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, RET_DEVICE_TYPE, CUBLAS_PTR_MODE_1, CUBLAS_PTR_MODE_2) \ - template \ - struct Iamax \ + struct Iamax >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void iamax(RV& R, const XV& X) { \ + static void iamax(const EXEC_SPACE& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::iamax[TPL_CUBLAS," #SCALAR_TYPE "]"); \ const size_type numElems = X.extent(0); \ @@ -181,6 +185,8 @@ using CUBLASUVM_DEVICE_TYPE = const int LDX = (XST == 0) ? 1 : XST; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t prevPtrMode; \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasGetPointerMode(s.handle, &prevPtrMode)); \ @@ -194,9 +200,11 @@ using CUBLASUVM_DEVICE_TYPE = if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } \ } else { \ - Iamax::iamax(R, X); \ + Iamax::iamax(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -206,26 +214,26 @@ using CUBLASUVM_DEVICE_TYPE = CUBLAS_FN, INDEX_TYPE, LAYOUT, \ MEMSPACE, ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, Kokkos::HostSpace, CUBLAS_POINTER_MODE_HOST, \ - CUBLAS_POINTER_MODE_DEVICE) \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ 
+ CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, CUBLAS_POINTER_MODE_DEVICE, \ - CUBLAS_POINTER_MODE_HOST) + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLAS_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #if defined(KOKKOS_ENABLE_CUDA_UVM) -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, Kokkos::HostSpace, CUBLAS_POINTER_MODE_HOST, \ - CUBLAS_POINTER_MODE_DEVICE) \ - KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ - SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, CUBLAS_POINTER_MODE_DEVICE, \ - CUBLAS_POINTER_MODE_HOST) +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_UVM( \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, Kokkos::HostSpace, \ + CUBLAS_POINTER_MODE_HOST, CUBLAS_POINTER_MODE_DEVICE) \ + KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_CUBLAS_WRAPPER( \ + SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, INDEX_TYPE, LAYOUT, \ + Kokkos::Cuda, MEMSPACE, ETI_SPEC_AVAIL, CUBLASUVM_DEVICE_TYPE, \ + CUBLAS_POINTER_MODE_DEVICE, CUBLAS_POINTER_MODE_HOST) #endif #define KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_CUBLAS(INDEX_TYPE, LAYOUT, MEMSPACE, \ @@ -372,30 +380,31 @@ KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_CUBLAS_UVM(unsigned int, Kokkos::LayoutLeft, namespace KokkosBlas { namespace Impl { -using ROCBLAS_DEVICE_TYPE = 
- Kokkos::Device; +using ROCBLAS_DEVICE_TYPE = Kokkos::Device; #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER( \ SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, ROCBLAS_PTR_MODE_1, \ ROCBLAS_PTR_MODE_2) \ - template \ - struct Iamax \ + struct Iamax >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = Kokkos::HIP; \ typedef Kokkos::View > \ RV; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void iamax(RV& R, const XV& X) { \ + static void iamax(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ const size_type numElems = X.extent(0); \ @@ -410,6 +419,8 @@ using ROCBLAS_DEVICE_TYPE = const int LDX = (XST == 0) ? 1 : XST; \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode prevPtrMode; \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ @@ -421,12 +432,14 @@ using ROCBLAS_DEVICE_TYPE = ROCBLAS_FN(s.handle, N, \ reinterpret_cast(X.data()), \ LDX, reinterpret_cast(R.data()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ } \ } else { \ - Iamax::iamax(R, X); \ + Iamax::iamax(space, \ + R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -469,44 +482,44 @@ using ROCBLAS_DEVICE_TYPE = INDEX_TYPE, LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) 
KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned long, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_DIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_SIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) KOKKOSBLAS1_ZIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, true) + Kokkos::HIPSpace, true) 
KOKKOSBLAS1_CIAMAX_TPL_SPEC_DECL_ROCBLAS(unsigned int, Kokkos::LayoutLeft, - Kokkos::Experimental::HIPSpace, false) + Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index da9cc9edca..082bec8135 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -36,7 +36,7 @@ namespace Impl { #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct nrm1_tpl_spec_avail< \ - ExecSpace, \ + ExecSpace, \ Kokkos::View< \ typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ @@ -86,7 +86,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #endif -//rocBLAS +// rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ @@ -103,7 +103,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, }; KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCUBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, @@ -111,8 +111,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIPSpace) - -#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index f6469d29f4..96b704321f 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -42,6 +42,7 @@ namespace Impl { #define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ + ExecSpace, \ 
Kokkos::View >, \ Kokkos::View, \ @@ -56,7 +57,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -65,7 +66,7 @@ namespace Impl { int one = 1; \ R() = HostBlas::asum(N, X.data(), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -74,6 +75,7 @@ namespace Impl { #define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -88,7 +90,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -97,7 +99,7 @@ namespace Impl { int one = 1; \ R() = HostBlas::asum(N, X.data(), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -105,7 +107,8 @@ namespace Impl { #define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm1 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -120,7 +123,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm1[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -131,7 +134,7 @@ namespace Impl { R() = HostBlas >::asum( \ N, reinterpret_cast*>(X.data()), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ 
Kokkos::Profiling::popRegion(); \ } \ @@ -139,7 +142,8 @@ namespace Impl { #define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm1 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -154,7 +158,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm1[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -165,7 +169,7 @@ namespace Impl { R() = HostBlas >::asum( \ N, reinterpret_cast*>(X.data()), one); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -208,12 +212,12 @@ namespace Impl { template <> \ struct Nrm1< \ EXECSPACE, \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ @@ -223,7 +227,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -232,21 +236,22 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDasum(s.handle, N, X.data(), one, 
R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::nrm1(space, \ + R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ +#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ - template <> \ + template <> \ struct Nrm1< \ EXECSPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ + using execution_space = EXECSPACE; \ typedef Kokkos::View > \ RV; \ @@ -264,7 +269,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -273,106 +278,105 @@ namespace Impl { constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSasum(s.handle, N, X.data(), one, R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::nrm1(space, \ + R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - 
Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDzasum(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ 
+ cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( \ + s.handle, N, reinterpret_cast(X.data()), \ + one, R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, \ + R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasScasum(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + 
Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScasum( \ + s.handle, N, reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, \ + R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, @@ -425,7 +429,7 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -437,7 +441,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ + rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Nrm1::nrm1(R, X); \ @@ -446,8 +450,8 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ Kokkos::View(INT_MAX)) { \ @@ -476,7 
+480,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ + rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Nrm1::nrm1(R, X); \ @@ -485,88 +489,88 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dzasum(s.handle, N, \ - reinterpret_cast(X.data()), \ - one, R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type 
size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( \ + s.handle, N, \ + reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_scasum(s.handle, N, \ - reinterpret_cast(X.data()), \ - one, R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(R, X); \ - } \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( \ + s.handle, N, \ + reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 1d894ad1da..a58c90d8e9 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -68,7 +68,8 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ LAYOUT, Kokkos::HostSpace, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index 
d38e8680f0..5e017cb7e1 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -42,6 +42,7 @@ namespace Impl { #define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm2< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -56,7 +57,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -66,7 +68,8 @@ namespace Impl { R() = HostBlas::nrm2(N, X.data(), int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -75,6 +78,7 @@ namespace Impl { #define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ struct Nrm2< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -89,7 +93,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -99,7 +104,8 @@ namespace Impl { R() = HostBlas::nrm2(N, X.data(), int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -107,7 +113,8 @@ namespace Impl { #define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm2 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ 
-122,7 +129,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -135,7 +143,8 @@ namespace Impl { int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -143,7 +152,8 @@ namespace Impl { #define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Nrm2 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -158,7 +168,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const ExecSpace& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -171,7 +182,8 @@ namespace Impl { int_one); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(R, X, take_sqrt); \ + Nrm2::nrm2(space, R, X, \ + take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -209,51 +221,53 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { 
\ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDnrm2(s.handle, N, X.data(), int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2< \ + EXECSPACE, \ + Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,double]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int int_one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDnrm2(s.handle, N, X.data(), int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ + } \ + 
Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ template <> \ struct Nrm2< \ - EXECSPACE, \ + EXECSPACE, \ Kokkos::View >, \ Kokkos::View, \ @@ -269,7 +283,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ @@ -278,71 +293,71 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSnrm2(s.handle, N, X.data(), int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, NULL)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSnrm2(s.handle, N, X.data(), int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(space, R, X, take_sqrt); \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void 
nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int int_one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDznrm2(s.handle, N, \ - reinterpret_cast(X.data()), \ - int_one, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int int_one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDznrm2( \ + s.handle, N, reinterpret_cast(X.data()), \ + int_one, &R())); \ + 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) \ template <> \ - struct Nrm2< \ - EXECSPACE, \ - Kokkos::View >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -358,7 +373,8 @@ namespace Impl { XV; \ typedef typename XV::size_type size_type; \ \ - static void nrm2(const execution_space& space, RV& R, const XV& X, const bool& take_sqrt) { \ + static void nrm2(const execution_space& space, RV& R, const XV& X, \ + const bool& take_sqrt) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrm2[TPL_CUBLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -368,40 +384,40 @@ namespace Impl { constexpr int int_one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasScnrm2(s.handle, N, \ - reinterpret_cast(X.data()), int_one, \ - &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScnrm2( \ + s.handle, N, reinterpret_cast(X.data()), \ + int_one, &R())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ - Nrm2::nrm2(space, R, X, take_sqrt); \ + Nrm2::nrm2( \ + space, R, X, take_sqrt); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) 
+KOKKOSBLAS1_DNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_SNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_ZNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - true) -KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, - false) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index c61802a68f..88591fbf0c 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -34,7 +34,7 @@ namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double #define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ + template \ struct nrminf_tpl_spec_avail< \ ExecSpace, \ Kokkos::View< \ diff --git a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index 
aad5bbd7d4..17ec54e057 100644 --- a/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -43,6 +43,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct NrmInf< \ + ExecSpace, \ Kokkos::View>, \ Kokkos::View, \ @@ -58,7 +59,7 @@ namespace Impl { typedef typename XV::size_type size_type; \ typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ - static void nrminf(RV& R, const XV& X) { \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,double]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { \ @@ -72,7 +73,8 @@ namespace Impl { int idx = HostBlas::iamax(N, X.data(), one) - 1; \ R() = IPT::norm(X(idx)); \ } else { \ - NrmInf::nrminf(R, X); \ + NrmInf::nrminf(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -82,6 +84,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct NrmInf< \ + ExecSpace, \ Kokkos::View>, \ Kokkos::View, \ @@ -97,7 +100,7 @@ namespace Impl { typedef typename XV::size_type size_type; \ typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ - static void nrminf(RV& R, const XV& X) { \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_BLAS,float]"); \ const size_type numElems = X.extent(0); \ if (numElems == 0) { \ @@ -111,7 +114,8 @@ namespace Impl { int idx = HostBlas::iamax(N, X.data(), one) - 1; \ R() = IPT::norm(X(idx)); \ } else { \ - NrmInf::nrminf(R, X); \ + NrmInf::nrminf(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -120,7 +124,8 @@ namespace Impl { #define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) \ template \ - struct NrmInf>, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -137,7 +142,7 @@ namespace Impl { typedef Kokkos::Details::InnerProductSpaceTraits> \ IPT; \ \ - static void nrminf(RV& R, const XV& X) { \ + static void nrminf(const 
ExecSpace& space, RV& R, const XV& X) { \ Kokkos::Profiling::pushRegion( \ "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ const size_type numElems = X.extent(0); \ @@ -156,55 +161,58 @@ namespace Impl { 1; \ R() = IPT::norm(X(idx)); \ } else { \ - NrmInf::nrminf(R, X); \ + NrmInf::nrminf(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct NrmInf>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View> \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits> \ - XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> \ - IPT; \ - \ - static void nrminf(RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - R() = 0.0f; \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - nrminf_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - int idx = \ - HostBlas>::iamax( \ - N, reinterpret_cast*>(X.data()), \ - one) - \ - 1; \ - R() = IPT::norm(X(idx)); \ - } else { \ - NrmInf::nrminf(R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct NrmInf>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View> \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits> \ + XV; \ + typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits> \ + IPT; \ + \ + static void nrminf(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrminf[TPL_BLAS,complex]"); \ + const size_type 
numElems = X.extent(0); \ + if (numElems == 0) { \ + R() = 0.0f; \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + nrminf_print_specialization(); \ + int N = numElems; \ + int one = 1; \ + int idx = \ + HostBlas>::iamax( \ + N, reinterpret_cast*>(X.data()), \ + one) - \ + 1; \ + R() = IPT::norm(X(idx)); \ + } else { \ + NrmInf::nrminf(space, R, \ + X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, diff --git a/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp index 12470c28ed..755fa092fb 100644 --- a/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_sum_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct sum_tpl_spec_avail { enum : bool { value = false }; }; From d3b8bc8239f44e7a2eafc7360105155e30cb7b60 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 25 Apr 2023 20:06:16 -0600 Subject: [PATCH 288/442] BLAS1: adding final fences for code path that return host results --- blas/impl/KokkosBlas1_dot_mv_impl.hpp | 1 + blas/impl/KokkosBlas1_sum_impl.hpp | 1 + blas/src/KokkosBlas1_dot.hpp | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/blas/impl/KokkosBlas1_dot_mv_impl.hpp index c1abcafcc6..d19e512599 100644 --- a/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -129,6 +129,7 @@ void MV_Dot_Invoke( MV_Dot_Invoke( space, tempResult, x, y); Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/impl/KokkosBlas1_sum_impl.hpp b/blas/impl/KokkosBlas1_sum_impl.hpp index df08d42069..864c983541 100644 --- a/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/blas/impl/KokkosBlas1_sum_impl.hpp @@ -158,6 +158,7 @@ void MV_Sum_Invoke( MV_Sum_Invoke( space, 
tempResult, x); Kokkos::deep_copy(space, r, tempResult); + space.fence(); } } // namespace Impl diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index 414a6042f7..ebccce7d7c 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -1,4 +1,4 @@ -//@HEADERA +//@HEADER // ************************************************************************ // // Kokkos v. 4.0 From 13c5d8633fa782a7613557331891fb7802f4522f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 25 Apr 2023 21:09:59 -0600 Subject: [PATCH 289/442] BLAS2/3: adding proper execution space interfaces to gemv and gemm The current interface assumes that the execution space needs to be the same as one of the argument which is not a real requirement in practice. We only need to make sure that the input/ouput data is accessible using the Kokkos::SpaceAccessibility concept. --- blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp | 6 +-- blas/impl/KokkosBlas3_gemm_impl.hpp | 4 +- blas/impl/KokkosBlas3_gemm_spec.hpp | 48 ++++++++++--------- blas/src/KokkosBlas2_gemv.hpp | 32 ++++++++----- blas/src/KokkosBlas3_gemm.hpp | 36 +++++++++----- blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 5 +- blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 48 +++++++++++-------- 7 files changed, 105 insertions(+), 74 deletions(-) diff --git a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index f54a1dd68c..84963d1d2e 100644 --- a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -68,7 +68,7 @@ struct DotBasedGEMM { numCcols(C.extent(1)), dotSize(A.extent(0)) {} - void run(const typename CV::execution_space& space, bool conjugateTranspose) { + void run(const ExecSpace& space, bool conjugateTranspose) { multipleReductionWorkDistribution( dotSize, numCrows * numCcols, numDivPerDot); const size_C ndots = numCrows * numCcols; // Number of dot products @@ -77,12 +77,12 @@ struct DotBasedGEMM { // 
Initialize C matrix if beta != 1 if (beta == CVT::zero()) { Kokkos::MDRangePolicy> policyInit( - {0, 0}, {numCrows, numCcols}); + space, {0, 0}, {numCrows, numCcols}); Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } else if (beta != CVT::one()) { Kokkos::MDRangePolicy> policyInit( - {0, 0}, {numCrows, numCcols}); + space, {0, 0}, {numCrows, numCcols}); Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index b0271ad23d..4f3e62f343 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -611,10 +611,10 @@ struct GEMMImpl { ViewTypeCScratch::shmem_size(); #if defined(KOKKOS_ENABLE_HIP) - // Note lbv, 10/29/20: The LaunchBounds<384,2> leads + // Note lbv, 10/29/20: The LaunchBounds<384, 2> leads // to an error with HIP as the heuristics on that platform // yield an optimal_num_blocks=0 which means no ressources - // are allocated... Switching to LaunchBounds<384,2> fixes + // are allocated... Switching to LaunchBounds<384, 0> fixes // that problem but I'm not sure if that it a good perf // parameter or why it is set to 2 for Cuda? Kokkos::TeamPolicy> policy( diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index 5f443ab371..08c274bf10 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -29,7 +29,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gemm_eti_spec_avail { enum : bool { value = false }; }; @@ -47,6 +47,7 @@ struct gemm_eti_spec_avail { LAYOUTC, EXEC_SPACE, MEM_SPACE) \ template <> \ struct gemm_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -84,13 +85,13 @@ namespace Impl { // // Implementation of KokkosBlas::gemm. 
-template ::value, + gemm_tpl_spec_avail::value, bool eti_spec_avail = - gemm_eti_spec_avail::value> + gemm_eti_spec_avail::value> struct GEMM { - static void gemm(const typename CViewType::execution_space& space, + static void gemm(const execution_space& space, const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, @@ -118,14 +119,13 @@ struct GEMM { typedef typename AViewType::non_const_value_type ScalarA; typedef typename BViewType::non_const_value_type ScalarB; typedef typename CViewType::non_const_value_type ScalarC; - typedef typename CViewType::execution_space ExecSpace; // Figure out whether to use DotBased implementation const int M = static_cast(C.extent(0)); const int N = static_cast(C.extent(1)); const bool is_device_space = - KokkosKernels::Impl::kk_is_gpu_exec_space(); + KokkosKernels::Impl::kk_is_gpu_exec_space(); const bool A_is_lr = std::is_same::value; const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || @@ -145,7 +145,7 @@ struct GEMM { // call dot-based GEMM, only for C := beta * C + alpha * A^T * B, on // device bool A_is_conj = ((transA[0] == 'C') || (transA[0] == 'c')); - DotBasedGEMM dotBasedGemm( + DotBasedGEMM dotBasedGemm( alpha, A, B, beta, C); dotBasedGemm.run(space, A_is_conj); @@ -170,11 +170,11 @@ struct GEMM { : 16; int vector_length = blockB1 / 4; int max_vector_length = KokkosKernels::Impl::kk_get_max_vector_size< - typename CViewType::execution_space>(); + execution_space>(); if (vector_length > max_vector_length) vector_length = max_vector_length; // Compute scratch space size - typedef KokkosBlas::Impl::GEMMImpl gemm_dummy_type; @@ -187,22 +187,22 @@ struct GEMM { // Figure out Team Sizes int team_size = 1; #if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) - if (std::is_same::value) team_size 
= blockA0; #endif #if defined(KOKKOS_ENABLE_SYCL) - if (std::is_same::value) team_size = blockA0; #endif @@ -210,7 +210,7 @@ struct GEMM { // Call the correct kernel if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -218,7 +218,7 @@ struct GEMM { } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -226,7 +226,7 @@ struct GEMM { } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -234,7 +234,7 @@ struct GEMM { } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -242,7 +242,7 @@ struct GEMM { } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -250,7 +250,7 @@ struct GEMM { } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -258,7 +258,7 @@ struct GEMM { } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -266,7 +266,7 @@ struct GEMM { } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -274,7 +274,7 @@ struct GEMM { } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); @@ -303,6 +303,7 @@ struct GEMM { #define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ LAYOUTC, EXEC_SPACE, MEM_SPACE) \ extern template struct GEMM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ 
-316,6 +317,7 @@ struct GEMM { #define KOKKOSBLAS3_GEMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ LAYOUTC, EXEC_SPACE, MEM_SPACE) \ template struct GEMM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 0cd2c8be93..11d31a741c 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -49,23 +49,35 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View -template -void gemv(const typename AViewType::execution_space& space, const char trans[], +template +void gemv(const execution_space& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::gemv: execution_space must be a valid Kokkos execution space."); static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); + "KokkosBlas::gemv: AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "XViewType must be a Kokkos::View."); + "KokkosBlas::gemv: XViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "YViewType must be a Kokkos::View."); + "KokkosBlas::gemv: YViewType must be a Kokkos::View."); static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); + "KokkosBlas::gemv: AViewType must have rank 2."); static_assert(static_cast(XViewType::rank) == 1, - "XViewType must have rank 1."); + "KokkosBlas::gemv: XViewType must have rank 1."); static_assert(static_cast(YViewType::rank) == 1, - "YViewType must have rank 1."); + "KokkosBlas::gemv: YViewType must have rank 1."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: AViewType must be accessible from execution_space"); + 
static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: XViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: YViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemv: AViewType must be assignable to YViewType"); + static_assert(Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemv: XViewType must be assignable to YViewType"); // Check compatibility of dimensions at run time. if (trans[0] == 'N' || trans[0] == 'n') { @@ -175,9 +187,7 @@ template void gemv(const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - const typename AViewType::execution_space space = - typename AViewType::execution_space(); - gemv(space, trans, alpha, A, x, beta, y); + gemv(typename AViewType::execution_space{}, trans, alpha, A, x, beta, y); } namespace Experimental { diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index 586302cb01..5b7eccb222 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -38,9 +38,9 @@ namespace Impl { // This case must be intercepted here rather than impl in order to call TPL // GEMV instead of TPL GEMM. This codepath was measured to be profitable with // cuBLAS. 
-template +template bool gemv_based_gemm( - const typename CViewType::execution_space& space, const char transA[], + const execution_space& space, const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C, @@ -107,24 +107,36 @@ bool gemv_based_gemm( /// \param B [in] Input matrix, as a 2-D Kokkos::View /// \param beta [in] Input coefficient of C /// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View -template -void gemm(const typename CViewType::execution_space& space, const char transA[], +template +void gemm(const execution_space& space, const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_execution_space_v, + "KokkosBlas::gemm: execution_space must be a valid Kokkos execution space"); static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); + "KokkosBlas::gemm: AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); + "KokkosBlas::gemm: BViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); + "KokkosBlas::gemm: CViewType must be a Kokkos::View."); static_assert(static_cast(AViewType::rank) == 2, - "AViewType must have rank 2."); + "KokkosBlas::gemm: AViewType must have rank 2."); static_assert(static_cast(BViewType::rank) == 2, - "BViewType must have rank 2."); + "KokkosBlas::gemm: BViewType must have rank 2."); static_assert(static_cast(CViewType::rank) == 2, - "CViewType must have rank 2."); + "KokkosBlas::gemm: CViewType must have rank 2."); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: AViewType must be accessible from execution_space"); + 
static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: BViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: CViewType must be accessible from execution_space"); + static_assert(Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemm: CViewType must be assignable by AViewType"); + static_assert(Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemm: CViewType must be assignable by BViewType"); // Check validity of transpose argument bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || @@ -223,9 +235,7 @@ void gemm(const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { - const typename CViewType::execution_space space = - typename CViewType::execution_space(); - gemm(space, transA, transB, alpha, A, B, beta, C); + gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, C); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 25d8818817..f130432978 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gemm_tpl_spec_avail { enum : bool { value = false }; }; @@ -32,6 +32,7 @@ struct gemm_tpl_spec_avail { LAYOUTC, MEMSPACE) \ template \ struct gemm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -78,6 +79,7 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, LAYOUTC, MEMSPACE) \ template \ struct gemm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -149,6 +151,7 @@ KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #define 
KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gemm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 5508b892e7..d1e0cc26c6 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -26,7 +26,9 @@ namespace Impl { #define KOKKOSBLAS3_XGEMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct GEMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ CViewType; \ \ - static void gemm(const typename CViewType::execution_space& /* space*/, \ + static void gemm(const ExecSpace& /* space*/, \ const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ @@ -163,7 +165,9 @@ namespace Impl { LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ CViewType; \ \ - static void gemm(const typename CViewType::execution_space& space, \ + static void gemm(const ExecSpace& space, \ const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ @@ -364,7 +368,9 @@ namespace Impl { ROCBLAS_FN, LAYOUT, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM, \ Kokkos::MemoryTraits >, \ Kokkos::View, rocblas_float_complex, \ rocblas_cgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, 
Kokkos::HIPSpace, true) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // 
namespace Impl From 99a3b9dac69a5e22ced89c3fcdf3d95d149e20f1 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 27 Mar 2023 20:32:36 -0600 Subject: [PATCH 290/442] All changes again, because previous branch got changes beyond those related to ger --- blas/CMakeLists.txt | 7 + .../ger/KokkosBlas2_ger_eti_spec_inst.cpp.in | 25 + .../KokkosBlas2_ger_eti_spec_avail.hpp.in | 25 + .../KokkosBlas2_ger_eti_spec_decl.hpp.in | 25 + blas/impl/KokkosBlas2_ger_impl.hpp | 339 ++++ blas/impl/KokkosBlas2_ger_spec.hpp | 214 +++ blas/src/KokkosBlas2_ger.hpp | 146 ++ blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 148 ++ blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp | 35 + .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 408 +++++ .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 438 +++++ .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 438 +++++ blas/tpls/KokkosBlas_Host_tpl.cpp | 205 ++- blas/tpls/KokkosBlas_Host_tpl.hpp | 33 + blas/unit_test/Test_Blas.hpp | 1 + blas/unit_test/Test_Blas2_ger.hpp | 1616 +++++++++++++++++ 16 files changed, 4101 insertions(+), 2 deletions(-) create mode 100644 blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in create mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in create mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in create mode 100644 blas/impl/KokkosBlas2_ger_impl.hpp create mode 100644 blas/impl/KokkosBlas2_ger_spec.hpp create mode 100644 blas/src/KokkosBlas2_ger.hpp create mode 100644 blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp create mode 100644 blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp create mode 100644 blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp create mode 100644 blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp create mode 100644 blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp create mode 100644 blas/unit_test/Test_Blas2_ger.hpp diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index e8a90c38cf..04f883c21a 100644 
--- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -290,6 +290,13 @@ KOKKOSKERNELS_GENERATE_ETI(Blas2_gemv gemv TYPE_LISTS FLOATS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Blas2_ger ger + COMPONENTS blas + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm COMPONENTS blas HEADER_LIST ETI_HEADERS diff --git a/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..d256c1a6a1 --- /dev/null +++ b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosBlas2_ger_spec.hpp" + +namespace KokkosBlas { +namespace Impl { +@BLAS2_GER_ETI_INST_BLOCK@ +} //IMPL +} //Kokkos diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..84e377eba9 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_GER_ETI_AVAIL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..ee14a84823 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_GER_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp new file mode 100644 index 0000000000..a0f00df503 --- /dev/null +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -0,0 +1,339 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_IMPL_HPP_ +#define KOKKOSBLAS2_GER_IMPL_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosBlas { +namespace Impl { + +// Functor for a single-level parallel_for version of GER (transpose or Hermitian). +// The functor parallelizes over rows of the input matrix A.
+template +struct SingleLevelGER { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using A_value_type = typename AViewType::non_const_value_type; + + SingleLevelGER( const bool justTranspose + , const AlphaCoeffType & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) + : justTranspose_(justTranspose) + , alpha_ (alpha) + , x_ (x) + , y_ (y) + , A_ (A) + { + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer."); + } + + KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { + using KAT = Kokkos::Details::ArithTraits; + + if (alpha_ == KAT::zero()) { + // Nothing to do + } + else { + const IndexType N ( A_.extent(1) ); + const A_value_type x_fixed( x_(i) ); + + if (justTranspose_) { + for (IndexType j = 0; j < N; ++j) { + A_(i,j) += A_value_type( alpha_ * x_fixed * y_(j) ); + } + } + else { + for (IndexType j = 0; j < N; ++j) { + A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( y_(j) ) ); + } + } + } + } + +private: + bool justTranspose_; + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Single-level parallel version of GER. 
+template +void singleLevelGer( const typename AViewType::execution_space & space + , const char trans[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL singleLevelGer(), AViewType = %s\n", typeid(AViewType).name() ); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer"); + + using KAT = Kokkos::Details::ArithTraits; + + if (y.extent(0) == 0) { + // no entries to update + } + else if (x.extent(0) == 0) { + // no entries to update + } + else if (alpha == KAT::zero()) { + // no entries to update + } + else { + using execution_space = typename AViewType::execution_space; + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + SingleLevelGER functor( (trans[0] == 'T') || (trans[0] == 't') + , alpha + , x + , y + , A + ); + Kokkos::parallel_for("KokkosBlas::ger[SingleLevel]", rangePolicy, functor); + } +} + +struct TwoLevelGER_LayoutLeftTag {}; +struct TwoLevelGER_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- + +// Functor for a two-level parallel_for version of GER, designed for performance on GPU. +// Kernel depends on the layout of A.
+template +struct TwoLevelGER { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using A_value_type = typename AViewType::non_const_value_type; + + using execution_space = typename AViewType::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TwoLevelGER( const bool justTranspose + , const AlphaCoeffType & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) + : justTranspose_(justTranspose) + , alpha_ (alpha) + , x_ (x) + , y_ (y) + , A_ (A) + { + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer."); + } + +public: + // LayoutLeft version: one team per column + KOKKOS_INLINE_FUNCTION void operator()( TwoLevelGER_LayoutLeftTag + , const member_type & team + ) const { + using KAT = Kokkos::Details::ArithTraits; + + if (alpha_ == KAT::zero()) { + // Nothing to do + } + else { + const IndexType M ( A_.extent(0) ); + const IndexType j ( team.league_rank() ); + if (justTranspose_) { + const A_value_type y_fixed( y_(j) ); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { + A_(i,j) += A_value_type( alpha_ * x_(i) * y_fixed ); + }); + } + else { + const A_value_type y_fixed( KAT::conj( y_(j) ) ); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { + A_(i,j) += A_value_type( alpha_ * x_(i) * y_fixed ); + }); + } + } + } + + // LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void 
operator()( TwoLevelGER_LayoutRightTag + , const member_type & team + ) const { + using KAT = Kokkos::Details::ArithTraits; + + if (alpha_ == KAT::zero()) { + // Nothing to do + } + else { + const IndexType N ( A_.extent(1) ); + const IndexType i ( team.league_rank() ); + const A_value_type x_fixed( x_(i) ); + if (justTranspose_) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { + A_(i,j) += A_value_type( alpha_ * x_fixed * y_(j) ); + }); + } + else { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { + A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( y_(j) ) ); + }); + } + } + team.team_barrier(); + } + +private: + bool justTranspose_; + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Two-level parallel version of GER. +template +void twoLevelGer( const typename AViewType::execution_space & space + , const char trans[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL twoLevelGer(), AViewType = %s\n", typeid(AViewType).name() ); + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + static_assert(std::is_integral::value, "IndexType must be an integer"); + + using KAT = Kokkos::Details::ArithTraits; + + if (y.extent(0) == 0) { + // no entries to update + return; + } + else if (x.extent(0) == 0) { + // no entries to update + return; + } + else if (alpha == 
KAT::zero()) { + // no entries to update + return; + } + + using execution_space = typename AViewType::execution_space; + constexpr bool isLayoutLeft = std::is_same::value; + using layout_tag = typename std::conditional::type; + using TeamPolicyType = Kokkos::TeamPolicy; + TeamPolicyType teamPolicy; + if (isLayoutLeft) { + // LayoutLeft: one team per column + teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); + } + else { + // LayoutRight: one team per row + teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); + } + + TwoLevelGER functor( (trans[0] == 'T') || (trans[0] == 't') + , alpha + , x + , y + , A + ); + Kokkos::parallel_for("KokkosBlas::ger[twoLevel]", teamPolicy, functor); +} + +// --------------------------------------------------------------------------------------------- + +// generalGer: use 1 level (Range) or 2 level (Team) implementation, +// depending on whether execution space is CPU or GPU. +// The 'enable_if' makes sure unused kernels are not instantiated. 
+ +template < class XViewType + , class YViewType + , class AViewType + , class IndexType + , typename std::enable_if() >::type* = nullptr + > +void generalGerImpl( const typename AViewType::execution_space & space + , const char trans[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalGerImpl(CPU), AViewType = %s\n", typeid(AViewType).name() ); + singleLevelGer(space, trans, alpha, x, y, A); +} + +template < class XViewType + , class YViewType + , class AViewType + , class IndexType + , typename std::enable_if()>::type* = nullptr + > +void generalGerImpl( const typename AViewType::execution_space & space + , const char trans[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalGerImpl(GPU), AViewType = %s\n", typeid(AViewType).name() ); + twoLevelGer(space, trans, alpha, x, y, A); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_GER_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp new file mode 100644 index 0000000000..f8e32111f5 --- /dev/null +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -0,0 +1,214 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_SPEC_HPP_ +#define KOKKOSBLAS2_GER_SPEC_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct ger_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization availability KokkosBlas::Impl::GER. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_eti_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// +// ger +// + +// Implementation of KokkosBlas::ger. 
+template < class XViewType + , class YViewType + , class AViewType + , bool tpl_spec_avail = ger_tpl_spec_avail::value + , bool eti_spec_avail = ger_eti_spec_avail::value + > +struct GER { + static void ger( const typename AViewType::execution_space & space + , const char trans[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering KokkosBlas::Impl::Ger::ger()\n" ); + + static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); + + static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); + static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); + + if ((trans[0] == 'T') || + (trans[0] == 't') || + (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } + else { + std::ostringstream oss; + oss << "In impl of KokkosBlas2::ger(): invalid trans[0] = " << trans[0]; + throw std::runtime_error(oss.str()); + } + + if (A.extent(0) != x.extent(0)) { + std::ostringstream oss; + oss << "In impl of KokkosBlas2::ger(): A.extent(0) = " << A.extent(0) + << ", but x.extent(0) = " << x.extent(0); + throw std::runtime_error(oss.str()); + } + + if (A.extent(1) != y.extent(0)) { + std::ostringstream oss; + oss << "In impl of KokkosBlas2::ger(): A.extent(1) = " << A.extent(1) + << ", but y.extent(0) = " << y.extent(0); + throw std::runtime_error(oss.str()); + } + + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::ger[ETI]" : "KokkosBlas::ger[noETI]"); + + typedef typename AViewType::size_type size_type; + const size_type numRows = A.extent(0); + const size_type numCols = A.extent(1); + + // Prefer int as the index type, but use a larger type if needed. + if (( numRows < static_cast(INT_MAX) ) && + ( numCols < static_cast(INT_MAX) )) { + generalGerImpl( space + , trans + , alpha + , x + , y + , A + ); + } + else { + generalGerImpl( space + , trans + , alpha + , x + , y + , A + ); + } + + Kokkos::Profiling::popRegion(); + } +#else + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; + +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization of KokkosBlas::Impl::GER. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _DEF macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct GER< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , false \ + , true \ + >; + +#define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct GER< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , false \ + , true \ + >; + +#include +#include + +#endif // KOKKOSBLAS2_GER_SPEC_HPP_ diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp new file mode 100644 index 0000000000..f8bd1e943e 
--- /dev/null +++ b/blas/src/KokkosBlas2_ger.hpp @@ -0,0 +1,146 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_HPP_ +#define KOKKOSBLAS2_GER_HPP_ + +#include + +namespace KokkosBlas { + +/// \brief Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param space [in] Execution space instance on which to run the kernel. +/// This may contain information about which stream to +/// run on. +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param alpha [in] Input coefficient of x * y^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void ger( const typename AViewType::execution_space & space + , const char trans[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering SRC KokkosBlas::ger(), AViewType = %s\n", typeid(AViewType).name() ); + + static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); + static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." 
); + static_assert( Kokkos::is_view::value, "YViewType must be a Kokkos::View." ); + + static_assert( static_cast(AViewType::rank) == 2, "AViewType must have rank 2." ); + static_assert( static_cast(XViewType::rank) == 1, "XViewType must have rank 1." ); + static_assert( static_cast(YViewType::rank) == 1, "YViewType must have rank 1." ); + + // Check compatibility of dimensions at run time. + if (( A.extent(0) != x.extent(0) ) || + ( A.extent(1) != y.extent(0) )) { + std::ostringstream os; + os << "KokkosBlas::ger: Dimensions of A, x, and y do not match: " + << "A is " << A.extent(0) << " by " << A.extent(1) + << ", x has size " << x.extent(0) + << ", y has size " << y.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + using ALayout = typename AViewType::array_layout; + + // Minimize the number of Impl::GER instantiations, by standardizing + // on particular View specializations for its template parameters. + typedef Kokkos::View< typename XViewType::const_value_type* + , typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout + , typename XViewType::device_type + , Kokkos::MemoryTraits + > XVT; + + typedef Kokkos::View< typename YViewType::const_value_type* + , typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout + , typename YViewType::device_type + , Kokkos::MemoryTraits + > YVT; + + typedef Kokkos::View< typename AViewType::non_const_value_type** + , ALayout + , typename AViewType::device_type + , Kokkos::MemoryTraits + > AVT; + + if (( A.extent(0) == 0 ) || + ( A.extent(1) == 0 )) { + // For degenerate cases, use fallback implementation to avoid potential + // (unlikely?) circular dependence issues by including other KokkosBlas + // headers. 
+ const bool eti_spec_avail = KokkosBlas::Impl::ger_eti_spec_avail::value; + Impl::GER::ger( space + , trans + , alpha + , x + , y + , A + ); + } + else { + Impl::GER::ger( space + , trans + , alpha + , x + , y + , A + ); + } +} + +/// \brief Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param alpha [in] Input coefficient of x * y^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void ger( const char trans[] + , const typename AViewType::const_value_type & alpha + , const XViewType & x + , const YViewType & y + , const AViewType & A + ) { + const typename AViewType::execution_space space = typename AViewType::execution_space(); + ger( space + , trans + , alpha + , x + , y + , A + ); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_GER_HPP_ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp new file mode 100644 index 0000000000..250f705950 --- /dev/null +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -0,0 +1,148 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct ger_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side BLAS (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTX, LAYOUTY, LAYOUTA, MEMSPACE) \ + template \ + struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const SCALAR* \ + , LAYOUTY \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) + +#endif + +// cuBLAS 
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTX, LAYOUTY, LAYOUTA, MEMSPACE) \ + template \ + struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUTX \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const SCALAR* \ + , LAYOUTY \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUTA \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +// We use the same layout for X, Y and A because the GER interface will +// switch the layouts of X and Y to that of A. So this TPL version will +// match any layout combination, as long as none are LayoutStride. + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template <> \ + struct ger_tpl_spec_avail< Kokkos::View< const
SCALAR* \ + , LAYOUT \ + , Kokkos::Device< Kokkos::Experimental::HIP, \ + , Kokkos::Experimental::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device< Kokkos::Experimental::HIP \ + , Kokkos::Experimental::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device< Kokkos::Experimental::HIP \ + , Kokkos::Experimental::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex , Kokkos::LayoutRight) + +#endif +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp new file mode 100644 index 0000000000..f61e896951 --- /dev/null +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl.hpp @@ -0,0 +1,35 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_HPP_
+#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_HPP_
+
+// BLAS
+#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS
+#include <KokkosBlas2_ger_tpl_spec_decl_blas.hpp>
+#endif
+
+// cuBLAS
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS
+#include <KokkosBlas2_ger_tpl_spec_decl_cublas.hpp>
+#endif
+
+// rocBLAS
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS
+#include <KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp>
+#endif
+
+#endif
diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp
new file mode 100644
index 0000000000..a704a9ffc8
--- /dev/null
+++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp
@@ -0,0 +1,420 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_BLAS_HPP_
+#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_BLAS_HPP_
+
+#include "KokkosBlas_Host_tpl.hpp"
+
+namespace KokkosBlas {
+namespace Impl {
+
+// Validates 'trans' (only T/t/H/h are accepted, anything else throws) and
+// declares A_is_ll, A_is_lr, M, N, one and LDA in the calling scope. For a
+// LayoutRight A the two extents are swapped and LDA is A.stride(0).
+#define KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA) \
+  bool A_is_ll = std::is_same<Kokkos::LayoutLeft, LAYOUTA>::value; \
+  bool A_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTA>::value; \
+  const int M = static_cast<int>(A_is_lr ? A.extent(1) : A.extent(0)); \
+  const int N = static_cast<int>(A_is_lr ? A.extent(0) : A.extent(1)); \
+  constexpr int one = 1; \
+  const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \
+  if (( trans[0] == 'T' ) || \
+      ( trans[0] == 't' ) || \
+      ( trans[0] == 'H' ) || \
+      ( trans[0] == 'h' )) { \
+  } \
+  else { \
+    throw std::runtime_error("Error: invalid 'trans' for HostBlas::ger()"); \
+  }
+
+// Real double precision rank-1 update through HostBlas<double>::ger; for a
+// LayoutRight A the roles of X and Y are exchanged.
+#define KOKKOSBLAS2_DGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const double* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const double* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< double** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef double SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & /* space */ \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dger-blas\n" ); \
+      KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \
+      if (A_is_ll) { \
+        HostBlas<SCALAR>::ger( M \
+                             , N \
+                             , alpha \
+                             , X.data() \
+                             , one \
+                             , Y.data() \
+                             , one \
+                             , A.data() \
+                             , LDA \
+                             ); \
+      } \
+      else { \
+        HostBlas<SCALAR>::ger( M \
+                             , N \
+                             , alpha \
+                             , Y.data() \
+                             , one \
+                             , X.data() \
+                             , one \
+                             , A.data() \
+                             , LDA \
+                             ); \
+      } \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+// Single precision counterpart of KOKKOSBLAS2_DGER_BLAS.
+#define KOKKOSBLAS2_SGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const float* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const float* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< float** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef float SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & /* space */ \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-sger-blas\n" ); \
+      KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \
+      if (A_is_ll) { \
+        HostBlas<SCALAR>::ger( M \
+                             , N \
+                             , alpha \
+                             , X.data() \
+                             , one \
+                             , Y.data() \
+                             , one \
+                             , A.data() \
+                             , LDA \
+                             ); \
+      } \
+      else { \
+        HostBlas<SCALAR>::ger( M \
+                             , N \
+                             , alpha \
+                             , Y.data() \
+                             , one \
+                             , X.data() \
+                             , one \
+                             , A.data() \
+                             , LDA \
+                             ); \
+      } \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+// Complex double: 'T'/'t' maps to geru (A += alpha*x*y^T) and 'H'/'h' to gerc
+// (A += alpha*x*y^H). For LayoutRight only the 'T' case is supported, by
+// exchanging X and Y; the 'H' case throws std::runtime_error.
+#define KOKKOSBLAS2_ZGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const Kokkos::complex<double>* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const Kokkos::complex<double>* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< Kokkos::complex<double>** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef Kokkos::complex<double> SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & /* space */ \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zger-blas\n" ); \
+      KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex<double>]"); \
+      const std::complex<double> alpha_val = static_cast<const std::complex<double>>(alpha); \
+      bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \
+      if (A_is_ll) { \
+        if (justTranspose) { \
+          HostBlas<std::complex<double>>::geru( M \
+                                              , N \
+                                              , alpha_val \
+                                              , reinterpret_cast<const std::complex<double>*>(X.data()) \
+                                              , one \
+                                              , reinterpret_cast<const std::complex<double>*>(Y.data()) \
+                                              , one \
+                                              , reinterpret_cast<std::complex<double>*>(A.data()) \
+                                              , LDA \
+                                              ); \
+        } \
+        else { \
+          HostBlas<std::complex<double>>::gerc( M \
+                                              , N \
+                                              , alpha_val \
+                                              , reinterpret_cast<const std::complex<double>*>(X.data()) \
+                                              , one \
+                                              , reinterpret_cast<const std::complex<double>*>(Y.data()) \
+                                              , one \
+                                              , reinterpret_cast<std::complex<double>*>(A.data()) \
+                                              , LDA \
+                                              ); \
+        } \
+      } \
+      else { \
+        if (justTranspose) { \
+          HostBlas<std::complex<double>>::geru( M \
+                                              , N \
+                                              , alpha_val \
+                                              , reinterpret_cast<const std::complex<double>*>(Y.data()) \
+                                              , one \
+                                              , reinterpret_cast<const std::complex<double>*>(X.data()) \
+                                              , one \
+                                              , reinterpret_cast<std::complex<double>*>(A.data()) \
+                                              , LDA \
+                                              ); \
+        } \
+        else { \
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZgerc() requires LayoutLeft: throwing exception\n"); \
+          Kokkos::Profiling::popRegion(); \
+          throw std::runtime_error("Error: blasZgerc() requires LayoutLeft views."); \
+        } \
+      } \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+// Complex float counterpart of KOKKOSBLAS2_ZGER_BLAS.
+#define KOKKOSBLAS2_CGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const Kokkos::complex<float>* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const Kokkos::complex<float>* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< Kokkos::complex<float>** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef Kokkos::complex<float> SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & /* space */ \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-cger-blas\n" ); \
+      KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex<float>]"); \
+      const std::complex<float> alpha_val = static_cast<const std::complex<float>>(alpha); \
+      bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \
+      if (A_is_ll) { \
+        if (justTranspose) { \
+          HostBlas<std::complex<float>>::geru( M \
+                                             , N \
+                                             , alpha_val \
+                                             , reinterpret_cast<const std::complex<float>*>(X.data()) \
+                                             , one \
+                                             , reinterpret_cast<const std::complex<float>*>(Y.data()) \
+                                             , one \
+                                             , reinterpret_cast<std::complex<float>*>(A.data()) \
+                                             , LDA \
+                                             ); \
+        } \
+        else { \
+          HostBlas<std::complex<float>>::gerc( M \
+                                             , N \
+                                             , alpha_val \
+                                             , reinterpret_cast<const std::complex<float>*>(X.data()) \
+                                             , one \
+                                             , reinterpret_cast<const std::complex<float>*>(Y.data()) \
+                                             , one \
+                                             , reinterpret_cast<std::complex<float>*>(A.data()) \
+                                             , LDA \
+                                             ); \
+        } \
+      } \
+      else { \
+        if (justTranspose) { \
+          HostBlas<std::complex<float>>::geru( M \
+                                             , N \
+                                             , alpha_val \
+                                             , reinterpret_cast<const std::complex<float>*>(Y.data()) \
+                                             , one \
+                                             , reinterpret_cast<const std::complex<float>*>(X.data()) \
+                                             , one \
+                                             , reinterpret_cast<std::complex<float>*>(A.data()) \
+                                             , LDA \
+                                             ); \
+        } \
+        else { \
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCgerc() requires LayoutLeft: throwing exception\n"); \
+          Kokkos::Profiling::popRegion(); \
+          throw std::runtime_error("Error: blasCgerc() requires LayoutLeft views."); \
+        } \
+      } \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true )
+KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false)
+KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true )
+KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false)
+
+KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true )
+KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false)
+KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true )
+KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false)
+
+KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true )
+KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false)
+KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true )
+KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false)
+
+KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true )
+KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false)
+KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true )
+KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false)
+
+} // namespace Impl
+} // namespace KokkosBlas
+
+#endif
diff --git
a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp
new file mode 100644
index 0000000000..0598a2c7f2
--- /dev/null
+++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp
@@ -0,0 +1,452 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_CUBLAS_HPP_
+#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_CUBLAS_HPP_
+
+#include <KokkosBlas_tpl_spec.hpp>
+
+namespace KokkosBlas {
+namespace Impl {
+
+// Validates 'trans' (only T/t/H/h are accepted, anything else throws) and
+// declares A_is_ll, A_is_lr, M, N, one and LDA in the calling scope. For a
+// LayoutRight A the two extents are swapped and LDA is A.stride(0).
+#define KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA) \
+  bool A_is_ll = std::is_same<Kokkos::LayoutLeft, LAYOUTA>::value; \
+  bool A_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTA>::value; \
+  const int M = static_cast<int>(A_is_lr ? A.extent(1) : A.extent(0)); \
+  const int N = static_cast<int>(A_is_lr ? A.extent(0) : A.extent(1)); \
+  constexpr int one = 1; \
+  const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \
+  if (( trans[0] == 'T' ) || \
+      ( trans[0] == 't' ) || \
+      ( trans[0] == 'H' ) || \
+      ( trans[0] == 'h' )) { \
+  } \
+  else { \
+    throw std::runtime_error("Error: invalid 'trans' for cudaBlas::ger()"); \
+  }
+
+// Real double precision rank-1 update through cublasDger on the stream of
+// 'space'; for a LayoutRight A the roles of X and Y are exchanged.
+#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const double* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const double* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< double** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef double SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & space \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dger-cublas\n" ); \
+      KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \
+      KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \
+      if (A_is_ll) { \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDger( s.handle \
+                                                , M \
+                                                , N \
+                                                , &alpha \
+                                                , X.data() \
+                                                , one \
+                                                , Y.data() \
+                                                , one \
+                                                , A.data() \
+                                                , LDA \
+                                                ) \
+                                    ); \
+      } \
+      else { \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDger( s.handle \
+                                                , M \
+                                                , N \
+                                                , &alpha \
+                                                , Y.data() \
+                                                , one \
+                                                , X.data() \
+                                                , one \
+                                                , A.data() \
+                                                , LDA \
+                                                ) \
+                                    ); \
+      } \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, nullptr) ); \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+// Single precision counterpart of KOKKOSBLAS2_DGER_CUBLAS (cublasSger).
+#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const float* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const float* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< float** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef float SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & space \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-sger-cublas\n" ); \
+      KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \
+      KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \
+      if (A_is_ll) { \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSger( s.handle \
+                                                , M \
+                                                , N \
+                                                , &alpha \
+                                                , X.data() \
+                                                , one \
+                                                , Y.data() \
+                                                , one \
+                                                , A.data() \
+                                                , LDA \
+                                                ) \
+                                    ); \
+      } \
+      else { \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSger( s.handle \
+                                                , M \
+                                                , N \
+                                                , &alpha \
+                                                , Y.data() \
+                                                , one \
+                                                , X.data() \
+                                                , one \
+                                                , A.data() \
+                                                , LDA \
+                                                ) \
+                                    ); \
+      } \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, nullptr) ); \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+// Complex double: 'T'/'t' maps to cublasZgeru and 'H'/'h' to cublasZgerc. For
+// LayoutRight only the 'T' case is supported, by exchanging X and Y; the 'H'
+// case resets the handle's stream, pops the region and throws.
+#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const Kokkos::complex<double>* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const Kokkos::complex<double>* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< Kokkos::complex<double>** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef Kokkos::complex<double> SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & space \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zger-cublas\n" ); \
+      KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex<double>]"); \
+      bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \
+      KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \
+      if (A_is_ll) { \
+        if (justTranspose) { \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZgeru( s.handle \
+                                                   , M \
+                                                   , N \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(&alpha) \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(X.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(Y.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<cuDoubleComplex*>(A.data()) \
+                                                   , LDA \
+                                                   ) \
+                                      ); \
+        } \
+        else { \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZgerc( s.handle \
+                                                   , M \
+                                                   , N \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(&alpha) \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(X.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(Y.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<cuDoubleComplex*>(A.data()) \
+                                                   , LDA \
+                                                   ) \
+                                      ); \
+        } \
+      } \
+      else { \
+        if (justTranspose) { \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZgeru( s.handle \
+                                                   , M \
+                                                   , N \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(&alpha) \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(Y.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<const cuDoubleComplex*>(X.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<cuDoubleComplex*>(A.data()) \
+                                                   , LDA \
+                                                   ) \
+                                      ); \
+        } \
+        else { \
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZgerc() requires LayoutLeft: throwing exception\n"); \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, nullptr) ); \
+          Kokkos::Profiling::popRegion(); \
+          throw std::runtime_error("Error: cublasZgerc() requires LayoutLeft views."); \
+        } \
+      } \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, nullptr) ); \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+// Complex float counterpart of KOKKOSBLAS2_ZGER_CUBLAS (cublasCgeru / cublasCgerc).
+#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \
+  template <class ExecSpace> \
+  struct GER< Kokkos::View< const Kokkos::complex<float>* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< const Kokkos::complex<float>* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , Kokkos::View< Kokkos::complex<float>** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > \
+            , true \
+            , ETI_SPEC_AVAIL \
+            > { \
+    typedef Kokkos::complex<float> SCALAR; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTX \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > XViewType; \
+    typedef Kokkos::View< const SCALAR* \
+            , LAYOUTY \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > YViewType; \
+    typedef Kokkos::View< SCALAR** \
+            , LAYOUTA \
+            , Kokkos::Device<ExecSpace, MEM_SPACE> \
+            , Kokkos::MemoryTraits<Kokkos::Unmanaged> \
+            > AViewType; \
+    \
+    static void ger( const typename AViewType::execution_space & space \
+                   , const char trans[] \
+                   , typename AViewType::const_value_type & alpha \
+                   , const XViewType & X \
+                   , const YViewType & Y \
+                   , const AViewType & A \
+                   ) { \
+      KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-cger-cublas\n" ); \
+      KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); /* may throw: keep it ahead of pushRegion */ \
+      Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex<float>]"); \
+      bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \
+      KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \
+      if (A_is_ll) { \
+        if (justTranspose) { \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCgeru( s.handle \
+                                                   , M \
+                                                   , N \
+                                                   , reinterpret_cast<const cuComplex*>(&alpha) \
+                                                   , reinterpret_cast<const cuComplex*>(X.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<const cuComplex*>(Y.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<cuComplex*>(A.data()) \
+                                                   , LDA \
+                                                   ) \
+                                      ); \
+        } \
+        else { \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCgerc( s.handle \
+                                                   , M \
+                                                   , N \
+                                                   , reinterpret_cast<const cuComplex*>(&alpha) \
+                                                   , reinterpret_cast<const cuComplex*>(X.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<const cuComplex*>(Y.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<cuComplex*>(A.data()) \
+                                                   , LDA \
+                                                   ) \
+                                      ); \
+        } \
+      } \
+      else { \
+        if (justTranspose) { \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCgeru( s.handle \
+                                                   , M \
+                                                   , N \
+                                                   , reinterpret_cast<const cuComplex*>(&alpha) \
+                                                   , reinterpret_cast<const cuComplex*>(Y.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<const cuComplex*>(X.data()) \
+                                                   , one \
+                                                   , reinterpret_cast<cuComplex*>(A.data()) \
+                                                   , LDA \
+                                                   ) \
+                                      ); \
+        } \
+        else { \
+          KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCgerc() requires LayoutLeft: throwing exception\n"); \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, nullptr) ); \
+          Kokkos::Profiling::popRegion(); \
+          throw std::runtime_error("Error: cublasCgerc() requires LayoutLeft views."); \
+        } \
+      } \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, nullptr) ); \
+      Kokkos::Profiling::popRegion(); \
+    } \
+  };
+
+KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false)
+KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false)
+
+KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false)
+KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false)
+
+KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false)
+KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false)
+
+KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false)
+KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true )
+KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false)
+
+} // namespace Impl
+} // namespace KokkosBlas
+
+#endif
diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp
new file mode 100644
index 0000000000..a8e349b38c
--- /dev/null
+++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp
@@ -0,0 +1,438 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S.
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_TPL_SPEC_DECL_ROCBLAS_HPP_ +#define KOKKOSBLAS2_GER_TPL_SPEC_DECL_ROCBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + if (( trans[0] == 'T' ) || \ + ( trans[0] == 't' ) || \ + ( trans[0] == 'H' ) || \ + ( trans[0] == 'h' )) { \ + } \ + else { \ + throw std::runtime_error( "Error: invalid 'trans' for rocBlas::ger()"); \ + } + +#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< Kokkos::View< const double* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const double* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< double** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef double SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > YViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void ger( const typename AViewType::execution_space & space \ + , const char trans[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const 
YViewType & Y \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dger-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dger( s.handle \ + , M \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , Y.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dger( s.handle \ + , M \ + , N \ + , &alpha \ + , Y.data() \ + , one \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< Kokkos::View< const float* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const float* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< float** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef float SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > YViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void ger( const typename AViewType::execution_space & space \ + , const char trans[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const YViewType & Y \ + , const AViewType & A \ + ) { \ + 
KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-sger-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_sger( s.handle \ + , M \ + , N \ + , &alpha \ + , X.data() \ + , one \ + , Y.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_sger( s.handle \ + , M \ + , N \ + , &alpha \ + , Y.data() \ + , one \ + , X.data() \ + , one \ + , A.data() \ + , LDA \ + ) \ + ); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > YViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void ger( const typename AViewType::execution_space & space \ + , const char trans[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const YViewType & Y \ + , const AViewType & A \ + ) { \ + 
KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zger-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zgeru( s.handle \ + , M \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(Y.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zgerc( s.handle \ + , M \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(Y.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zgeru( s.handle \ + , M \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(Y.data()) \ + , one \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZgerc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: rocblasZgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const Kokkos::complex* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< Kokkos::complex** \ + , LAYOUT \ + , 
Kokkos::Device \ + , Kokkos::MemoryTraits \ + > \ + , true \ + , ETI_SPEC_AVAIL \ + > { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > XViewType; \ + typedef Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > YViewType; \ + typedef Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device \ + , Kokkos::MemoryTraits \ + > AViewType; \ + \ + static void ger( const typename AViewType::execution_space & space \ + , const char trans[] \ + , typename AViewType::const_value_type & alpha \ + , const XViewType & X \ + , const YViewType & Y \ + , const AViewType & A \ + ) { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-cger-rocblas\n" ); \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cgeru( s.handle \ + , M \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(Y.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cgerc( s.handle \ + , M \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(Y.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ + ) \ + ); \ + } \ + } \ + else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cgeru( s.handle \ + , M \ + , N \ + , reinterpret_cast(&alpha) \ + , reinterpret_cast(Y.data()) \ + , one \ + , reinterpret_cast(X.data()) \ + , one \ + , reinterpret_cast(A.data()) \ + , LDA \ 
+ ) \ + ); \ + } \ + else { \ + KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCgerc() requires LayoutLeft: throwing exception\n"); \ + throw std::runtime_error("Error: rocblasCgec() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 24276f4a77..7f6ac280d4 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -225,6 +225,70 @@ void 
F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, int*, int*, const std::complex*, /* */ std::complex*, int*); +/// +/// Ger +/// +void F77_BLAS_MANGLE(sger, SGER)( int* + , int* + , const float* + , const float* + , int* + , const float* + , int* + , float* + , int* + ); +void F77_BLAS_MANGLE(dger, DGER)( int* + , int* + , const double* + , const double* + , int* + , const double* + , int* + , double* + , int* + ); +void F77_BLAS_MANGLE(cgeru, CGERU)( int* + , int* + , const std::complex* + , const std::complex* + , int* + , const std::complex* + , int* + , std::complex* + , int* + ); +void F77_BLAS_MANGLE(cgerc, CGERC)( int* + , int* + , const std::complex* + , const std::complex* + , int* + , const std::complex* + , int* + , std::complex* + , int* + ); +void F77_BLAS_MANGLE(zgeru, ZGERU)( int* + , int* + , const std::complex* + , const std::complex* + , int* + , const std::complex* + , int* + , std::complex* + , int* + ); +void F77_BLAS_MANGLE(zgerc, ZGERC)( int* + , int* + , const std::complex* + , const std::complex* + , int* + , const std::complex* + , int* + , std::complex* + , int* + ); + /// /// Trsv /// @@ -439,6 +503,13 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CGEMV F77_BLAS_MANGLE(cgemv, CGEMV) #define F77_FUNC_ZGEMV F77_BLAS_MANGLE(zgemv, ZGEMV) +#define F77_FUNC_SGER F77_BLAS_MANGLE(sger, SGER) +#define F77_FUNC_DGER F77_BLAS_MANGLE(dger, DGER) +#define F77_FUNC_CGERU F77_BLAS_MANGLE(cgeru, CGERU) +#define F77_FUNC_CGERC F77_BLAS_MANGLE(cgerc, CGERC) +#define F77_FUNC_ZGERU F77_BLAS_MANGLE(zgeru, ZGERU) +#define F77_FUNC_ZGERC F77_BLAS_MANGLE(zgerc, ZGERC) + #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) #define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv, CTRSV) @@ -540,6 +611,28 @@ void HostBlas::gemv(const char trans, int m, int n, const float alpha, F77_FUNC_SGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> +void HostBlas::ger( int m + , int n + , const float alpha + 
, const float* x + , int incx + , const float* y + , int incy + , float* a + , int lda + ) { + F77_FUNC_SGER( &m + , &n + , &alpha + , x + , &incx + , y + , &incy + , a + , &lda + ); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const float* a, int lda, /* */ float* b, int ldb) { @@ -653,6 +746,28 @@ void HostBlas::gemv(const char trans, int m, int n, const double alpha, F77_FUNC_DGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> +void HostBlas::ger( int m + , int n + , const double alpha + , const double* x + , int incx + , const double* y + , int incy + , double* a + , int lda + ) { + F77_FUNC_DGER( &m + , &n + , &alpha + , x + , &incx + , y + , &incy + , a + , &lda + ); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, int m, const double* a, int lda, /* */ double* b, int ldb) { @@ -768,7 +883,6 @@ void HostBlas >::swap(int const N, std::complex* X, int const incy) { F77_FUNC_CSWAP(&N, X, &incx, Y, &incy); } - template <> void HostBlas >::gemv(const char trans, int m, int n, const std::complex alpha, @@ -782,6 +896,50 @@ void HostBlas >::gemv(const char trans, int m, int n, (std::complex*)c, &ldc); } template <> +void HostBlas >::geru( int m + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , const std::complex* y + , int incy + , std::complex* a + , int lda + ) { + F77_FUNC_CGERU( &m + , &n + , &alpha + , (const std::complex*)x + , &incx + , (const std::complex*)y + , &incy + , (std::complex*)a + , &lda + ); +} +template <> +void HostBlas >::gerc( int m + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , const std::complex* y + , int incy + , std::complex* a + , int lda + ) { + F77_FUNC_CGERC( &m + , &n + , &alpha + , (const std::complex*)x + , &incx + , (const std::complex*)y + , &incy + , (std::complex*)a + , &lda + ); +} +template <> void HostBlas >::trsv(const char uplo, const char 
transa, const char diag, int m, const std::complex* a, int lda, @@ -923,7 +1081,6 @@ void HostBlas >::swap(int const N, std::complex* X, int const incy) { F77_FUNC_ZSWAP(&N, X, &incx, Y, &incy); } - template <> void HostBlas >::gemv( const char trans, int m, int n, const std::complex alpha, @@ -935,6 +1092,50 @@ void HostBlas >::gemv( (std::complex*)c, &ldc); } template <> +void HostBlas >::geru( int m + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , const std::complex* y + , int incy + , std::complex* a + , int lda + ) { + F77_FUNC_ZGERU( &m + , &n + , &alpha + , (const std::complex*)x + , &incx + , (const std::complex*)y + , &incy + , (std::complex*)a + , &lda + ); +} +template <> +void HostBlas >::gerc( int m + , int n + , const std::complex alpha + , const std::complex* x + , int incx + , const std::complex* y + , int incy + , std::complex* a + , int lda + ) { + F77_FUNC_ZGERC( &m + , &n + , &alpha + , (const std::complex*)x + , &incx + , (const std::complex*)y + , &incy + , (std::complex*)a + , &lda + ); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, const char diag, int m, const std::complex* a, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index da89b5aa5d..457be2cdcc 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -64,6 +64,39 @@ struct HostBlas { int lda, const T *b, int ldb, const T beta, /* */ T *c, int ldc); + static void ger( int m + , int n + , const T alpha + , const T* x + , int incx + , const T* y + , int incy + , T* a + , int lda + ); + + static void geru( int m + , int n + , const T alpha + , const T* x + , int incx + , const T* y + , int incy + , T* a + , int lda + ); + + static void gerc( int m + , int n + , const T alpha + , const T* x + , int incx + , const T* y + , int incy + , T* a + , int lda + ); + static void trsv(const char uplo, const char transa, const char diag, int m, const T *a, int lda, /* */ T 
*b, int ldb); diff --git a/blas/unit_test/Test_Blas.hpp b/blas/unit_test/Test_Blas.hpp index cfa2e41c3e..ff955d13a8 100644 --- a/blas/unit_test/Test_Blas.hpp +++ b/blas/unit_test/Test_Blas.hpp @@ -61,6 +61,7 @@ // Blas 2 #include "Test_Blas2_gemv.hpp" +#include "Test_Blas2_ger.hpp" // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp new file mode 100644 index 0000000000..cf506e3fa2 --- /dev/null +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -0,0 +1,1616 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include +#include +#include + +namespace Test { + +constexpr double piVal = 3.14159265358979323846; + +template +class GerTester +{ +public: + GerTester(); + + ~GerTester(); + + void test( const int M + , const int N + , const int nonConstConstCombinations + , const bool useAnalyticalResults = false + , const bool useHermitianOption = false + ); + +private: + typedef Kokkos::View _ViewTypeX; + typedef Kokkos::View _ViewTypeY; + typedef Kokkos::View _ViewTypeA; + + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeY::HostMirror _HostViewTypeY; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef Kokkos::View _ViewTypeExpected; + + typedef Kokkos::ArithTraits _KAT_A; + typedef typename _KAT_A::mag_type _AuxType; + + void populateVariables( ScalarA & alpha + , _HostViewTypeX & h_x + , _HostViewTypeY & h_y + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + , _ViewTypeX & x + , _ViewTypeY & y + , _ViewTypeA & A + , bool & expectedResultIsKnown + ); + + template + typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type + populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeY & h_y + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type + populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeY & h_y + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type + populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeY & h_y + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + 
>::type + populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeY & h_y + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ); + + template + typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type + compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type + compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type + compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ); + + template + typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type + compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ); + + template + T shrinkAngleToZeroTwoPiRange(const T input); + + template + void callKkGerAndCompareAgainstExpected( const ScalarA & alpha + , TX & x + , TY & y + , _ViewTypeA & A + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + , const std::string & situation + ); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; + const _AuxType _epsAbs; + const _AuxType _epsRel; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _kkGerShouldThrowException; +}; + +template +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::GerTester() + : _A_is_complex ( std::is_same>::value || std::is_same>::value ) + , _A_is_lr ( std::is_same< tLayoutA, Kokkos::LayoutRight >::value ) + , _A_is_ll ( std::is_same< tLayoutA, 
Kokkos::LayoutLeft >::value ) + , _testIsGpu ( KokkosKernels::Impl::kk_is_gpu_exec_space< typename Device::execution_space >() ) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + , _vanillaUsesDifferentOrderOfOps( _A_is_lr && _testIsGpu ) +#else + , _vanillaUsesDifferentOrderOfOps( false ) +#endif + , _epsAbs (std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9) + , _epsRel (std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6) + , _M (-1) + , _N (-1) + , _useAnalyticalResults (false) + , _useHermitianOption (false) + , _kkGerShouldThrowException (false) +{ +} + +template +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::~GerTester() +{ + // Nothing to do +} + +template +void GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::test( const int M + , const int N + , const int nonConstConstCombinations + , const bool useAnalyticalResults + , const bool useHermitianOption + ) +{ + std::cout << "Entering GerTester::test()... 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps + << ", _epsAbs = " << _epsAbs + << ", _epsRel = " << _epsRel + << std::endl; + + // ******************************************************************** + // Step 1 of 9: declare main types and variables + // ******************************************************************** + _M = M; + _N = N; + _useAnalyticalResults = useAnalyticalResults; + _useHermitianOption = useHermitianOption; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + _kkGerShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + if ((_testIsGpu == false) && + (_A_is_ll == false)) { + _kkGerShouldThrowException = true; + } + else if ((_testIsGpu == true ) && + (_A_is_ll == false)) { + _kkGerShouldThrowException = true; + } + } +#endif + + bool test_x_y (false); + bool test_cx_y (false); + bool test_x_cy (false); + bool test_cx_cy(false); + if (nonConstConstCombinations == 0) { + test_x_y = true; + } + else if (nonConstConstCombinations == 1) { + test_cx_y = true; + } + else if (nonConstConstCombinations == 2) { + test_x_cy = true; + } + else if (nonConstConstCombinations == 3) { + test_cx_cy = true; + } + else { + test_x_y = true; + test_cx_y = true; + test_x_cy = true; + test_cx_cy = true; + } + + _ViewTypeX x("X", _M); + _ViewTypeY y("Y", _N); + _ViewTypeA A("A", _M, _N); + + typename _ViewTypeX::const_type c_x = x; + typename _ViewTypeY::const_type c_y = y; + + _HostViewTypeX h_x = Kokkos::create_mirror_view(x); + _HostViewTypeY h_y = Kokkos::create_mirror_view(y); + _HostViewTypeA h_A = Kokkos::create_mirror_view(A); + + _ViewTypeExpected h_expected("expected A += alpha * x * y^{t,h}", _M, _N); + bool expectedResultIsKnown 
= false; + + ScalarA alpha(0.); + + // ******************************************************************** + // Step 2 of 9: populate alpha, h_x, h_y, h_A, h_expected, x, y, A + // ******************************************************************** + this->populateVariables( alpha + , h_x + , h_y + , h_A + , h_expected + , x + , y + , A + , expectedResultIsKnown + ); + + // ******************************************************************** + // Step 3 of 9: populate h_vanilla + // ******************************************************************** + _ViewTypeExpected h_vanilla("vanilla = A + alpha * x * y^{t,h}", _M, _N); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name() ); + this->populateVanillaValues( alpha + , h_x + , h_y + , h_A + , h_vanilla + ); + + // ******************************************************************** + // Step 4 of 9: use h_vanilla and h_expected as appropriate + // ******************************************************************** + if (expectedResultIsKnown) { + // ****************************************************************** + // Compare h_vanilla against h_expected + // ****************************************************************** + this->compareVanillaExpected( alpha + , h_vanilla + , h_expected + ); + } + else { + // ****************************************************************** + // Copy h_vanilla to h_expected + // ****************************************************************** + Kokkos::deep_copy(h_expected, h_vanilla); + } + + // ******************************************************************** + // Step 5 of 9: test with 'non const x' and 'non const y' + // ******************************************************************** + _ViewTypeA org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A, A); + + if (test_x_y) { + this->callKkGerAndCompareAgainstExpected( alpha + , x + , y + , A + , h_A + , h_expected + , "non const {x,y}" + ); + 
} + + // ******************************************************************** + // Step 6 of 9: test with const x + // ******************************************************************** + if (test_cx_y) { + Kokkos::deep_copy(A, org_A); + + this->callKkGerAndCompareAgainstExpected( alpha + , c_x + , y + , A + , h_A + , h_expected + , "const x" + ); + } + + // ******************************************************************** + // Step 7 of 9: test with const y + // ******************************************************************** + if (test_x_cy) { + Kokkos::deep_copy(A, org_A); + + this->callKkGerAndCompareAgainstExpected( alpha + , x + , c_y + , A + , h_A + , h_expected + , "const y" + ); + } + + // ******************************************************************** + // Step 8 of 9: test with const x and const y + // ******************************************************************** + if (test_cx_cy) { + Kokkos::deep_copy(A, org_A); + + this->callKkGerAndCompareAgainstExpected( alpha + , c_x + , c_y + , A + , h_A + , h_expected + , "const {x,y}" + ); + } + + // ******************************************************************** + // Step 9 of 9: tests with invalid values on the first input parameter + // ******************************************************************** + EXPECT_ANY_THROW( KokkosBlas::ger(".", alpha, x, y, A) ) << "Failed test: kk ger should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW( KokkosBlas::ger("", alpha, x, y, A) ) << "Failed test: kk ger should have thrown an exception for mode ''"; + + std::cout << "Leaving GerTester::test() - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; +} + +template +void GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::populateVariables( ScalarA & alpha + , _HostViewTypeX & h_x + , _HostViewTypeY & h_y + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected 
+ , _ViewTypeX & x + , _ViewTypeY & y + , _ViewTypeA & A + , bool & expectedResultIsKnown + ) +{ + expectedResultIsKnown = false; + + if (_useAnalyticalResults) { + this->populateAnalyticalValues( alpha + , h_x + , h_y + , h_A + , h_expected + ); + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + expectedResultIsKnown = true; + } + else if ((_M == 1) && (_N == 1)) { + alpha = 3; + + h_x[0] = 2; + + h_y[0] = 3; + + h_A(0,0) = 7; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0,0) = 25; + expectedResultIsKnown = true; + } + else if ((_M == 1) && (_N == 2)) { + alpha = 3; + + h_x[0] = 2; + + h_y[0] = 3; + h_y[1] = 4; + + h_A(0,0) = 7; + h_A(0,1) = -6; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0,0) = 25; + h_expected(0,1) = 18; + expectedResultIsKnown = true; + } + else if ((_M == 2) && (_N == 2)) { + alpha = 3; + + h_x[0] = 2; + h_x[1] = 9; + + h_y[0] = -3; + h_y[1] = 7; + + h_A(0,0) = 17; + h_A(0,1) = -43; + h_A(1,0) = 29; + h_A(1,1) = 101; + + Kokkos::deep_copy(x, h_x); + Kokkos::deep_copy(y, h_y); + Kokkos::deep_copy(A, h_A); + + h_expected(0,0) = -1; + h_expected(0,1) = -1; + h_expected(1,0) = -52; + h_expected(1,1) = 290; + expectedResultIsKnown = true; + } + else { + alpha = 3; + + Kokkos::Random_XorShift64_Pool rand_pool(13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + } + + { + ScalarY randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(y, rand_pool, randStart, randEnd); + } + + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(h_x, x); + Kokkos::deep_copy(h_y, y); + Kokkos::deep_copy(h_A, A); + } +} + +// Code for complex values +template +template 
+typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeY & h_y + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ) { + _AuxType auxI(0.); + _AuxType auxJ(0.); + _AuxType auxIpJ(0.); + _AuxType auxImJ(0.); + + alpha.real() = 1.; + alpha.imag() = -1.; + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + h_x[i].real() = sin(auxI); + h_x[i].imag() = cos(auxI); + } + + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + h_y[j].real() = cos(auxJ); + h_y[j].imag() = sin(auxJ); + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_A(i,j).real() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + h_A(i,j).imag() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + } + } + } + else { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); + h_A(i,j).real() = -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + h_A(i,j).imag() = -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + } + } + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange( 
static_cast<_AuxType>(j) ); + auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_expected(i,j).real() = -2. * sin(auxI) * sin(auxJ); + h_expected(i,j).imag() = 2. * (cos(auxIpJ) - sin(auxIpJ)); + } + } + } + else { + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); + h_expected(i,j).real() = 2. * cos(auxI) * cos(auxJ); + h_expected(i,j).imag() = -2. * sin(auxImJ); + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::populateAnalyticalValues( T & alpha + , _HostViewTypeX & h_x + , _HostViewTypeY & h_y + , _HostViewTypeA & h_A + , _ViewTypeExpected & h_expected + ) { + _AuxType auxI(0.); + _AuxType auxJ(0.); + _AuxType auxIpJ(0.); + + alpha = 3; + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + h_x[i] = sin(auxI); + } + + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + h_y[j] = cos(auxJ); + } + + for (int i = 0; i < _M; ++i) { + auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + for (int j = 0; j < _N; ++j) { + auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + h_A(i,j) = 3 * cos(auxI) * sin(auxJ); + } + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); + h_expected(i,j) = 3 * sin(auxIpJ); + } + } +} + +// Code for complex values +template +template +typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +GerTester< ScalarX 
+ , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeY & h_y + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ) { + if (_vanillaUsesDifferentOrderOfOps) { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i,j) = h_A(i,j) + alpha * _KAT_A::conj( h_y(j) ) * h_x(i); + } + } + } + else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_y(j) * h_x(i); + } + } + } + } + else { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * _KAT_A::conj( h_y(j) ); + } + } + } + else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_y(j); + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::populateVanillaValues( const T & alpha + , const _HostViewTypeX & h_x + , const _HostViewTypeY & h_y + , const _HostViewTypeA & h_A + , _ViewTypeExpected & h_vanilla + ) { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_y(j) * h_x(i); + } + } + } + else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_y(j); + } + } + } +} + +template +template +T GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::shrinkAngleToZeroTwoPiRange(const T input) +{ + T output(input); +#if 0 + T twoPi( 2. * piVal ); + if (input > 0.) 
{ + output -= std::floor( input / twoPi ) * twoPi; + } + else if (input < 0.) { + output += std::floor( -input / twoPi ) * twoPi; + } +#endif + return output; +} + +// Code for complex values +template +template +typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + if (_useAnalyticalResults) { + int numErrorsRealAbs (0); + int numErrorsRealRel (0); + int numErrorsImagAbs (0); + int numErrorsImagRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRealRel (0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel (0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()); + errorHappened = false; + if (h_expected(i,j).real() == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsRealAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i,j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + + diff = _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()); + errorHappened = false; + if (h_expected(i,j).imag() == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsImagAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); + if ( diff > diffThreshold ) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + 
<< ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_vanilla(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_vanilla(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_vanilla(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } + else { + int numErrorsReal(0); + int numErrorsImag(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if ( h_expected(i,j).real() != h_vanilla(i,j).real() ) { + if (numErrorsReal == 0) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i,j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() + << std::endl; + } + numErrorsReal++; + } + + if ( h_expected(i,j).imag() != h_vanilla(i,j).imag() ) { + if (numErrorsImag == 0) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() + << std::endl; + } + numErrorsImag++; + } + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) << "Failed test" + << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) << "Failed test" + << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; + } +} + +// Code for non-complex values +template +template +typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type +GerTester< ScalarX + , 
tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::compareVanillaExpected( const T & alpha + , const _ViewTypeExpected & h_vanilla + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + if (_useAnalyticalResults) { + int numErrorsAbs (0); + int numErrorsRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRel (0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)); + errorHappened = false; + if (h_expected(i,j) == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i,j) + << ", h_vanilla(i,j) = " << h_vanilla(i,j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", 
h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_vanilla(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } + else { + int numErrors(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if ( h_expected(i,j) != h_vanilla(i,j) ) { + if (numErrors == 0) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i,j) + << ", h_vanilla(i,j) = " << h_vanilla(i,j) + << std::endl; + } + numErrors++; + } + } // for j + } // for i + EXPECT_EQ(numErrors, 0) << "Failed test" + << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; + } +} + +// Code for complex values +template +template +typename std::enable_if< std::is_same>::value || std::is_same>::value + , void + >::type +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + int numErrorsRealAbs (0); + int numErrorsRealRel (0); + int numErrorsImagAbs (0); + int numErrorsImagRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRealRel (0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel (0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); 
+ for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()); + errorHappened = false; + if (h_expected(i,j).real() == 0.) { + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i,j).real() + << ", h_A(i,j).real() = " << h_A(i,j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + + diff = _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()); + errorHappened = false; + if (h_expected(i,j).imag() == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() + << ", h_A(i,j).imag() = " << h_A(i,j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + std::cout << "A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed + << std::endl; + if ((_M == 2131) && (_N == 2131)) { + std::cout << "Information" + << ": A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(11, 2119) = (" << h_expected(11,2119).real() << ", " << h_expected(11,2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11,2119).real() << ", " << h_A(11,2119).imag() << ")" + << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(710, 1065) = (" << h_expected(710,1065).real() << ", " << h_expected(710,1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710,1065).real() << ", " << h_A(710,1065).imag() << ")" + << std::endl; + } + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": ger result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); + if (numErrorsReal > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": ger result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); + if (numErrorsImag > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +// Code for non-complex values +template +template +typename std::enable_if< !std::is_same>::value && !std::is_same>::value + , void + >::type +GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::compareKokkosExpected( const T & alpha + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + ) { + int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); + + int numErrorsAbs (0); + int numErrorsRel (0); + _AuxType diff (0.); + _AuxType diffThreshold (0.); + bool errorHappened (false); + _AuxType maxErrorRel (0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i,j) - h_A(i,j)); + errorHappened = false; + if (h_expected(i,j) == 0.) 
{ + diffThreshold = _KAT_A::abs(_epsAbs); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } + else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { + std::cout << "ERROR, i = " << i + << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i,j) + << ", h_A(i,j) = " << h_A(i,j) + << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold + << std::endl; + } + } // for j + } // for i + std::cout << "A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed + << std::endl; + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N + << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": ger result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); + if (numErrors > 0) { + std::cout<< "WARNING" << msg.str() << std::endl; + } + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +template +template +void GerTester< ScalarX + , tLayoutX + , ScalarY + , tLayoutY + , ScalarA + , tLayoutA + , Device + >::callKkGerAndCompareAgainstExpected( const ScalarA & alpha + , TX & x + , TY & y + , _ViewTypeA & A + , const _HostViewTypeA & h_A + , const _ViewTypeExpected & h_expected + , const std::string & situation + ) +{ + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException ); + std::string mode = _useHermitianOption ? 
"H" : "T"; + bool gotStdException (false); + bool gotUnknownException(false); + try { + KokkosBlas::ger(mode.c_str(), alpha, x, y, A); + } + catch( const std::exception& e ) { + std::cout << "In Test_Blas2_ger, '" << situation << "': caught exception, e.what() = " << e.what() << std::endl; + gotStdException = true; + } + catch( ... ) { + std::cout << "In Test_Blas2_ger, '" << situation << "': caught unknown exception" << std::endl; + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation << "': unknown exception should not have happened"; + + EXPECT_EQ(gotStdException, _kkGerShouldThrowException) << "Failed test, '" << situation << "': kk ger() should" + << (_kkGerShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; + + if (( gotStdException == false ) && + ( gotUnknownException == false )) { + Kokkos::deep_copy(h_A, A); + + this->compareKokkosExpected( alpha + , h_A + , h_expected + ); + } +} + +} // namespace Test + +template +int test_ger( const std::string & caseName ) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s ...\n", caseName.c_str() ); + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTLEFT ...\n", caseName.c_str() ); + + if (true) { + Test::GerTester tester; + tester.test(0, 13, 0); + tester.test(1024, 0, 0); + tester.test(1, 1, 0); + tester.test(2, 2, 0); + tester.test(1, 2, 0); + tester.test(13, 13, 0); + tester.test(13, 1024, 0); + tester.test(13, 1024, 0 , true, false); + tester.test(13, 1024, 0 , true, true); + tester.test(50, 40, 4 ); + tester.test(1024, 1024, 0); + tester.test(2131, 2131, 0); + 
tester.test(2131, 2131, 0 , true, false); + tester.test(2131, 2131, 0 , true, true); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTLEFT\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTRIGHT ...\n", caseName.c_str() ); + + if (true) { + Test::GerTester tester; + tester.test(0, 13, 0); + tester.test(1024, 0, 0); + tester.test(1, 1, 0); + tester.test(2, 2, 0); + tester.test(1, 2, 0); + tester.test(13, 13, 0); + tester.test(13, 1024, 0); + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + tester.test(50, 40, 4); + tester.test(1024, 1024, 0); + tester.test(2131, 2131, 0); + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTRIGHT\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str() ); + + if (true) { + Test::GerTester tester; + tester.test(0, 13, 0 ); + tester.test(1024, 0, 0); + tester.test(13, 13, 0); + tester.test(13, 1024, 0); + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + tester.test(50, 40, 4); + tester.test(1024, 1024, 0); + tester.test(2131, 2131, 0); + 
tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTSTRIDE\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for MIXED LAYOUTS ...\n", caseName.c_str() ); + + if (true) { + Test::GerTester tester; + tester.test(1024, 1024, 0); + tester.test(1024, 1024, 0, true, false); + tester.test(1024, 1024, 0, true, true); + } + + if (true) { + Test::GerTester tester; + tester.test(1024, 1024, 0); + } + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for MIXED LAYOUTS\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); +#endif + + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s\n", caseName.c_str() ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_float"); + test_ger( "test case ger_float" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if 1 + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_complex_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_float"); + test_ger, Kokkos::complex, Kokkos::complex, TestExecSpace>( "test case ger_complex_float" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if 
defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double"); + test_ger( "test case ger_double" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_double"); + test_ger, Kokkos::complex, Kokkos::complex, TestExecSpace>( "test case ger_complex_double" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, ger_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_int"); + test_ger( "test case ger_int" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, ger_double_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int"); + test_ger( "test case ger_mixed_types" ); + Kokkos::Profiling::popRegion(); +} +#endif + +#endif // if 1 From b21194af44efd3b02ed00d6efa7e6cbbe60d79f2 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 28 Mar 2023 01:23:03 -0600 Subject: [PATCH 291/442] Handling compilation warnings and errors at weaver --- blas/impl/KokkosBlas2_ger_impl.hpp | 10 +++++----- .../tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index a0f00df503..62d6197c15 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -57,7 +57,7 @@ struct SingleLevelGER { } KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) 
const { - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (alpha_ == KAT::zero()) { // Nothing to do @@ -108,7 +108,7 @@ void singleLevelGer( const typename AViewType::execution_space & space static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (y.extent(0) == 0) { // no entries to update @@ -176,7 +176,7 @@ struct TwoLevelGER { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelGER_LayoutLeftTag , const member_type & team ) const { - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (alpha_ == KAT::zero()) { // Nothing to do @@ -203,7 +203,7 @@ struct TwoLevelGER { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelGER_LayoutRightTag , const member_type & team ) const { - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (alpha_ == KAT::zero()) { // Nothing to do @@ -255,7 +255,7 @@ void twoLevelGer( const typename AViewType::execution_space & space static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::Details::ArithTraits; + using KAT = Kokkos::ArithTraits; if (y.extent(0) == 0) { // no entries to update diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 5f555f926e..cda083a6b5 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -129,7 +129,7 @@ template void spgemm_numeric_cusparse( - KernelHandle *handle, lno_t m, lno_t n, lno_t k, + KernelHandle *handle, lno_t /*m*/, lno_t /*n*/, lno_t /*k*/, const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA, const ConstValuesType &valuesA, const ConstRowMapType &row_mapB, const ConstEntriesType &entriesB, const ConstValuesType &valuesB, From 414210378cc947b3c302b0f03e7f02929dba6ac9 Mon Sep 17 00:00:00 2001 From: Ernesto 
Prudencio Date: Mon, 10 Apr 2023 22:56:05 -0600 Subject: [PATCH 292/442] Addressed all feedbacks from Luc and Kim --- blas/impl/KokkosBlas2_ger_impl.hpp | 144 +++++++----------- blas/impl/KokkosBlas2_ger_spec.hpp | 75 +++------ blas/src/KokkosBlas2_ger.hpp | 82 +++++----- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 58 +++---- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 86 +++++------ .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 138 +++++++++-------- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 98 +++++------- blas/unit_test/Test_Blas2_ger.hpp | 11 +- graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 4 +- 9 files changed, 305 insertions(+), 391 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index 62d6197c15..120bfd3c13 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -27,11 +27,12 @@ namespace Impl { // Functor for a single-level parallel_for version of nontranspose GER. // The functor parallelizes over rows of the input matrix A. 
-template +template struct SingleLevelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; - using A_value_type = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; SingleLevelGER( const bool justTranspose , const AlphaCoeffType & alpha @@ -45,35 +46,25 @@ struct SingleLevelGER { , y_ (y) , A_ (A) { - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - - static_assert(std::is_integral::value, "IndexType must be an integer."); + // Nothing to do } KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { - using KAT = Kokkos::ArithTraits; - - if (alpha_ == KAT::zero()) { + if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { - const IndexType N ( A_.extent(1) ); - const A_value_type x_fixed( x_(i) ); + const IndexType N ( A_.extent(1) ); + const XComponentType x_fixed( x_(i) ); if (justTranspose_) { for (IndexType j = 0; j < N; ++j) { - A_(i,j) += A_value_type( alpha_ * x_fixed * y_(j) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * y_(j) ); } } else { for (IndexType j = 0; j < N; ++j) { - A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( y_(j) ) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( y_(j) ) ); } } } @@ -88,27 +79,18 @@ struct SingleLevelGER { }; // Single-level parallel version of GER. 
-template -void singleLevelGer( const typename AViewType::execution_space & space +void singleLevelGer( const ExecutionSpace & space , const char trans[] , const typename AViewType::const_value_type & alpha , const XViewType & x , const YViewType & y , const AViewType & A ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL singleLevelGer(), AViewType = %s\n", typeid(AViewType).name() ); - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::ArithTraits; + using AlphaCoeffType = typename AViewType::non_const_value_type; if (y.extent(0) == 0) { // no entries to update @@ -116,12 +98,11 @@ void singleLevelGer( const typename AViewType::execution_space & space else if (x.extent(0) == 0) { // no entries to update } - else if (alpha == KAT::zero()) { + else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update } else { - using execution_space = typename AViewType::execution_space; - Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); + Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); SingleLevelGER functor( (trans[0] == 'T') || (trans[0] == 't') , alpha , x @@ -139,14 +120,15 @@ struct TwoLevelGER_LayoutRightTag {}; // Functor for a two-level parallel_reduce version of GER, designed for performance on GPU. // Kernel depends on the layout of A. 
-template +template struct TwoLevelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; - using A_value_type = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelGER( const bool justTranspose , const AlphaCoeffType & alpha @@ -160,15 +142,7 @@ struct TwoLevelGER { , y_ (y) , A_ (A) { - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - - static_assert(std::is_integral::value, "IndexType must be an integer."); + // Nothing to do } public: @@ -176,24 +150,22 @@ struct TwoLevelGER { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelGER_LayoutLeftTag , const member_type & team ) const { - using KAT = Kokkos::ArithTraits; - - if (alpha_ == KAT::zero()) { + if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { const IndexType M ( A_.extent(0) ); const IndexType j ( team.league_rank() ); if (justTranspose_) { - const A_value_type y_fixed( y_(j) ); + const YComponentType y_fixed( y_(j) ); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - A_(i,j) += A_value_type( alpha_ * x_(i) * y_fixed ); + A_(i,j) += AComponentType( alpha_ * 
x_(i) * y_fixed ); }); } else { - const A_value_type y_fixed( KAT::conj( y_(j) ) ); + const YComponentType y_fixed( Kokkos::ArithTraits::conj( y_(j) ) ); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - A_(i,j) += A_value_type( alpha_ * x_(i) * y_fixed ); + A_(i,j) += AComponentType( alpha_ * x_(i) * y_fixed ); }); } } @@ -203,23 +175,21 @@ struct TwoLevelGER { KOKKOS_INLINE_FUNCTION void operator()( TwoLevelGER_LayoutRightTag , const member_type & team ) const { - using KAT = Kokkos::ArithTraits; - - if (alpha_ == KAT::zero()) { + if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do } else { - const IndexType N ( A_.extent(1) ); - const IndexType i ( team.league_rank() ); - const A_value_type x_fixed( x_(i) ); + const IndexType N ( A_.extent(1) ); + const IndexType i ( team.league_rank() ); + const XComponentType x_fixed( x_(i) ); if (justTranspose_) { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - A_(i,j) += A_value_type( alpha_ * x_fixed * y_(j) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * y_(j) ); }); } else { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - A_(i,j) += A_value_type( alpha_ * x_fixed * KAT::conj( y_(j) ) ); + A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( y_(j) ) ); }); } } @@ -235,27 +205,18 @@ struct TwoLevelGER { }; // Two-level parallel version of GER. 
-template -void twoLevelGer( const typename AViewType::execution_space & space +void twoLevelGer( const ExecutionSpace & space , const char trans[] , const typename AViewType::const_value_type & alpha , const XViewType & x , const YViewType & y , const AViewType & A ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL twoLevelGer(), AViewType = %s\n", typeid(AViewType).name() ); - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - static_assert(std::is_integral::value, "IndexType must be an integer"); - using KAT = Kokkos::ArithTraits; + using AlphaCoeffType = typename AViewType::non_const_value_type; if (y.extent(0) == 0) { // no entries to update @@ -265,15 +226,14 @@ void twoLevelGer( const typename AViewType::execution_space & space // no entries to update return; } - else if (alpha == KAT::zero()) { + else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update return; } - using execution_space = typename AViewType::execution_space; constexpr bool isLayoutLeft = std::is_same::value; using layout_tag = typename std::conditional::type; - using TeamPolicyType = Kokkos::TeamPolicy; + using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { // LayoutLeft: one team per column @@ -284,12 +244,12 @@ void twoLevelGer( const typename AViewType::execution_space & space teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TwoLevelGER functor( (trans[0] == 'T') || (trans[0] == 't') - , alpha - , x - , y - , A - ); + TwoLevelGER functor( (trans[0] == 'T') || (trans[0] == 't') + , alpha + , x 
+ , y + , A + ); Kokkos::parallel_for("KokkosBlas::ger[twoLevel]", teamPolicy, functor); } @@ -299,37 +259,37 @@ void twoLevelGer( const typename AViewType::execution_space & space // depending on whether execution space is CPU or GPU. // The 'enable_if' makes sure unused kernels are not instantiated. -template < class XViewType +template < class ExecutionSpace + , class XViewType , class YViewType , class AViewType , class IndexType - , typename std::enable_if() >::type* = nullptr + , typename std::enable_if()>::type* = nullptr > -void generalGerImpl( const typename AViewType::execution_space & space +void generalGerImpl( const ExecutionSpace & space , const char trans[] , const typename AViewType::const_value_type & alpha , const XViewType & x , const YViewType & y , const AViewType & A ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalGerImpl(CPU), AViewType = %s\n", typeid(AViewType).name() ); singleLevelGer(space, trans, alpha, x, y, A); } -template < class XViewType +template < class ExecutionSpace + , class XViewType , class YViewType , class AViewType , class IndexType - , typename std::enable_if()>::type* = nullptr + , typename std::enable_if()>::type* = nullptr > -void generalGerImpl( const typename AViewType::execution_space & space +void generalGerImpl( const ExecutionSpace & space , const char trans[] , const typename AViewType::const_value_type & alpha , const XViewType & x , const YViewType & y , const AViewType & A ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering IMPL generalGerImpl(GPU), AViewType = %s\n", typeid(AViewType).name() ); twoLevelGer(space, trans, alpha, x, y, A); } diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index f8e32111f5..ebbb38c2fd 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -73,14 +73,15 @@ namespace Impl { // // Implementation of KokkosBlas::ger. 
-template < class XViewType +template < class ExecutionSpace + , class XViewType , class YViewType , class AViewType , bool tpl_spec_avail = ger_tpl_spec_avail::value , bool eti_spec_avail = ger_eti_spec_avail::value > struct GER { - static void ger( const typename AViewType::execution_space & space + static void ger( const ExecutionSpace & space , const char trans[] , const typename AViewType::const_value_type & alpha , const XViewType & x @@ -89,42 +90,6 @@ struct GER { ) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering KokkosBlas::Impl::Ger::ger()\n" ); - - static_assert(Kokkos::is_view::value, "XViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "YViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); - - static_assert(static_cast(XViewType::rank) == 1, "XViewType must have rank 1."); - static_assert(static_cast(YViewType::rank) == 1, "YViewType must have rank 1."); - static_assert(static_cast(AViewType::rank) == 2, "AViewType must have rank 2."); - - if ((trans[0] == 'T') || - (trans[0] == 't') || - (trans[0] == 'H') || - (trans[0] == 'h')) { - // Ok - } - else { - std::ostringstream oss; - oss << "In impl of KokkosBlas2::ger(): invalid trans[0] = " << trans[0]; - throw std::runtime_error(oss.str()); - } - - if (A.extent(0) != x.extent(0)) { - std::ostringstream oss; - oss << "In impl of KokkosBlas2::ger(): A.extent(0) = " << A.extent(0) - << ", but x.extent(0) = " << x.extent(0); - throw std::runtime_error(oss.str()); - } - - if (A.extent(1) != y.extent(0)) { - std::ostringstream oss; - oss << "In impl of KokkosBlas2::ger(): A.extent(1) = " << A.extent(1) - << ", but y.extent(0) = " << y.extent(0); - throw std::runtime_error(oss.str()); - } - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::ger[ETI]" : "KokkosBlas::ger[noETI]"); typedef typename AViewType::size_type size_type; @@ -134,22 +99,22 @@ struct GER { // Prefer int as the index type, but use a larger type if needed. if (( numRows < static_cast(INT_MAX) ) && ( numCols < static_cast(INT_MAX) )) { - generalGerImpl( space - , trans - , alpha - , x - , y - , A - ); + generalGerImpl( space + , trans + , alpha + , x + , y + , A + ); } else { - generalGerImpl( space - , trans - , alpha - , x - , y - , A - ); + generalGerImpl( space + , trans + , alpha + , x + , y + , A + ); } Kokkos::Profiling::popRegion(); @@ -169,7 +134,8 @@ struct GER { // We may spread out definitions (see _DEF macro below) across one or more .cpp files. // #define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct GER< Kokkos::View< const SCALAR* \ + extern template struct GER< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ @@ -189,7 +155,8 @@ struct GER { >; #define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct GER< Kokkos::View< const SCALAR* \ + template struct GER< EXEC_SPACE \ + , Kokkos::View< const SCALAR* \ , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp index f8bd1e943e..728bb86494 100644 --- a/blas/src/KokkosBlas2_ger.hpp +++ b/blas/src/KokkosBlas2_ger.hpp @@ -23,9 +23,10 @@ namespace KokkosBlas { /// \brief Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}. 
/// -/// \tparam XViewType Input vector, as a 1-D Kokkos::View -/// \tparam YViewType Input vector, as a 1-D Kokkos::View -/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// \tparam ExecutionSpace The type of execution space +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View /// /// \param space [in] Execution space instance on which to run the kernel. /// This may contain information about which stream to @@ -36,15 +37,17 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View -template -void ger( const typename AViewType::execution_space & space +template +void ger( const ExecutionSpace & space , const char trans[] , const typename AViewType::const_value_type & alpha , const XViewType & x , const YViewType & y , const AViewType & A ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Entering SRC KokkosBlas::ger(), AViewType = %s\n", typeid(AViewType).name() ); + static_assert(Kokkos::SpaceAccessibility::accessible, "AViewType memory space must be compatible with ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, "XViewType memory space must be compatible with ExecutionSpace"); + static_assert(Kokkos::SpaceAccessibility::accessible, "YViewType memory space must be compatible with ExecutionSpace"); static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." 
); @@ -65,6 +68,25 @@ void ger( const typename AViewType::execution_space & space KokkosKernels::Impl::throw_runtime_exception(os.str()); } + + if ((trans[0] == 'T') || + (trans[0] == 't') || + (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } + else { + std::ostringstream os; + os << "KokkosBlas::ger: invalid trans[0] = '" << trans[0] + << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (( A.extent(0) == 0 ) || + ( A.extent(1) == 0 )) { + return; + } + using ALayout = typename AViewType::array_layout; // Minimize the number of Impl::GER instantiations, by standardizing @@ -87,29 +109,13 @@ void ger( const typename AViewType::execution_space & space , Kokkos::MemoryTraits > AVT; - if (( A.extent(0) == 0 ) || - ( A.extent(1) == 0 )) { - // For degenerate cases, use fallback implementation to avoid potential - // (unlikely?) circular dependence issues by including other KokkosBlas - // headers. - const bool eti_spec_avail = KokkosBlas::Impl::ger_eti_spec_avail::value; - Impl::GER::ger( space - , trans - , alpha - , x - , y - , A - ); - } - else { - Impl::GER::ger( space - , trans - , alpha - , x - , y - , A - ); - } + Impl::GER::ger( space + , trans + , alpha + , x + , y + , A + ); } /// \brief Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}. 
@@ -132,13 +138,17 @@ void ger( const char trans[] , const AViewType & A ) { const typename AViewType::execution_space space = typename AViewType::execution_space(); - ger( space - , trans - , alpha - , x - , y - , A - ); + ger< typename AViewType::execution_space + , XViewType + , YViewType + , AViewType + > ( space + , trans + , alpha + , x + , y + , A + ); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 250f705950..92c8c8c162 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -65,20 +65,20 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS #define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTX, LAYOUTY, LAYOUTA, MEMSPACE) \ - template \ + template <> \ struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ , LAYOUTX \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const SCALAR* \ , LAYOUTY \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< SCALAR** \ , LAYOUTA \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ > { \ @@ -104,31 +104,31 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRig // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template <> \ - struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device< Kokkos::Experimental::HIP, \ - , Kokkos::Experimental::HIPSpace \ - > \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device< Kokkos::Experimental::HIP \ - , Kokkos::Experimental::HIPSpace \ - > \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device< Kokkos::Experimental::HIP \ - , Kokkos::Experimental::HIPSpace \ - > \ - , 
Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template <> \ + struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device< Kokkos::HIP \ + , Kokkos::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< const SCALAR* \ + , LAYOUT \ + , Kokkos::Device< Kokkos::HIP \ + , Kokkos::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + , Kokkos::View< SCALAR** \ + , LAYOUT \ + , Kokkos::Device< Kokkos::HIP \ + , Kokkos::HIPSpace \ + > \ + , Kokkos::MemoryTraits \ + > \ + > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index a704a9ffc8..4a0326e947 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -22,36 +22,28 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ +#define KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - if (( trans[0] == 'T' ) || \ - ( trans[0] == 't' ) || \ - ( trans[0] == 'H' ) || \ - ( trans[0] == 'h' )) { \ - } \ - else { \ - throw std::runtime_error("Error: invalid 'trans' for HostBlas::ger()"); \ - } + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const double* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const double* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< double** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -60,17 +52,17 @@ namespace Impl { > { \ typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -82,9 +74,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dger-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ if (A_is_ll) { \ HostBlas::ger( M \ , N \ @@ -113,20 +104,20 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_SGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const float* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const float* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< float** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -135,17 +126,17 @@ namespace Impl { > { \ typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , 
Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -157,9 +148,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-sger-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ if (A_is_ll) { \ HostBlas::ger( M \ , N \ @@ -188,20 +178,20 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const Kokkos::complex* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -210,17 +200,17 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -232,9 +222,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zger-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ const 
std::complex alpha_val = static_cast>(alpha); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (A_is_ll) { \ @@ -277,7 +266,6 @@ namespace Impl { ); \ } \ else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasZgerc() requires LayoutLeft: throwing exception\n"); \ throw std::runtime_error("Error: blasZgerc() requires LayoutLeft views."); \ } \ } \ @@ -285,20 +273,20 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CGER_BLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const Kokkos::complex* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -307,17 +295,17 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -329,9 +317,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-cger-blas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ const std::complex alpha_val = static_cast>(alpha); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ if (A_is_ll) { \ @@ -374,7 +361,6 @@ namespace Impl { ); \ } \ else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("blasCgerc() requires LayoutLeft: throwing exception\n"); \ throw 
std::runtime_error("Error: blasCgerc() requires LayoutLeft views."); \ } \ } \ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index 0598a2c7f2..6bfe5db302 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -22,36 +22,28 @@ namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA) \ - bool A_is_ll = std::is_same::value; \ - bool A_is_lr = std::is_same::value; \ +#define KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - if (( trans[0] == 'T' ) || \ - ( trans[0] == 't' ) || \ - ( trans[0] == 'H' ) || \ - ( trans[0] == 'h' )) { \ - } \ - else { \ - throw std::runtime_error("Error: invalid 'trans' for cudaBlas::ger()"); \ - } + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const double* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const double* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< double** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -60,17 +52,17 @@ namespace Impl { > { \ typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -82,9 +74,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dger-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ if (A_is_ll) { \ @@ -120,20 +111,20 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const float* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const float* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< float** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , 
Kokkos::MemoryTraits \ > \ @@ -142,17 +133,17 @@ namespace Impl { > { \ typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -164,9 +155,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-sger-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ if (A_is_ll) { \ @@ -202,20 +192,20 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const Kokkos::complex* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -224,17 +214,17 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > 
AViewType; \ @@ -246,9 +236,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zger-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ @@ -298,7 +287,6 @@ namespace Impl { ); \ } \ else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasZgerc() requires LayoutLeft: throwing exception\n"); \ throw std::runtime_error("Error: cublasZgerc() requires LayoutLeft views."); \ } \ } \ @@ -307,20 +295,20 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUTX, LAYOUTY, LAYOUTA, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const Kokkos::complex* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ @@ -329,17 +317,17 @@ namespace Impl { > { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTX \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ - , LAYOUTY \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ - , LAYOUTA \ + , LAYOUT \ , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ @@ -351,9 +339,8 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through 
tpl-cger-cublas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ @@ -403,7 +390,6 @@ namespace Impl { ); \ } \ else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("cublasCgerc() requires LayoutLeft: throwing exception\n"); \ throw std::runtime_error("Error: cublasCgerc() requires LayoutLeft views."); \ } \ } \ @@ -412,25 +398,45 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) 
+KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) + 
+KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) + +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index a8e349b38c..0938546c4b 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -23,36 +23,28 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT) \ - bool A_is_ll = std::is_same::value; \ + bool A_is_ll = std::is_same::value; \ bool A_is_lr = std::is_same::value; \ const int M = static_cast(A_is_lr ? A.extent(1) : A.extent(0)); \ const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ constexpr int one = 1; \ - const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ - if (( trans[0] == 'T' ) || \ - ( trans[0] == 't' ) || \ - ( trans[0] == 'H' ) || \ - ( trans[0] == 'h' )) { \ - } \ - else { \ - throw std::runtime_error( "Error: invalid 'trans' for rocBlas::ger()"); \ - } + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); #define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ template <> \ struct GER< Kokkos::View< const double* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const double* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< double** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -61,17 +53,17 @@ namespace Impl { typedef double SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -82,7 +74,6 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-dger-rocblas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ @@ -124,17 +115,17 @@ namespace Impl { template <> \ struct GER< Kokkos::View< const float* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const float* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< float** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -143,17 +134,17 @@ namespace Impl { typedef float SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , 
Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -164,7 +155,6 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-sger-rocblas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ @@ -206,17 +196,17 @@ namespace Impl { template <> \ struct GER< Kokkos::View< const Kokkos::complex* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const Kokkos::complex* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -225,17 +215,17 @@ namespace Impl { typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -246,7 +236,6 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-zger-rocblas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ @@ -298,7 +287,6 @@ namespace Impl { ); \ } \ else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasZgerc() requires LayoutLeft: throwing exception\n"); \ throw std::runtime_error("Error: rocblasZgerc() 
requires LayoutLeft views."); \ } \ } \ @@ -311,17 +299,17 @@ namespace Impl { template <> \ struct GER< Kokkos::View< const Kokkos::complex* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< const Kokkos::complex* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , Kokkos::View< Kokkos::complex** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > \ , true \ @@ -330,17 +318,17 @@ namespace Impl { typedef Kokkos::complex SCALAR; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > XViewType; \ typedef Kokkos::View< const SCALAR* \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > YViewType; \ typedef Kokkos::View< SCALAR** \ , LAYOUT \ - , Kokkos::Device \ + , Kokkos::Device \ , Kokkos::MemoryTraits \ > AViewType; \ \ @@ -351,7 +339,6 @@ namespace Impl { , const YViewType & Y \ , const AViewType & A \ ) { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Passing through tpl-cger-rocblas\n" ); \ Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ @@ -403,7 +390,6 @@ namespace Impl { ); \ } \ else { \ - KOKKOS_IMPL_DO_NOT_USE_PRINTF("rocblasCgerc() requires LayoutLeft: throwing exception\n"); \ throw std::runtime_error("Error: rocblasCgec() requires LayoutLeft views."); \ } \ } \ @@ -412,25 +398,25 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true ) 
+KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::Experimental::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, true ) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::Experimental::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) 
+KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index cf506e3fa2..a158a8cf35 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -18,11 +18,10 @@ #include #include #include +#include namespace Test { -constexpr double piVal = 3.14159265358979323846; - template class GerTester { @@ -288,9 +287,9 @@ void GerTester< ScalarX typename _ViewTypeX::const_type c_x = x; typename _ViewTypeY::const_type c_y = y; - _HostViewTypeX h_x = Kokkos::create_mirror_view(x); - _HostViewTypeY h_y = Kokkos::create_mirror_view(y); - _HostViewTypeA h_A = Kokkos::create_mirror_view(A); + _HostViewTypeX h_x = Kokkos::create_mirror(x); + _HostViewTypeY h_y = Kokkos::create_mirror(y); + _HostViewTypeA h_A = Kokkos::create_mirror(A); _ViewTypeExpected h_expected("expected A += alpha * x * y^{t,h}", _M, _N); bool expectedResultIsKnown = false; @@ -776,7 +775,7 @@ T GerTester< ScalarX { T output(input); #if 0 - T twoPi( 2. * piVal ); + T twoPi( 2. * Kokkos::numbers::pi ); if (input > 0.) 
{ output -= std::floor( input / twoPi ) * twoPi; } diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index aa8180fae7..f829026e76 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -828,7 +828,7 @@ struct D2_MIS_FixedPriority { Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; - int numIter = 0; + //int numIter = 0; // AquiEEP while (workRemain) { // do another iteration Kokkos::parallel_for( @@ -853,7 +853,7 @@ struct D2_MIS_FixedPriority { // Finally, flip the worklists std::swap(worklist1, worklist2); workRemain = newWorkRemain; - numIter++; + //numIter++; // AquiEEP } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. From e4186186548001308396b0e3ff0c647576ba8819 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 00:02:32 -0600 Subject: [PATCH 293/442] All files formatted with clang 8.0 --- .../ger/KokkosBlas2_ger_eti_spec_inst.cpp.in | 6 +- .../KokkosBlas2_ger_eti_spec_avail.hpp.in | 6 +- .../KokkosBlas2_ger_eti_spec_decl.hpp.in | 6 +- blas/impl/KokkosBlas2_ger_impl.hpp | 269 +++----- blas/impl/KokkosBlas2_ger_spec.hpp | 163 ++--- blas/src/KokkosBlas2_ger.hpp | 144 ++-- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 192 +++--- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 594 +++++++--------- .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 643 +++++++----------- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 624 +++++++---------- blas/tpls/KokkosBlas_Host_tpl.cpp | 248 ++----- blas/tpls/KokkosBlas_Host_tpl.hpp | 40 +- 12 files changed, 1183 insertions(+), 1752 deletions(-) diff --git a/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in index d256c1a6a1..8199d0b87e 100644 --- 
a/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in +++ b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in @@ -20,6 +20,6 @@ namespace KokkosBlas { namespace Impl { -@BLAS2_GER_ETI_INST_BLOCK@ -} //IMPL -} //Kokkos +@BLAS2_GER_ETI_INST_BLOCK @ +} // namespace Impl +} // namespace KokkosBlas diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in index 84e377eba9..b9f569b8ae 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in @@ -19,7 +19,7 @@ namespace KokkosBlas { namespace Impl { -@BLAS2_GER_ETI_AVAIL_BLOCK@ -} //IMPL -} //Kokkos +@BLAS2_GER_ETI_AVAIL_BLOCK @ +} // namespace Impl +} // namespace KokkosBlas #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in index ee14a84823..377397468e 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in @@ -19,7 +19,7 @@ namespace KokkosBlas { namespace Impl { -@BLAS2_GER_ETI_DECL_BLOCK@ -} //IMPL -} //Kokkos +@BLAS2_GER_ETI_DECL_BLOCK @ +} // namespace Impl +} // namespace KokkosBlas #endif diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index 120bfd3c13..8fd166d94f 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -34,81 +34,64 @@ struct SingleLevelGER { using YComponentType = typename YViewType::non_const_value_type; using AComponentType = typename AViewType::non_const_value_type; - SingleLevelGER( const bool justTranspose - , const AlphaCoeffType & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & 
A - ) - : justTranspose_(justTranspose) - , alpha_ (alpha) - , x_ (x) - , y_ (y) - , A_ (A) - { + SingleLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) + : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } - KOKKOS_INLINE_FUNCTION void operator()(const IndexType & i) const { + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const IndexType N ( A_.extent(1) ); - const XComponentType x_fixed( x_(i) ); + } else { + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); if (justTranspose_) { for (IndexType j = 0; j < N; ++j) { - A_(i,j) += AComponentType( alpha_ * x_fixed * y_(j) ); + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); } - } - else { + } else { for (IndexType j = 0; j < N; ++j) { - A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( y_(j) ) ); + A_(i, j) += + AComponentType(alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); } } } } -private: - bool justTranspose_; - AlphaCoeffType alpha_; + private: + bool justTranspose_; + AlphaCoeffType alpha_; typename XViewType::const_type x_; typename YViewType::const_type y_; - AViewType A_; + AViewType A_; }; // Single-level parallel version of GER. 
-template -void singleLevelGer( const ExecutionSpace & space - , const char trans[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & A - ) { - static_assert(std::is_integral::value, "IndexType must be an integer"); +template +void singleLevelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; if (y.extent(0) == 0) { // no entries to update - } - else if (x.extent(0) == 0) { + } else if (x.extent(0) == 0) { // no entries to update - } - else if (alpha == Kokkos::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update - } - else { - Kokkos::RangePolicy rangePolicy(space, 0, A.extent(0)); - SingleLevelGER functor( (trans[0] == 'T') || (trans[0] == 't') - , alpha - , x - , y - , A - ); + } else { + Kokkos::RangePolicy rangePolicy(space, 0, + A.extent(0)); + SingleLevelGER functor( + (trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); Kokkos::parallel_for("KokkosBlas::ger[SingleLevel]", rangePolicy, functor); } } @@ -118,138 +101,124 @@ struct TwoLevelGER_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- -// Functor for a two-level parallel_reduce version of GER, designed for performance on GPU. -// Kernel depends on the layout of A. -template +// Functor for a two-level parallel_reduce version of GER, designed for +// performance on GPU. Kernel depends on the layout of A. 
+template struct TwoLevelGER { using AlphaCoeffType = typename AViewType::non_const_value_type; using XComponentType = typename XViewType::non_const_value_type; using YComponentType = typename YViewType::non_const_value_type; - using AComponentType = typename AViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; - TwoLevelGER( const bool justTranspose - , const AlphaCoeffType & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & A - ) - : justTranspose_(justTranspose) - , alpha_ (alpha) - , x_ (x) - , y_ (y) - , A_ (A) - { + TwoLevelGER(const bool justTranspose, const AlphaCoeffType& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) + : justTranspose_(justTranspose), alpha_(alpha), x_(x), y_(y), A_(A) { // Nothing to do } -public: + public: // LayoutLeft version: one team per column - KOKKOS_INLINE_FUNCTION void operator()( TwoLevelGER_LayoutLeftTag - , const member_type & team - ) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutLeftTag, + const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const IndexType M ( A_.extent(0) ); - const IndexType j ( team.league_rank() ); + } else { + const IndexType M(A_.extent(0)); + const IndexType j(team.league_rank()); if (justTranspose_) { - const YComponentType y_fixed( y_(j) ); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - A_(i,j) += AComponentType( alpha_ * x_(i) * y_fixed ); - }); - } - else { - const YComponentType y_fixed( Kokkos::ArithTraits::conj( y_(j) ) ); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, M), [&](const IndexType & i) { - A_(i,j) += AComponentType( alpha_ * x_(i) * y_fixed ); - }); + const YComponentType 
y_fixed(y_(j)); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + }); + } else { + const YComponentType y_fixed( + Kokkos::ArithTraits::conj(y_(j))); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + }); } } } // LayoutRight version: one team per row - KOKKOS_INLINE_FUNCTION void operator()( TwoLevelGER_LayoutRightTag - , const member_type & team - ) const { + KOKKOS_INLINE_FUNCTION void operator()(TwoLevelGER_LayoutRightTag, + const member_type& team) const { if (alpha_ == Kokkos::ArithTraits::zero()) { // Nothing to do - } - else { - const IndexType N ( A_.extent(1) ); - const IndexType i ( team.league_rank() ); - const XComponentType x_fixed( x_(i) ); + } else { + const IndexType N(A_.extent(1)); + const IndexType i(team.league_rank()); + const XComponentType x_fixed(x_(i)); if (justTranspose_) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - A_(i,j) += AComponentType( alpha_ * x_fixed * y_(j) ); - }); - } - else { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const IndexType & j) { - A_(i,j) += AComponentType( alpha_ * x_fixed * Kokkos::ArithTraits::conj( y_(j) ) ); - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + }); } } team.team_barrier(); } -private: - bool justTranspose_; - AlphaCoeffType alpha_; + private: + bool justTranspose_; + AlphaCoeffType alpha_; typename XViewType::const_type x_; typename YViewType::const_type y_; - AViewType A_; + AViewType A_; }; // Two-level parallel version of GER. 
-template -void twoLevelGer( const ExecutionSpace & space - , const char trans[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & A - ) { - static_assert(std::is_integral::value, "IndexType must be an integer"); +template +void twoLevelGer(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); using AlphaCoeffType = typename AViewType::non_const_value_type; if (y.extent(0) == 0) { // no entries to update return; - } - else if (x.extent(0) == 0) { + } else if (x.extent(0) == 0) { // no entries to update return; - } - else if (alpha == Kokkos::ArithTraits::zero()) { + } else if (alpha == Kokkos::ArithTraits::zero()) { // no entries to update return; } - constexpr bool isLayoutLeft = std::is_same::value; - using layout_tag = typename std::conditional::type; + constexpr bool isLayoutLeft = + std::is_same::value; + using layout_tag = + typename std::conditional::type; using TeamPolicyType = Kokkos::TeamPolicy; TeamPolicyType teamPolicy; if (isLayoutLeft) { // LayoutLeft: one team per column teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); - } - else { + } else { // LayoutRight: one team per row teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); } - TwoLevelGER functor( (trans[0] == 'T') || (trans[0] == 't') - , alpha - , x - , y - , A - ); + TwoLevelGER + functor((trans[0] == 'T') || (trans[0] == 't'), alpha, x, y, A); Kokkos::parallel_for("KokkosBlas::ger[twoLevel]", teamPolicy, functor); } @@ -259,37 +228,25 @@ void twoLevelGer( const ExecutionSpace & space // depending on whether execution space is CPU or GPU. // The 'enable_if' makes sure unused kernels are not instantiated. 
-template < class ExecutionSpace - , class XViewType - , class YViewType - , class AViewType - , class IndexType - , typename std::enable_if()>::type* = nullptr - > -void generalGerImpl( const ExecutionSpace & space - , const char trans[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & A - ) { +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { singleLevelGer(space, trans, alpha, x, y, A); } -template < class ExecutionSpace - , class XViewType - , class YViewType - , class AViewType - , class IndexType - , typename std::enable_if()>::type* = nullptr - > -void generalGerImpl( const ExecutionSpace & space - , const char trans[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & A - ) { +template ()>::type* = nullptr> +void generalGerImpl(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { twoLevelGer(space, trans, alpha, x, y, A); } diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index ebbb38c2fd..d9f9dcd272 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -35,30 +35,23 @@ struct ger_eti_spec_avail { } // namespace KokkosBlas // -// Macro for declaration of full specialization availability KokkosBlas::Impl::GER. -// This is NOT for users!!! -// All the declarations of full specializations go in this header file. -// We may spread out definitions (see _INST macro below) across one or more .cpp files. +// Macro for declaration of full specialization availability +// KokkosBlas::Impl::GER. This is NOT for users!!! 
All the declarations of full +// specializations go in this header file. We may spread out definitions (see +// _INST macro below) across one or more .cpp files. // -#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct ger_eti_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_eti_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -73,55 +66,42 @@ namespace Impl { // // Implementation of KokkosBlas::ger. -template < class ExecutionSpace - , class XViewType - , class YViewType - , class AViewType - , bool tpl_spec_avail = ger_tpl_spec_avail::value - , bool eti_spec_avail = ger_eti_spec_avail::value - > +template ::value, + bool eti_spec_avail = + ger_eti_spec_avail::value> struct GER { - static void ger( const ExecutionSpace & space - , const char trans[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & A - ) + static void ger(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY { - Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::ger[ETI]" : "KokkosBlas::ger[noETI]"); + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? 
"KokkosBlas::ger[ETI]" + : "KokkosBlas::ger[noETI]"); typedef typename AViewType::size_type size_type; const size_type numRows = A.extent(0); const size_type numCols = A.extent(1); // Prefer int as the index type, but use a larger type if needed. - if (( numRows < static_cast(INT_MAX) ) && - ( numCols < static_cast(INT_MAX) )) { - generalGerImpl( space - , trans - , alpha - , x - , y - , A - ); - } - else { - generalGerImpl( space - , trans - , alpha - , x - , y - , A - ); + if ((numRows < static_cast(INT_MAX)) && + (numCols < static_cast(INT_MAX))) { + generalGerImpl( + space, trans, alpha, x, y, A); + } else { + generalGerImpl( + space, trans, alpha, x, y, A); } Kokkos::Profiling::popRegion(); } #else - ; -#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY }; } // namespace Impl @@ -131,51 +111,36 @@ struct GER { // Macro for declaration of full specialization of KokkosBlas::Impl::GER. // This is NOT for users!!! // All the declarations of full specializations go in this header file. -// We may spread out definitions (see _DEF macro below) across one or more .cpp files. +// We may spread out definitions (see _DEF macro below) across one or more .cpp +// files. 
// -#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - extern template struct GER< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , false \ - , true \ - >; +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; #define KOKKOSBLAS2_GER_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct GER< EXEC_SPACE \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , false \ - , true \ - >; + template struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; #include #include -#endif // KOKKOSBLAS2_GER_SPEC_HPP_ +#endif // KOKKOSBLAS2_GER_SPEC_HPP_ diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp index 728bb86494..218b01bb2e 100644 --- a/blas/src/KokkosBlas2_ger.hpp +++ b/blas/src/KokkosBlas2_ger.hpp @@ -37,85 +37,85 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View -template -void ger( const ExecutionSpace & space - , const char trans[] - , const typename AViewType::const_value_type & alpha - , const 
XViewType & x - , const YViewType & y - , const AViewType & A - ) { - static_assert(Kokkos::SpaceAccessibility::accessible, "AViewType memory space must be compatible with ExecutionSpace"); - static_assert(Kokkos::SpaceAccessibility::accessible, "XViewType memory space must be compatible with ExecutionSpace"); - static_assert(Kokkos::SpaceAccessibility::accessible, "YViewType memory space must be compatible with ExecutionSpace"); - - static_assert( Kokkos::is_view::value, "AViewType must be a Kokkos::View." ); - static_assert( Kokkos::is_view::value, "XViewType must be a Kokkos::View." ); - static_assert( Kokkos::is_view::value, "YViewType must be a Kokkos::View." ); - - static_assert( static_cast(AViewType::rank) == 2, "AViewType must have rank 2." ); - static_assert( static_cast(XViewType::rank) == 1, "XViewType must have rank 1." ); - static_assert( static_cast(YViewType::rank) == 1, "YViewType must have rank 1." ); +template +void ger(const ExecutionSpace& space, const char trans[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + static_assert( + Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be compatible with ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be compatible with ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be compatible with ExecutionSpace"); + + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank) == 2, + "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank) == 1, + "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank) == 1, + "YViewType must have rank 1."); 
// Check compatibility of dimensions at run time. - if (( A.extent(0) != x.extent(0) ) || - ( A.extent(1) != y.extent(0) )) { + if ((A.extent(0) != x.extent(0)) || (A.extent(1) != y.extent(0))) { std::ostringstream os; os << "KokkosBlas::ger: Dimensions of A, x, and y do not match: " - << "A is " << A.extent(0) << " by " << A.extent(1) - << ", x has size " << x.extent(0) - << ", y has size " << y.extent(0); + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " + << x.extent(0) << ", y has size " << y.extent(0); KokkosKernels::Impl::throw_runtime_exception(os.str()); } - - if ((trans[0] == 'T') || - (trans[0] == 't') || - (trans[0] == 'H') || + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || (trans[0] == 'h')) { // Ok - } - else { + } else { std::ostringstream os; os << "KokkosBlas::ger: invalid trans[0] = '" << trans[0] << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if (( A.extent(0) == 0 ) || - ( A.extent(1) == 0 )) { + if ((A.extent(0) == 0) || (A.extent(1) == 0)) { return; } using ALayout = typename AViewType::array_layout; - // Minimize the number of Impl::GER instantiations, by standardizing + // Minimize the number of Impl::GER instantiations, by standardizing // on particular View specializations for its template parameters. 
- typedef Kokkos::View< typename XViewType::const_value_type* - , typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout - , typename XViewType::device_type - , Kokkos::MemoryTraits - > XVT; - - typedef Kokkos::View< typename YViewType::const_value_type* - , typename KokkosKernels::Impl::GetUnifiedLayoutPreferring::array_layout - , typename YViewType::device_type - , Kokkos::MemoryTraits - > YVT; - - typedef Kokkos::View< typename AViewType::non_const_value_type** - , ALayout - , typename AViewType::device_type - , Kokkos::MemoryTraits - > AVT; - - Impl::GER::ger( space - , trans - , alpha - , x - , y - , A - ); + typedef Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits > + XVT; + + typedef Kokkos::View::array_layout, + typename YViewType::device_type, + Kokkos::MemoryTraits > + YVT; + + typedef Kokkos::View > + AVT; + + Impl::GER::ger(space, trans, alpha, x, y, A); } /// \brief Rank-1 update of a general matrix: A = A + alpha * x * y^{T,H}. 
@@ -131,26 +131,14 @@ void ger( const ExecutionSpace & space /// \param y [in] Input vector, as a 1-D Kokkos::View /// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View template -void ger( const char trans[] - , const typename AViewType::const_value_type & alpha - , const XViewType & x - , const YViewType & y - , const AViewType & A - ) { - const typename AViewType::execution_space space = typename AViewType::execution_space(); - ger< typename AViewType::execution_space - , XViewType - , YViewType - , AViewType - > ( space - , trans - , alpha - , x - , y - , A - ); +void ger(const char trans[], const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + ger( + space, trans, alpha, x, y, A); } -} // namespace KokkosBlas +} // namespace KokkosBlas -#endif // KOKKOSBLAS2_GER_HPP_ +#endif // KOKKOSBLAS2_GER_HPP_ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 92c8c8c162..8bedf41523 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -28,121 +28,133 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTX, LAYOUTY, LAYOUTA, MEMSPACE) \ - template \ - struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUTX \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const SCALAR* \ - , LAYOUTY \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUTA \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTX, LAYOUTY, LAYOUTA, \ + MEMSPACE) \ + template \ + struct ger_tpl_spec_avail< \ + Kokkos::View, \ + 
Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::HostSpace) 
+KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::HostSpace) #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTX, LAYOUTY, LAYOUTA, MEMSPACE) \ - template <> \ - struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUTX \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const SCALAR* \ - , LAYOUTY \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUTA \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTX, LAYOUTY, \ + LAYOUTA, MEMSPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // We use the same layout for X, Y and Abecause the GER interface will // switch the layouts of X and Y to that of A. So this TPL version will // match any layout combination, as long as none are LayoutStride. 
-KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) - -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, 
Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::CudaSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ - template <> \ - struct ger_tpl_spec_avail< Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device< Kokkos::HIP \ - , Kokkos::HIPSpace \ - > \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device< Kokkos::HIP \ - , Kokkos::HIPSpace \ - > \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device< Kokkos::HIP \ - , Kokkos::HIPSpace \ - > \ - , Kokkos::MemoryTraits \ - > \ - > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ + template <> \ + struct ger_tpl_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex , Kokkos::LayoutRight) 
+KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight) #endif } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ +#endif // KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_HPP_ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index 4a0326e947..d5626dd604 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -30,363 +30,265 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & /* space */ \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { 
\ - HostBlas::ger( M \ - , N \ - , alpha \ - , X.data() \ - , one \ - , Y.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - else { \ - HostBlas::ger( M \ - , N \ - , alpha \ - , Y.data() \ - , one \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER< \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ + A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ + A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ 
- , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & /* space */ \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::ger( M \ - , N \ - , alpha \ - , X.data() \ - , one \ - , Y.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - else { \ - HostBlas::ger( M \ - , N \ - , alpha \ - , Y.data() \ - , one \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER< \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ + A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ + A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, 
MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & /* space */ \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( M \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(Y.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - HostBlas>::gerc( M \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(Y.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - } \ - else { \ - if (justTranspose) { \ - HostBlas>::geru( M \ - , N \ - , alpha_val \ - , reinterpret_cast*>(Y.data()) \ - , one \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - throw 
std::runtime_error("Error: blasZgerc() requires LayoutLeft views."); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = \ + static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + throw std::runtime_error( \ + "Error: blasZgerc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< 
Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & /* space */ \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( M \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(Y.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - HostBlas>::gerc( M \ - , N \ - , alpha_val \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(Y.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - } \ - else { \ - if (justTranspose) { \ - HostBlas>::geru( M \ - , N \ - , alpha_val \ - , reinterpret_cast*>(Y.data()) \ - , one \ - , reinterpret_cast*>(X.data()) \ - , one \ - , reinterpret_cast*>(A.data()) \ - , LDA \ - ); \ - } \ - else { \ - throw std::runtime_error("Error: blasCgerc() requires LayoutLeft views."); \ - 
} \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = \ + static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + throw std::runtime_error( \ + "Error: blasCgerc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, 
Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, true ) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index 6bfe5db302..e106cacc63 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -30,412 +30,287 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDger( s.handle \ - , M \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , Y.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasDger( s.handle \ - , M \ - , N \ - , &alpha \ - , Y.data() \ - , one \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER< \ + Kokkos::View, \ + 
Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), one, \ + A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), one, \ + A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef 
Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSger( s.handle \ - , M \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , Y.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSger( s.handle \ - , M \ - , N \ - , &alpha \ - , Y.data() \ - , one \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + 
KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), one, \ + A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), one, \ + A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( 
cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZgerc( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - } \ - else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasZgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - throw std::runtime_error("Error: cublasZgerc() requires LayoutLeft views."); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + 
KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: cublasZgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef 
Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, space.cuda_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCgerc( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - } \ - else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasCgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - throw std::runtime_error("Error: cublasCgerc() requires LayoutLeft views."); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( cublasSetStream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + 
Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: cublasCgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) 
-KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) 
-KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true ) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true ) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true ) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true ) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) 
KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index 0938546c4b..0018c36df8 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -30,392 +30,272 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const double* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< double** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef double SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dger( s.handle \ - , M \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , Y.data() \ - , one \ - , 
A.data() \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_dger( s.handle \ - , M \ - , N \ - , &alpha \ - , Y.data() \ - , one \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), \ + one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), \ + one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< Kokkos::View< const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< 
const float* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< float** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef float SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_sger( s.handle \ - , M \ - , N \ - , &alpha \ - , X.data() \ - , one \ - , Y.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_sger( s.handle \ - , M \ - , N \ - , &alpha \ - , Y.data() \ - , one \ - , X.data() \ - , one \ - , A.data() \ - , LDA \ - ) \ - ); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, NULL) ); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + 
XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), \ + one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), \ + one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & 
space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zgerc( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - } \ - else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_zgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - throw std::runtime_error("Error: rocblasZgerc() requires LayoutLeft views."); \ - } \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + 
typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: rocblasZgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< Kokkos::View< const Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< const 
Kokkos::complex* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , Kokkos::View< Kokkos::complex** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > \ - , true \ - , ETI_SPEC_AVAIL \ - > { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > XViewType; \ - typedef Kokkos::View< const SCALAR* \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > YViewType; \ - typedef Kokkos::View< SCALAR** \ - , LAYOUT \ - , Kokkos::Device \ - , Kokkos::MemoryTraits \ - > AViewType; \ - \ - static void ger( const typename AViewType::execution_space & space \ - , const char trans[] \ - , typename AViewType::const_value_type & alpha \ - , const XViewType & X \ - , const YViewType & Y \ - , const AViewType & A \ - ) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_set_stream(s.handle, space.hip_stream()) ); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cgerc( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(Y.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - } \ - else { \ - if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( rocblas_cgeru( s.handle \ - , M \ - , N \ - , reinterpret_cast(&alpha) \ - , reinterpret_cast(Y.data()) \ - , one \ - , 
reinterpret_cast(X.data()) \ - , one \ - , reinterpret_cast(A.data()) \ - , LDA \ - ) \ - ); \ - } \ - else { \ - throw std::runtime_error("Error: rocblasCgec() requires LayoutLeft views."); \ - } \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + 
if (justTranspose) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: rocblasCgec() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true ) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, 
Kokkos::HIPSpace, true ) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true ) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // namespace Impl diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 7f6ac280d4..37733f609e 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -228,66 +228,26 @@ void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, int*, int*, /// /// Ger /// -void F77_BLAS_MANGLE(sger, SGER)( int* - , int* - , const float* - , const float* - , int* - , const float* - , int* - , float* - , int* - ); -void F77_BLAS_MANGLE(dger, DGER)( int* - , int* - , const double* - , const double* - , int* - , const double* - , int* - , double* - , int* - ); -void F77_BLAS_MANGLE(cgeru, CGERU)( int* - , int* - , const std::complex* - , const std::complex* - , int* - , const std::complex* - , int* - , std::complex* - , int* - ); -void F77_BLAS_MANGLE(cgerc, CGERC)( int* - , int* - , const std::complex* - , const std::complex* - , int* - , const std::complex* - , int* - , std::complex* - , int* - ); -void F77_BLAS_MANGLE(zgeru, ZGERU)( int* - , int* - , const std::complex* - , const std::complex* - , int* - , const std::complex* - , int* - , std::complex* - , int* - ); -void F77_BLAS_MANGLE(zgerc, ZGERC)( int* - , int* - , const std::complex* - , const std::complex* - , int* - , const std::complex* - , int* - , std::complex* - , int* - ); +void F77_BLAS_MANGLE(sger, SGER)(int*, int*, const float*, const float*, int*, + const float*, int*, float*, int*); +void F77_BLAS_MANGLE(dger, DGER)(int*, int*, const double*, const double*, int*, + const double*, int*, double*, int*); +void 
F77_BLAS_MANGLE(cgeru, CGERU)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zgeru, ZGERU)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); +void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, const std::complex*, + const std::complex*, int*, + const std::complex*, int*, + std::complex*, int*); /// /// Trsv @@ -503,8 +463,8 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CGEMV F77_BLAS_MANGLE(cgemv, CGEMV) #define F77_FUNC_ZGEMV F77_BLAS_MANGLE(zgemv, ZGEMV) -#define F77_FUNC_SGER F77_BLAS_MANGLE(sger, SGER) -#define F77_FUNC_DGER F77_BLAS_MANGLE(dger, DGER) +#define F77_FUNC_SGER F77_BLAS_MANGLE(sger, SGER) +#define F77_FUNC_DGER F77_BLAS_MANGLE(dger, DGER) #define F77_FUNC_CGERU F77_BLAS_MANGLE(cgeru, CGERU) #define F77_FUNC_CGERC F77_BLAS_MANGLE(cgerc, CGERC) #define F77_FUNC_ZGERU F77_BLAS_MANGLE(zgeru, ZGERU) @@ -611,26 +571,10 @@ void HostBlas::gemv(const char trans, int m, int n, const float alpha, F77_FUNC_SGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger( int m - , int n - , const float alpha - , const float* x - , int incx - , const float* y - , int incy - , float* a - , int lda - ) { - F77_FUNC_SGER( &m - , &n - , &alpha - , x - , &incx - , y - , &incy - , a - , &lda - ); +void HostBlas::ger(int m, int n, const float alpha, const float* x, + int incx, const float* y, int incy, float* a, + int lda) { + F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, @@ -746,26 +690,10 @@ void HostBlas::gemv(const char trans, int m, int n, const double alpha, F77_FUNC_DGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); 
} template <> -void HostBlas::ger( int m - , int n - , const double alpha - , const double* x - , int incx - , const double* y - , int incy - , double* a - , int lda - ) { - F77_FUNC_DGER( &m - , &n - , &alpha - , x - , &incx - , y - , &incy - , a - , &lda - ); +void HostBlas::ger(int m, int n, const double alpha, const double* x, + int incx, const double* y, int incy, double* a, + int lda) { + F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, @@ -896,48 +824,22 @@ void HostBlas >::gemv(const char trans, int m, int n, (std::complex*)c, &ldc); } template <> -void HostBlas >::geru( int m - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , const std::complex* y - , int incy - , std::complex* a - , int lda - ) { - F77_FUNC_CGERU( &m - , &n - , &alpha - , (const std::complex*)x - , &incx - , (const std::complex*)y - , &incy - , (std::complex*)a - , &lda - ); -} -template <> -void HostBlas >::gerc( int m - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , const std::complex* y - , int incy - , std::complex* a - , int lda - ) { - F77_FUNC_CGERC( &m - , &n - , &alpha - , (const std::complex*)x - , &incx - , (const std::complex*)y - , &incy - , (std::complex*)a - , &lda - ); +void HostBlas >::geru( + int m, int n, const std::complex alpha, const std::complex* x, + int incx, const std::complex* y, int incy, std::complex* a, + int lda) { + F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, (std::complex*)a, + &lda); +} +template <> +void HostBlas >::gerc( + int m, int n, const std::complex alpha, const std::complex* x, + int incx, const std::complex* y, int incy, std::complex* a, + int lda) { + F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, (std::complex*)a, + &lda); } template <> void HostBlas >::trsv(const char uplo, const char 
transa, @@ -1092,48 +994,22 @@ void HostBlas >::gemv( (std::complex*)c, &ldc); } template <> -void HostBlas >::geru( int m - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , const std::complex* y - , int incy - , std::complex* a - , int lda - ) { - F77_FUNC_ZGERU( &m - , &n - , &alpha - , (const std::complex*)x - , &incx - , (const std::complex*)y - , &incy - , (std::complex*)a - , &lda - ); -} -template <> -void HostBlas >::gerc( int m - , int n - , const std::complex alpha - , const std::complex* x - , int incx - , const std::complex* y - , int incy - , std::complex* a - , int lda - ) { - F77_FUNC_ZGERC( &m - , &n - , &alpha - , (const std::complex*)x - , &incx - , (const std::complex*)y - , &incy - , (std::complex*)a - , &lda - ); +void HostBlas >::geru( + int m, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, + (std::complex*)a, &lda); +} +template <> +void HostBlas >::gerc( + int m, int n, const std::complex alpha, + const std::complex* x, int incx, const std::complex* y, + int incy, std::complex* a, int lda) { + F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, + (std::complex*)a, &lda); } template <> void HostBlas >::trsv(const char uplo, const char transa, diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 457be2cdcc..cd53537ea6 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -64,38 +64,14 @@ struct HostBlas { int lda, const T *b, int ldb, const T beta, /* */ T *c, int ldc); - static void ger( int m - , int n - , const T alpha - , const T* x - , int incx - , const T* y - , int incy - , T* a - , int lda - ); - - static void geru( int m - , int n - , const T alpha - , const T* x - , int incx - , const T* y - , int incy - , T* a - 
, int lda - ); - - static void gerc( int m - , int n - , const T alpha - , const T* x - , int incx - , const T* y - , int incy - , T* a - , int lda - ); + static void ger(int m, int n, const T alpha, const T *x, int incx, const T *y, + int incy, T *a, int lda); + + static void geru(int m, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); + + static void gerc(int m, int n, const T alpha, const T *x, int incx, + const T *y, int incy, T *a, int lda); static void trsv(const char uplo, const char transa, const char diag, int m, const T *a, int lda, From 7ce9d9f831fe39aba2e44fc83e996ee06b8a4808 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 00:11:50 -0600 Subject: [PATCH 294/442] The clang formatting from kokkos-dev-2 puts a space into these 3 files, which needed (the space) to be removed in my Mac in order for the compilation to work. Tests pass in my Mac. --- .../ger/KokkosBlas2_ger_eti_spec_inst.cpp.in | 2 +- .../KokkosBlas2_ger_eti_spec_avail.hpp.in | 2 +- .../KokkosBlas2_ger_eti_spec_decl.hpp.in | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in index 8199d0b87e..edfdef0a93 100644 --- a/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in +++ b/blas/eti/generated_specializations_cpp/ger/KokkosBlas2_ger_eti_spec_inst.cpp.in @@ -20,6 +20,6 @@ namespace KokkosBlas { namespace Impl { -@BLAS2_GER_ETI_INST_BLOCK @ +@BLAS2_GER_ETI_INST_BLOCK@ } // namespace Impl } // namespace KokkosBlas diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in index b9f569b8ae..a456744bd1 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in +++ 
b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_avail.hpp.in @@ -19,7 +19,7 @@ namespace KokkosBlas { namespace Impl { -@BLAS2_GER_ETI_AVAIL_BLOCK @ +@BLAS2_GER_ETI_AVAIL_BLOCK@ } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in index 377397468e..3ca1a64a8e 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in @@ -19,7 +19,7 @@ namespace KokkosBlas { namespace Impl { -@BLAS2_GER_ETI_DECL_BLOCK @ +@BLAS2_GER_ETI_DECL_BLOCK@ } // namespace Impl } // namespace KokkosBlas #endif From 629337c26fa11403dfe9daafdd84278eee505d3a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 00:26:00 -0600 Subject: [PATCH 295/442] Needed to format two extra files in kokkos-dev-2 in order for the automatic 'check' step to pass --- blas/unit_test/Test_Blas2_ger.hpp | 1711 +++++++++--------- graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 4 +- 2 files changed, 816 insertions(+), 899 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index a158a8cf35..5b26344ded 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -22,258 +22,204 @@ namespace Test { -template -class GerTester -{ -public: +template +class GerTester { + public: GerTester(); ~GerTester(); - void test( const int M - , const int N - , const int nonConstConstCombinations - , const bool useAnalyticalResults = false - , const bool useHermitianOption = false - ); + void test(const int M, const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults = false, + const bool useHermitianOption = false); -private: - typedef Kokkos::View _ViewTypeX; - typedef Kokkos::View _ViewTypeY; + private: + typedef Kokkos::View 
_ViewTypeX; + typedef Kokkos::View _ViewTypeY; typedef Kokkos::View _ViewTypeA; - typedef typename _ViewTypeX::HostMirror _HostViewTypeX; - typedef typename _ViewTypeY::HostMirror _HostViewTypeY; - typedef typename _ViewTypeA::HostMirror _HostViewTypeA; + typedef typename _ViewTypeX::HostMirror _HostViewTypeX; + typedef typename _ViewTypeY::HostMirror _HostViewTypeY; + typedef typename _ViewTypeA::HostMirror _HostViewTypeA; typedef Kokkos::View _ViewTypeExpected; typedef Kokkos::ArithTraits _KAT_A; - typedef typename _KAT_A::mag_type _AuxType; - - void populateVariables( ScalarA & alpha - , _HostViewTypeX & h_x - , _HostViewTypeY & h_y - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - , _ViewTypeX & x - , _ViewTypeY & y - , _ViewTypeA & A - , bool & expectedResultIsKnown - ); + typedef typename _KAT_A::mag_type _AuxType; + + void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, _ViewTypeX& x, + _ViewTypeY& y, _ViewTypeA& A, + bool& expectedResultIsKnown); template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeY & h_y - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ); - + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeY & h_y - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); 
template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - populateVanillaValues( const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeY & h_y - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ); - + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - populateVanillaValues( const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeY & h_y - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ); - + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareVanillaExpected(const T& alpha, const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); template - typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type - compareKokkosExpected( const T & 
alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected); template - typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type - compareKokkosExpected( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ); + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareKokkosExpected(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected); template T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkGerAndCompareAgainstExpected( const ScalarA & alpha - , TX & x - , TY & y - , _ViewTypeA & A - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - , const std::string & situation - ); - - const bool _A_is_complex; - const bool _A_is_lr; - const bool _A_is_ll; - const bool _testIsGpu; - const bool _vanillaUsesDifferentOrderOfOps; + void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; const _AuxType _epsAbs; const _AuxType _epsRel; - int _M; - int _N; - bool _useAnalyticalResults; - bool _useHermitianOption; - bool _kkGerShouldThrowException; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _kkGerShouldThrowException; }; -template -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::GerTester() - : _A_is_complex ( std::is_same>::value || std::is_same>::value ) - , _A_is_lr ( std::is_same< tLayoutA, Kokkos::LayoutRight >::value ) - , _A_is_ll ( 
std::is_same< tLayoutA, Kokkos::LayoutLeft >::value ) - , _testIsGpu ( KokkosKernels::Impl::kk_is_gpu_exec_space< typename Device::execution_space >() ) +template +GerTester::GerTester() + : _A_is_complex(std::is_same>::value || + std::is_same>::value), + _A_is_lr(std::is_same::value), + _A_is_ll(std::is_same::value), + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< + typename Device::execution_space>()) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS - , _vanillaUsesDifferentOrderOfOps( _A_is_lr && _testIsGpu ) + , + _vanillaUsesDifferentOrderOfOps(_A_is_lr && _testIsGpu) #else - , _vanillaUsesDifferentOrderOfOps( false ) + , + _vanillaUsesDifferentOrderOfOps(false) #endif - , _epsAbs (std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9) - , _epsRel (std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6) - , _M (-1) - , _N (-1) - , _useAnalyticalResults (false) - , _useHermitianOption (false) - , _kkGerShouldThrowException (false) -{ + , + _epsAbs(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), + _epsRel(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _M(-1), + _N(-1), + _useAnalyticalResults(false), + _useHermitianOption(false), + _kkGerShouldThrowException(false) { } -template -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::~GerTester() -{ +template +GerTester::~GerTester() { // Nothing to do } -template -void GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::test( const int M - , const int N - , const int nonConstConstCombinations - , const bool useAnalyticalResults - , const bool useHermitianOption - ) -{ - std::cout << "Entering GerTester::test()... 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; - - std::cout << "_A_is_complex = " << _A_is_complex - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", _testIsGpu = " << _testIsGpu - << ", _vanillaUsesDifferentOrderOfOps = " << _vanillaUsesDifferentOrderOfOps - << ", _epsAbs = " << _epsAbs - << ", _epsRel = " << _epsRel +template +void GerTester::test(const int M, const int N, + const int nonConstConstCombinations, + const bool useAnalyticalResults, + const bool useHermitianOption) { + std::cout << "Entering GerTester::test()... - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - " << std::endl; - + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " + << _vanillaUsesDifferentOrderOfOps << ", _epsAbs = " << _epsAbs + << ", _epsRel = " << _epsRel << std::endl; + // ******************************************************************** // Step 1 of 9: declare main types and variables // ******************************************************************** - _M = M; - _N = N; + _M = M; + _N = N; _useAnalyticalResults = useAnalyticalResults; _useHermitianOption = useHermitianOption; #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS _kkGerShouldThrowException = false; if (_A_is_complex && _useHermitianOption) { - if ((_testIsGpu == false) && - (_A_is_ll == false)) { + if ((_testIsGpu == false) && (_A_is_ll == false)) { _kkGerShouldThrowException = true; - } - else if ((_testIsGpu == true ) && - (_A_is_ll == false)) { + } else if ((_testIsGpu == true) && (_A_is_ll == false)) { _kkGerShouldThrowException = true; } } #endif - bool test_x_y (false); - bool test_cx_y (false); - bool test_x_cy (false); + bool test_x_y(false); + bool test_cx_y(false); + bool test_x_cy(false); bool 
test_cx_cy(false); if (nonConstConstCombinations == 0) { test_x_y = true; - } - else if (nonConstConstCombinations == 1) { + } else if (nonConstConstCombinations == 1) { test_cx_y = true; - } - else if (nonConstConstCombinations == 2) { + } else if (nonConstConstCombinations == 2) { test_x_cy = true; - } - else if (nonConstConstCombinations == 3) { + } else if (nonConstConstCombinations == 3) { test_cx_cy = true; - } - else { + } else { test_x_y = true; test_cx_y = true; test_x_cy = true; @@ -299,29 +245,18 @@ void GerTester< ScalarX // ******************************************************************** // Step 2 of 9: populate alpha, h_x, h_y, h_A, h_expected, x, y, A // ******************************************************************** - this->populateVariables( alpha - , h_x - , h_y - , h_A - , h_expected - , x - , y - , A - , expectedResultIsKnown - ); + this->populateVariables(alpha, h_x, h_y, h_A, h_expected, x, y, A, + expectedResultIsKnown); // ******************************************************************** // Step 3 of 9: populate h_vanilla // ******************************************************************** _ViewTypeExpected h_vanilla("vanilla = A + alpha * x * y^{t,h}", _M, _N); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name() ); - this->populateVanillaValues( alpha - , h_x - , h_y - , h_A - , h_vanilla - ); - + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", + typeid(alpha).name()); + this->populateVanillaValues(alpha, h_x, h_y, h_A, h_vanilla); + // ******************************************************************** // Step 4 of 9: use h_vanilla and h_expected as appropriate // ******************************************************************** @@ -329,18 +264,14 @@ void GerTester< ScalarX // ****************************************************************** // Compare h_vanilla against h_expected // 
****************************************************************** - this->compareVanillaExpected( alpha - , h_vanilla - , h_expected - ); - } - else { + this->compareVanillaExpected(alpha, h_vanilla, h_expected); + } else { // ****************************************************************** // Copy h_vanilla to h_expected // ****************************************************************** Kokkos::deep_copy(h_expected, h_vanilla); } - + // ******************************************************************** // Step 5 of 9: test with 'non const x' and 'non const y' // ******************************************************************** @@ -348,14 +279,8 @@ void GerTester< ScalarX Kokkos::deep_copy(org_A, A); if (test_x_y) { - this->callKkGerAndCompareAgainstExpected( alpha - , x - , y - , A - , h_A - , h_expected - , "non const {x,y}" - ); + this->callKkGerAndCompareAgainstExpected(alpha, x, y, A, h_A, h_expected, + "non const {x,y}"); } // ******************************************************************** @@ -363,15 +288,9 @@ void GerTester< ScalarX // ******************************************************************** if (test_cx_y) { Kokkos::deep_copy(A, org_A); - - this->callKkGerAndCompareAgainstExpected( alpha - , c_x - , y - , A - , h_A - , h_expected - , "const x" - ); + + this->callKkGerAndCompareAgainstExpected(alpha, c_x, y, A, h_A, h_expected, + "const x"); } // ******************************************************************** @@ -379,15 +298,9 @@ void GerTester< ScalarX // ******************************************************************** if (test_x_cy) { Kokkos::deep_copy(A, org_A); - - this->callKkGerAndCompareAgainstExpected( alpha - , x - , c_y - , A - , h_A - , h_expected - , "const y" - ); + + this->callKkGerAndCompareAgainstExpected(alpha, x, c_y, A, h_A, h_expected, + "const y"); } // ******************************************************************** @@ -395,77 +308,60 @@ void GerTester< ScalarX // 
******************************************************************** if (test_cx_cy) { Kokkos::deep_copy(A, org_A); - - this->callKkGerAndCompareAgainstExpected( alpha - , c_x - , c_y - , A - , h_A - , h_expected - , "const {x,y}" - ); + + this->callKkGerAndCompareAgainstExpected(alpha, c_x, c_y, A, h_A, + h_expected, "const {x,y}"); } // ******************************************************************** // Step 9 of 9: tests with invalid values on the first input parameter // ******************************************************************** - EXPECT_ANY_THROW( KokkosBlas::ger(".", alpha, x, y, A) ) << "Failed test: kk ger should have thrown an exception for mode '.'"; - EXPECT_ANY_THROW( KokkosBlas::ger("", alpha, x, y, A) ) << "Failed test: kk ger should have thrown an exception for mode ''"; - - std::cout << "Leaving GerTester::test() - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " << std::endl; + EXPECT_ANY_THROW(KokkosBlas::ger(".", alpha, x, y, A)) + << "Failed test: kk ger should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW(KokkosBlas::ger("", alpha, x, y, A)) + << "Failed test: kk ger should have thrown an exception for mode ''"; + + std::cout << "Leaving GerTester::test() - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - " + << std::endl; } -template -void GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::populateVariables( ScalarA & alpha - , _HostViewTypeX & h_x - , _HostViewTypeY & h_y - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - , _ViewTypeX & x - , _ViewTypeY & y - , _ViewTypeA & A - , bool & expectedResultIsKnown - ) -{ +template +void GerTester::populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected, + _ViewTypeX& x, _ViewTypeY& y, + _ViewTypeA& A, + 
bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues( alpha - , h_x - , h_y - , h_A - , h_expected - ); + this->populateAnalyticalValues(alpha, h_x, h_y, h_A, h_expected); Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(y, h_y); Kokkos::deep_copy(A, h_A); expectedResultIsKnown = true; - } - else if ((_M == 1) && (_N == 1)) { + } else if ((_M == 1) && (_N == 1)) { alpha = 3; h_x[0] = 2; h_y[0] = 3; - h_A(0,0) = 7; + h_A(0, 0) = 7; Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(y, h_y); Kokkos::deep_copy(A, h_A); - h_expected(0,0) = 25; + h_expected(0, 0) = 25; expectedResultIsKnown = true; - } - else if ((_M == 1) && (_N == 2)) { + } else if ((_M == 1) && (_N == 2)) { alpha = 3; h_x[0] = 2; @@ -473,18 +369,17 @@ void GerTester< ScalarX h_y[0] = 3; h_y[1] = 4; - h_A(0,0) = 7; - h_A(0,1) = -6; + h_A(0, 0) = 7; + h_A(0, 1) = -6; Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(y, h_y); Kokkos::deep_copy(A, h_A); - h_expected(0,0) = 25; - h_expected(0,1) = 18; + h_expected(0, 0) = 25; + h_expected(0, 1) = 18; expectedResultIsKnown = true; - } - else if ((_M == 2) && (_N == 2)) { + } else if ((_M == 2) && (_N == 2)) { alpha = 3; h_x[0] = 2; @@ -493,25 +388,25 @@ void GerTester< ScalarX h_y[0] = -3; h_y[1] = 7; - h_A(0,0) = 17; - h_A(0,1) = -43; - h_A(1,0) = 29; - h_A(1,1) = 101; + h_A(0, 0) = 17; + h_A(0, 1) = -43; + h_A(1, 0) = 29; + h_A(1, 1) = 101; Kokkos::deep_copy(x, h_x); Kokkos::deep_copy(y, h_y); Kokkos::deep_copy(A, h_A); - h_expected(0,0) = -1; - h_expected(0,1) = -1; - h_expected(1,0) = -52; - h_expected(1,1) = 290; + h_expected(0, 0) = -1; + h_expected(0, 1) = -1; + h_expected(1, 0) = -52; + h_expected(1, 1) = 290; expectedResultIsKnown = true; - } - else { + } else { alpha = 3; - Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); { ScalarX randStart, randEnd; @@ -538,110 +433,102 @@ void GerTester< ScalarX } // Code for complex values -template 
+template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeY & h_y - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ) { +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { _AuxType auxI(0.); _AuxType auxJ(0.); _AuxType auxIpJ(0.); _AuxType auxImJ(0.); - alpha.real() = 1.; + alpha.real() = 1.; alpha.imag() = -1.; for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); h_x[i].real() = sin(auxI); h_x[i].imag() = cos(auxI); } for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); h_y[j].real() = cos(auxJ); h_y[j].imag() = sin(auxJ); } if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_A(i,j).real() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); - h_A(i,j).imag() = -sin(auxIpJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = + -sin(auxIpJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + h_A(i, j).imag() = + -sin(auxIpJ) - 
sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); } } - } - else { + } else { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); - h_A(i,j).real() = -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); - h_A(i,j).imag() = -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_A(i, j).real() = + -sin(auxImJ) - sin(auxI) * sin(auxJ) + cos(auxI) * cos(auxJ); + h_A(i, j).imag() = + -sin(auxImJ) - sin(auxI) * sin(auxJ) - cos(auxI) * cos(auxJ); } } } if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_expected(i,j).real() = -2. * sin(auxI) * sin(auxJ); - h_expected(i,j).imag() = 2. * (cos(auxIpJ) - sin(auxIpJ)); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j).real() = -2. * sin(auxI) * sin(auxJ); + h_expected(i, j).imag() = 2. 
* (cos(auxIpJ) - sin(auxIpJ)); } } - } - else { + } else { for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - auxImJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i-j) ); - h_expected(i,j).real() = 2. * cos(auxI) * cos(auxJ); - h_expected(i,j).imag() = -2. * sin(auxImJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j).real() = 2. * cos(auxI) * cos(auxJ); + h_expected(i, j).imag() = -2. * sin(auxImJ); } } } } // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::populateAnalyticalValues( T & alpha - , _HostViewTypeX & h_x - , _HostViewTypeY & h_y - , _HostViewTypeA & h_A - , _ViewTypeExpected & h_expected - ) { +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +GerTester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { _AuxType auxI(0.); _AuxType auxJ(0.); _AuxType auxIpJ(0.); @@ -649,78 +536,69 @@ GerTester< ScalarX alpha = 3; for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) ); + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); h_x[i] = sin(auxI); } for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); h_y[j] = cos(auxJ); } for (int i = 0; i < _M; ++i) { - auxI = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i) 
); + auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); for (int j = 0; j < _N; ++j) { - auxJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(j) ); - h_A(i,j) = 3 * cos(auxI) * sin(auxJ); + auxJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(j)); + h_A(i, j) = 3 * cos(auxI) * sin(auxJ); } } for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - auxIpJ = this->shrinkAngleToZeroTwoPiRange( static_cast<_AuxType>(i+j) ); - h_expected(i,j) = 3 * sin(auxIpJ); + auxIpJ = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j) = 3 * sin(auxIpJ); } } } // Code for complex values -template +template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::populateVanillaValues( const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeY & h_y - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ) { +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - h_vanilla(i,j) = h_A(i,j) + alpha * _KAT_A::conj( h_y(j) ) * h_x(i); + h_vanilla(i, j) = h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i); } } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_y(j) * h_x(i); + h_vanilla(i, j) = h_A(i, j) + alpha * h_y(j) * h_x(i); } } } - } - else { + } else { if (_useHermitianOption) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * _KAT_A::conj( h_y(j) ); + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * 
_KAT_A::conj(h_y(j)); } } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_y(j); + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j); } } } @@ -728,51 +606,38 @@ GerTester< ScalarX } // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::populateVanillaValues( const T & alpha - , const _HostViewTypeX & h_x - , const _HostViewTypeY & h_y - , const _HostViewTypeA & h_A - , _ViewTypeExpected & h_vanilla - ) { +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +GerTester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { if (_vanillaUsesDifferentOrderOfOps) { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_y(j) * h_x(i); + h_vanilla(i, j) = h_A(i, j) + alpha * h_y(j) * h_x(i); } } - } - else { + } else { for (int i = 0; i < _M; ++i) { for (int j = 0; j < _N; ++j) { - h_vanilla(i,j) = h_A(i,j) + alpha * h_x(i) * h_y(j); + h_vanilla(i, j) = h_A(i, j) + alpha * h_x(i) * h_y(j); } } } } -template +template template -T GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::shrinkAngleToZeroTwoPiRange(const T input) -{ +T GerTester::shrinkAngleToZeroTwoPiRange(const T input) { T output(input); #if 0 T twoPi( 2. 
* Kokkos::numbers::pi ); @@ -787,678 +652,687 @@ T GerTester< ScalarX } // Code for complex values -template +template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ) { - int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::compareVanillaExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); if (_useAnalyticalResults) { - int numErrorsRealAbs (0); - int numErrorsRealRel (0); - int numErrorsImagAbs (0); - int numErrorsImagRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool errorHappened (false); - _AuxType maxErrorRealRel (0.); - int iForMaxErrorRealRel(0); - int jForMaxErrorRealRel(0); - _AuxType maxErrorImagRel (0.); - int iForMaxErrorImagRel(0); - int jForMaxErrorImagRel(0); + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); errorHappened = false; - if (h_expected(i,j).real() == 0.) { + if (h_expected(i, j).real() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); - if ( diff > diffThreshold ) { + if (diff > diffThreshold) { errorHappened = true; numErrorsRealAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); if (maxErrorRealRel < aux) { - maxErrorRealRel = aux; + maxErrorRealRel = aux; iForMaxErrorRealRel = i; jForMaxErrorRealRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); - if ( diff > diffThreshold ) { + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).real()); + if (diff > diffThreshold) { errorHappened = true; numErrorsRealRel++; } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i,j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() - << ", _KAT_A::abs(h_expected(i,j).real() - h_vanilla(i,j).real()) = " << diff - << ", diffThreshold = " << diffThreshold + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - " + "h_vanilla(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - diff = _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); errorHappened = false; - if (h_expected(i,j).imag() == 0.) { + if (h_expected(i, j).imag() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); - if ( diff > diffThreshold ) { + if (diff > diffThreshold) { errorHappened = true; numErrorsImagAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); if (maxErrorImagRel < aux) { - maxErrorImagRel = aux; + maxErrorImagRel = aux; iForMaxErrorImagRel = i; jForMaxErrorImagRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); - if ( diff > diffThreshold ) { + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).imag()); + if (diff > diffThreshold) { errorHappened = true; numErrorsImagRel++; } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() - << ", _KAT_A::abs(h_expected(i,j).imag() - h_vanilla(i,j).imag()) = " << diff - << ", diffThreshold = " << diffThreshold + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - " + "h_vanilla(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i + } // for j + } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from analytical on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << 
numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_vanilla(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_vanilla(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } - EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) + << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from analytical on imag components" - << ", numErrorsImagAbs = " << 
numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_vanilla(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_vanilla(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } - EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) + << "Failed test" << msg.str(); } - } - else { + } else { int numErrorsReal(0); int numErrorsImag(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - if ( h_expected(i,j).real() != h_vanilla(i,j).real() ) { + if (h_expected(i, j).real() != h_vanilla(i, j).real()) { if (numErrorsReal == 0) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i,j).real() - << ", h_vanilla(i,j).real() = " << h_vanilla(i,j).real() + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " + << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() << std::endl; } numErrorsReal++; } - if ( h_expected(i,j).imag() != h_vanilla(i,j).imag() ) { + if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { if (numErrorsImag == 0) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() - << ", h_vanilla(i,j).imag() = " << h_vanilla(i,j).imag() + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " + << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() << std::endl; } numErrorsImag++; } - } // for j - } // for i - EXPECT_EQ(numErrorsReal, 0) << "Failed test" - << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect on real components" - << 
", numErrorsReal = " << numErrorsReal; - EXPECT_EQ(numErrorsImag, 0) << "Failed test" - << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect on imag components" - << ", numErrorsImag = " << numErrorsImag; + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect on imag components" + << ", numErrorsImag = " << numErrorsImag; } } - + // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::compareVanillaExpected( const T & alpha - , const _ViewTypeExpected & h_vanilla - , const _ViewTypeExpected & h_expected - ) { - int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +GerTester::compareVanillaExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); if (_useAnalyticalResults) { - int numErrorsAbs (0); - int numErrorsRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool 
errorHappened (false); - _AuxType maxErrorRel (0.); - int iForMaxErrorRel(0); - int jForMaxErrorRel(0); + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)); + diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); errorHappened = false; - if (h_expected(i,j) == 0.) { + if (h_expected(i, j) == 0.) { diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); if (maxErrorRel < aux) { - maxErrorRel = aux; + maxErrorRel = aux; iForMaxErrorRel = i; jForMaxErrorRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j)); if (diff > diffThreshold) { errorHappened = true; numErrorsRel++; } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i,j) - << ", h_vanilla(i,j) = " << h_vanilla(i,j) - << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " << diff - << ", diffThreshold = " << diffThreshold + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i + } // for j + } // for i { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << 
", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": vanilla differs too much from expected" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", h_vanilla(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_vanilla(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_vanilla(i,j) = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } - } - else { + } else { int numErrors(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - if ( h_expected(i,j) != h_vanilla(i,j) ) { + if (h_expected(i, j) != h_vanilla(i, j)) { if (numErrors == 0) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i,j) - << ", h_vanilla(i,j) = " << h_vanilla(i,j) - << std::endl; + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; } numErrors++; } - } // for j - } // for i - EXPECT_EQ(numErrors, 0) << "Failed test" - << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ": vanilla result is incorrect" - << ", numErrors = " << numErrors; + } // for j + } // for i + EXPECT_EQ(numErrors, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; } } - + // Code for complex values -template +template template -typename std::enable_if< std::is_same>::value || std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::compareKokkosExpected( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ) { - int maxNumErrorsAllowed( 
static_cast(_M) * static_cast(_N) * 1.e-3 ); - - int numErrorsRealAbs (0); - int numErrorsRealRel (0); - int numErrorsImagAbs (0); - int numErrorsImagRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool errorHappened (false); - _AuxType maxErrorRealRel (0.); - int iForMaxErrorRealRel(0); - int jForMaxErrorRealRel(0); - _AuxType maxErrorImagRel (0.); - int iForMaxErrorImagRel(0); - int jForMaxErrorImagRel(0); +typename std::enable_if>::value || + std::is_same>::value, + void>::type +GerTester::compareKokkosExpected(const T& alpha, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()); + diff = _KAT_A::abs(h_expected(i, j).real() - h_A(i, j).real()); errorHappened = false; - if (h_expected(i,j).real() == 0.) { + if (h_expected(i, j).real() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsRealAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).real()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); if (maxErrorRealRel < aux) { - maxErrorRealRel = aux; + maxErrorRealRel = aux; iForMaxErrorRealRel = i; jForMaxErrorRealRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).real()); + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).real()); if (diff > diffThreshold) { errorHappened = true; numErrorsRealRel++; } } if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).real() = " << h_expected(i,j).real() - << ", h_A(i,j).real() = " << h_A(i,j).real() - << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " << diff - << ", diffThreshold = " << diffThreshold - << std::endl; + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - h_A(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - diff = _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()); + diff = _KAT_A::abs(h_expected(i, j).imag() - h_A(i, j).imag()); errorHappened = false; - if (h_expected(i,j).imag() == 0.) { + if (h_expected(i, j).imag() == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsImagAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j).imag()); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); if (maxErrorImagRel < aux) { - maxErrorImagRel = aux; + maxErrorImagRel = aux; iForMaxErrorImagRel = i; jForMaxErrorImagRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j).imag()); + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j).imag()); if (diff > diffThreshold) { errorHappened = true; numErrorsImagRel++; } } if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j).imag() = " << h_expected(i,j).imag() - << ", h_A(i,j).imag() = " << h_A(i,j).imag() - << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " << diff - << ", diffThreshold = " << diffThreshold - << std::endl; + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - h_A(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i - std::cout << "A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed - << std::endl; + } // for j + } // for i + std::cout + << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; if ((_M == 2131) && (_N == 2131)) { std::cout << "Information" - << ": A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", h_expected(11, 2119) = (" << h_expected(11,2119).real() << ", " << h_expected(11,2119).imag() << ")" - << ", h_A(11, 2119) = (" << h_A(11,2119).real() << ", " << h_A(11,2119).imag() << ")" - << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(11, 2119) = (" << h_expected(11, 2119).real() + << ", " << h_expected(11, 2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " + << h_A(11, 2119).imag() << ")" << std::endl; std::cout << "Information" - << ": A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption - << ", h_expected(710, 1065) = (" << h_expected(710,1065).real() << ", " << h_expected(710,1065).imag() << ")" - << ", h_A(710, 1065) = (" << h_A(710,1065).real() << ", " << h_A(710,1065).imag() << ")" - << std::endl; + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", h_expected(710, 1065) = (" << h_expected(710, 1065).real() + << ", " << h_expected(710, 1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " + << h_A(710, 1065).imag() << ")" << std::endl; } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", 
_A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect on real components" - << ", numErrorsRealAbs = " << numErrorsRealAbs - << ", numErrorsRealRel = " << numErrorsRealRel - << ", maxErrorRealRel = " << maxErrorRealRel - << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel - << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel - << ", h_expected(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", h_A(i,j).real() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRealRel,jForMaxErrorRealRel).real() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); if (numErrorsReal > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); } { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() - << ", _useHermitianOption = " << _useHermitianOption + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect on imag components" - << ", numErrorsImagAbs = " << numErrorsImagAbs - << ", numErrorsImagRel = " << numErrorsImagRel - << ", maxErrorImagRel = " << maxErrorImagRel - << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel - << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel - << ", h_expected(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", h_A(i,j).imag() = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorImagRel,jForMaxErrorImagRel).imag() : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); if (numErrorsImag > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } - + // Code for non-complex values -template +template template -typename std::enable_if< !std::is_same>::value && !std::is_same>::value - , void - >::type -GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::compareKokkosExpected( const T & alpha - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - ) { - int maxNumErrorsAllowed( static_cast(_M) * static_cast(_N) * 1.e-3 ); - - int numErrorsAbs (0); - int numErrorsRel (0); - _AuxType diff (0.); - _AuxType diffThreshold (0.); - bool errorHappened (false); - _AuxType maxErrorRel (0.); - int iForMaxErrorRel(0); - int jForMaxErrorRel(0); +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +GerTester::compareKokkosExpected(const T& alpha, + const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected) { + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - diff = _KAT_A::abs(h_expected(i,j) - h_A(i,j)); + diff = _KAT_A::abs(h_expected(i, j) - h_A(i, j)); errorHappened = false; - if (h_expected(i,j) == 0.) { + if (h_expected(i, j) == 0.) 
{ diffThreshold = _KAT_A::abs(_epsAbs); if (diff > diffThreshold) { errorHappened = true; numErrorsAbs++; } - } - else { - _AuxType aux = diff / _KAT_A::abs(h_expected(i,j)); + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); if (maxErrorRel < aux) { - maxErrorRel = aux; + maxErrorRel = aux; iForMaxErrorRel = i; jForMaxErrorRel = j; } - diffThreshold = _KAT_A::abs(_epsRel * h_expected(i,j)); + diffThreshold = _KAT_A::abs(_epsRel * h_expected(i, j)); if (diff > diffThreshold) { errorHappened = true; numErrorsRel++; } } if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { - std::cout << "ERROR, i = " << i - << ", j = " << j - << ": h_expected(i,j) = " << h_expected(i,j) - << ", h_A(i,j) = " << h_A(i,j) + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_A(i,j) = " << h_A(i, j) << ", _KAT_A::abs(h_expected(i,j) - h_A(i,j)) = " << diff - << ", diffThreshold = " << diffThreshold - << std::endl; + << ", diffThreshold = " << diffThreshold << std::endl; } - } // for j - } // for i - std::cout << "A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() + } // for j + } // for i + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed - << std::endl; + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; { std::ostringstream msg; - msg << ", A is " << _M << " by " << _N - << ", _A_is_lr = " << _A_is_lr - << ", _A_is_ll = " << _A_is_ll - << ", alpha type = " << typeid(alpha).name() + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() << ", _useHermitianOption = " << _useHermitianOption << ": ger result is incorrect" - << ", numErrorsAbs = " << numErrorsAbs - << ", numErrorsRel = " << numErrorsRel - << ", maxErrorRel = " << maxErrorRel - << ", iForMaxErrorRel = " << iForMaxErrorRel - << ", jForMaxErrorRel = " << jForMaxErrorRel - << ", h_expected(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_expected(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) - << ", h_A(i,j) = " << ( ((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel,jForMaxErrorRel) : 9.999e+99 ) + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; int numErrors(numErrorsAbs + numErrorsRel); if (numErrors > 0) { - std::cout<< "WARNING" << msg.str() << std::endl; + std::cout << "WARNING" << msg.str() << std::endl; } EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); } } -template +template template -void GerTester< ScalarX - , tLayoutX - , ScalarY - , tLayoutY - , ScalarA - , tLayoutA - , Device - >::callKkGerAndCompareAgainstExpected( const ScalarA & alpha - , TX & x - , TY & y - , _ViewTypeA & A - , const _HostViewTypeA & h_A - , const _ViewTypeExpected & h_expected - , const std::string & situation - ) -{ - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): ViewTypeA = %s, _kkGerShouldThrowException=%d\n", typeid(_ViewTypeA).name(), _kkGerShouldThrowException ); +void GerTester:: + callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, + _ViewTypeA& A, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_expected, + const std::string& situation) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "In Test_Blas2_ger.hpp, right before calling KokkosBlas::ger(): " + "ViewTypeA = %s, _kkGerShouldThrowException=%d\n", + typeid(_ViewTypeA).name(), _kkGerShouldThrowException); std::string mode = _useHermitianOption ? "H" : "T"; - bool gotStdException (false); + bool gotStdException(false); bool gotUnknownException(false); try { KokkosBlas::ger(mode.c_str(), alpha, x, y, A); - } - catch( const std::exception& e ) { - std::cout << "In Test_Blas2_ger, '" << situation << "': caught exception, e.what() = " << e.what() << std::endl; + } catch (const std::exception& e) { + std::cout << "In Test_Blas2_ger, '" << situation + << "': caught exception, e.what() = " << e.what() << std::endl; gotStdException = true; - } - catch( ... ) { - std::cout << "In Test_Blas2_ger, '" << situation << "': caught unknown exception" << std::endl; + } catch (...) 
{ + std::cout << "In Test_Blas2_ger, '" << situation + << "': caught unknown exception" << std::endl; gotUnknownException = true; } - EXPECT_EQ(gotUnknownException, false) << "Failed test, '" << situation << "': unknown exception should not have happened"; + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened"; - EXPECT_EQ(gotStdException, _kkGerShouldThrowException) << "Failed test, '" << situation << "': kk ger() should" - << (_kkGerShouldThrowException ? " " : " not ") - << "have thrown a std::exception"; + EXPECT_EQ(gotStdException, _kkGerShouldThrowException) + << "Failed test, '" << situation << "': kk ger() should" + << (_kkGerShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; - if (( gotStdException == false ) && - ( gotUnknownException == false )) { + if ((gotStdException == false) && (gotUnknownException == false)) { Kokkos::deep_copy(h_A, A); - this->compareKokkosExpected( alpha - , h_A - , h_expected - ); + this->compareKokkosExpected(alpha, h_A, h_expected); } } -} // namespace Test +} // namespace Test template -int test_ger( const std::string & caseName ) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s ...\n", caseName.c_str() ); +int test_ger(const std::string& caseName) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTLEFT ...\n", caseName.c_str() ); + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTLEFT ...\n", + caseName.c_str()); if (true) { - Test::GerTester tester; + Test::GerTester + tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); tester.test(1, 1, 0); @@ -1466,26 +1340,35 @@ int test_ger( const std::string & caseName ) { tester.test(1, 2, 0); tester.test(13, 13, 0); tester.test(13, 1024, 0); - tester.test(13, 1024, 0 , true, false); - tester.test(13, 1024, 0 , true, true); - tester.test(50, 40, 4 ); + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + tester.test(50, 40, 4); tester.test(1024, 1024, 0); tester.test(2131, 2131, 0); - tester.test(2131, 2131, 0 , true, false); - tester.test(2131, 2131, 0 , true, true); + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTLEFT\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTRIGHT ...\n", caseName.c_str() ); + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for 
LAYOUTRIGHT ...\n", + caseName.c_str()); if (true) { - Test::GerTester tester; + Test::GerTester + tester; tester.test(0, 13, 0); tester.test(1024, 0, 0); tester.test(1, 1, 0); @@ -1502,18 +1385,27 @@ int test_ger( const std::string & caseName ) { tester.test(2131, 2131, 0, true, true); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTRIGHT\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); #endif #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for LAYOUTSTRIDE ...\n", caseName.c_str() ); + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for LAYOUTSTRIDE ...\n", + caseName.c_str()); if (true) { - Test::GerTester tester; - tester.test(0, 13, 0 ); + Test::GerTester + tester; + tester.test(0, 13, 0); tester.test(1024, 0, 0); tester.test(13, 13, 0); tester.test(13, 1024, 0); @@ -1526,41 +1418,58 @@ int test_ger( const std::string & caseName ) { tester.test(2131, 2131, 0, true, true); } - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for LAYOUTSTRIDE\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + 
"+-----------------------------------------------------------------------" + "---\n"); #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Starting %s for MIXED LAYOUTS ...\n", caseName.c_str() ); +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s for MIXED LAYOUTS ...\n", + caseName.c_str()); if (true) { - Test::GerTester tester; + Test::GerTester + tester; tester.test(1024, 1024, 0); tester.test(1024, 1024, 0, true, false); tester.test(1024, 1024, 0, true, true); } if (true) { - Test::GerTester tester; + Test::GerTester + tester; tester.test(1024, 1024, 0); } - - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s for MIXED LAYOUTS\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+--------------------------------------------------------------------------\n" ); + + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for MIXED LAYOUTS\n", + caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+-----------------------------------------------------------------------" + "---\n"); #endif - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Finished %s\n", caseName.c_str() ); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+==========================================================================\n" ); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "+=======================================================================" + "===\n"); return 1; } #if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
TEST_F(TestCategory, ger_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_float"); - test_ger( "test case ger_float" ); + test_ger("test case ger_float"); Kokkos::Profiling::popRegion(); } #endif @@ -1568,37 +1477,45 @@ TEST_F(TestCategory, ger_float) { #if 1 #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_complex_float) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_float"); - test_ger, Kokkos::complex, Kokkos::complex, TestExecSpace>( "test case ger_complex_float" ); + test_ger, Kokkos::complex, + Kokkos::complex, TestExecSpace>( + "test case ger_complex_float"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double"); - test_ger( "test case ger_double" ); + test_ger("test case ger_double"); Kokkos::Profiling::popRegion(); } #endif #if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_complex_double) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_complex_double"); - test_ger, Kokkos::complex, Kokkos::complex, TestExecSpace>( "test case ger_complex_double" ); + test_ger, Kokkos::complex, + Kokkos::complex, TestExecSpace>( + "test case ger_complex_double"); Kokkos::Profiling::popRegion(); } #endif -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if 
defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) TEST_F(TestCategory, ger_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_int"); - test_ger( "test case ger_int" ); + test_ger("test case ger_int"); Kokkos::Profiling::popRegion(); } #endif @@ -1607,9 +1524,9 @@ TEST_F(TestCategory, ger_int) { !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) TEST_F(TestCategory, ger_double_int) { Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int"); - test_ger( "test case ger_mixed_types" ); + test_ger("test case ger_mixed_types"); Kokkos::Profiling::popRegion(); } #endif -#endif // if 1 +#endif // if 1 diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index f829026e76..29e0d91c2a 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -828,7 +828,7 @@ struct D2_MIS_FixedPriority { Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; - //int numIter = 0; // AquiEEP + // int numIter = 0; // AquiEEP while (workRemain) { // do another iteration Kokkos::parallel_for( @@ -853,7 +853,7 @@ struct D2_MIS_FixedPriority { // Finally, flip the worklists std::swap(worklist1, worklist2); workRemain = newWorkRemain; - //numIter++; // AquiEEP + // numIter++; // AquiEEP } // now that every vertex has been decided IN_SET/OUT_SET, // build a compact list of the vertices which are IN_SET. 
From d94c0139e12beb79afa514ce79a967abd71144e6 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 11 Apr 2023 04:16:54 -0600 Subject: [PATCH 296/442] Minor corrections --- blas/impl/KokkosBlas2_ger_impl.hpp | 2 +- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 86 ++++++++++---------- 2 files changed, 45 insertions(+), 43 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index 8fd166d94f..fa2220e00a 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -165,7 +165,7 @@ struct TwoLevelGER { Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { A_(i, j) += AComponentType( alpha_ * x_fixed * - Kokkos::ArithTraits::conj(y_(j))); + Kokkos::ArithTraits::conj(y_(j))); }); } } diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 8bedf41523..5e31a832be 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -28,45 +28,37 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUTX, LAYOUTY, LAYOUTA, \ - MEMSPACE) \ - template \ - struct ger_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, \ + MEMSPACE) \ + template \ + struct ger_tpl_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) #endif @@ -74,19 +66,19 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUTX, LAYOUTY, \ - LAYOUTA, MEMSPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, \ + MEMSPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // We use the same layout for X, Y and Abecause the GER interface will @@ -94,31 +86,41 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // match any layout combination, as long as none are LayoutStride. 
KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace) + #endif // rocBLAS From 9f49fb9724b908ee5684468aee6183856a962556 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 12 Apr 2023 06:10:37 -0600 Subject: [PATCH 
297/442] Addressing new feedbacks from Luc. --- blas/src/KokkosBlas2_ger.hpp | 15 ++++++++++++--- graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 2 -- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/blas/src/KokkosBlas2_ger.hpp b/blas/src/KokkosBlas2_ger.hpp index 218b01bb2e..fbfc9c1f98 100644 --- a/blas/src/KokkosBlas2_ger.hpp +++ b/blas/src/KokkosBlas2_ger.hpp @@ -42,18 +42,27 @@ template ::assignable, + "AViewType memory space must be assignable from XViewType"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "AViewType memory space must be assignable from YViewType"); + static_assert( Kokkos::SpaceAccessibility::accessible, - "AViewType memory space must be compatible with ExecutionSpace"); + "AViewType memory space must be accessible from ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "XViewType memory space must be compatible with ExecutionSpace"); + "XViewType memory space must be accessible from ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "YViewType memory space must be compatible with ExecutionSpace"); + "YViewType memory space must be accessible from ExecutionSpace"); static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 29e0d91c2a..a359956a23 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -828,7 +828,6 @@ struct D2_MIS_FixedPriority { Kokkos::parallel_for(range_pol(0, numVerts), InitWorklistFunctor(worklist1)); lno_t workRemain = numVerts; - // int numIter = 0; // AquiEEP while (workRemain) { // do another iteration Kokkos::parallel_for( @@ -853,7 +852,6 @@ struct D2_MIS_FixedPriority { // Finally, flip the worklists std::swap(worklist1, worklist2); workRemain = newWorkRemain; - // numIter++; // AquiEEP } // now that every vertex has been decided IN_SET/OUT_SET, // build a 
compact list of the vertices which are IN_SET. From 3a91bb0e596a7d70895bb4ce081571e69268cc3a Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 12 Apr 2023 06:25:28 -0600 Subject: [PATCH 298/442] Proper formatting --- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 26 +++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 5e31a832be..925aaac945 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -28,19 +28,16 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, \ - MEMSPACE) \ - template \ - struct ger_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct ger_tpl_spec_avail< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, @@ -66,8 +63,7 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, \ - MEMSPACE) \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template <> \ struct ger_tpl_spec_avail< \ Kokkos::View Date: Mon, 17 Apr 2023 09:56:07 -0600 Subject: [PATCH 299/442] Corrections for some automatic tests that are failing --- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 48 +++++++------------ blas/unit_test/Test_Blas2_ger.hpp | 3 +- 2 files changed, 18 insertions(+), 
33 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index d5626dd604..a8bf633af1 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -254,41 +254,25 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) 
-KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 5b26344ded..4710730504 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -1318,7 +1318,8 @@ int test_ger(const std::string& caseName) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+=======================================================================" "===\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s ...\n", caseName.c_str()); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", + caseName.c_str(), typeid(Device).name()); #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ From 6742ef3bfacf10c5e769a656eca35a482e10c866 Mon Sep 17 00:00:00 2001 From: 
Ernesto Prudencio Date: Tue, 18 Apr 2023 11:05:32 -0600 Subject: [PATCH 300/442] Solving compilation issues on the automatic tests --- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 526 ++++++++++-------- .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 260 +++++---- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 208 ++++--- 3 files changed, 561 insertions(+), 433 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index a8bf633af1..225b877bb3 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -30,249 +30,311 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const typename AViewType::execution_space& /* space */ \ - , \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ - A.data(), LDA); \ - } else { \ - HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ - A.data(), LDA); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ 
+ Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE & /* space */ \ + , \ + const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ + A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ + A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const typename AViewType::execution_space& /* space */ \ - , \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - if (A_is_ll) { \ - HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ - A.data(), LDA); \ - } else { \ - HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ - A.data(), LDA); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define 
KOKKOSBLAS2_SGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE & /* space */ \ + , \ + const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_BLAS,float]"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::ger(M, N, alpha, X.data(), one, Y.data(), one, \ + A.data(), LDA); \ + } else { \ + HostBlas::ger(M, N, alpha, Y.data(), one, X.data(), one, \ + A.data(), LDA); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const typename AViewType::execution_space& /* space */ \ - , \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const 
std::complex alpha_val = \ - static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - HostBlas>::gerc( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } \ - } else { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - throw std::runtime_error( \ - "Error: blasZgerc() requires LayoutLeft views."); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE & /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = \ + static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), 
one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + throw std::runtime_error( \ + "Error: blasZgerc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits> \ - AViewType; \ - \ - static void ger(const typename AViewType::execution_space& /* space */ \ - , \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_BLAS,complex"); \ - KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ - const std::complex alpha_val = \ - static_cast>(alpha); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (A_is_ll) { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - HostBlas>::gerc( \ - M, N, alpha_val, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one, \ - 
reinterpret_cast*>(A.data()), LDA); \ - } \ - } else { \ - if (justTranspose) { \ - HostBlas>::geru( \ - M, N, alpha_val, \ - reinterpret_cast*>(Y.data()), one, \ - reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(A.data()), LDA); \ - } else { \ - throw std::runtime_error( \ - "Error: blasCgerc() requires LayoutLeft views."); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void ger(const EXEC_SPACE & /* space */ \ + , \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_BLAS,complex"); \ + KOKKOSBLAS2_GER_DETERMINE_ARGS(LAYOUT); \ + const std::complex alpha_val = \ + static_cast>(alpha); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (A_is_ll) { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + HostBlas>::gerc( \ + M, N, alpha_val, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } \ + } else { \ + if (justTranspose) { \ + HostBlas>::geru( \ + M, N, alpha_val, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(A.data()), 
LDA); \ + } else { \ + throw std::runtime_error( \ + "Error: blasCgerc() requires LayoutLeft views."); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, true) 
+KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial + , Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial + , Kokkos::HostSpace, false) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, false) + +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, false) + +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, false) + +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP + , Kokkos::HostSpace, false) 
+KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP + , Kokkos::HostSpace, false) +#endif } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index e106cacc63..699c89b3eb 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -30,34 +30,36 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ +#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ struct GER< \ + EXEC_SPACE, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef double SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void ger(const EXEC_SPACE & space, \ + const char /*trans*/[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -81,82 +83,88 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - 
typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const typename AViewType::execution_space& space, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), one, \ - A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), one, \ - A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE & space, \ + const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); 
\ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), one, \ + A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), one, \ + A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ +#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const typename AViewType::execution_space& space, \ + static void ger(const EXEC_SPACE & space, \ const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ @@ -203,33 +211,35 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ +#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + 
Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const typename AViewType::execution_space& space, \ + static void ger(const EXEC_SPACE & space, \ const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ @@ -273,45 +283,77 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) 
-KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) 
-KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index 0018c36df8..fdb47c50ea 100644 
--- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -30,85 +30,89 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const typename AViewType::execution_space& space, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ - KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), \ - one, A.data(), LDA)); \ - } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), \ - one, A.data(), LDA)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + 
typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE & space, \ + const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + if (A_is_ll) { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), \ + one, A.data(), LDA)); \ + } else { \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), \ + one, A.data(), LDA)); \ + } \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_SGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ template <> \ struct GER< \ + EXEC_SPACE, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef float SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const typename AViewType::execution_space& space, \ - const char trans[], \ + static void ger(const EXEC_SPACE & space, \ + const char /*trans*/[], \ typename 
AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -132,33 +136,35 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_ZGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const typename AViewType::execution_space& space, \ + static void ger(const EXEC_SPACE & space, \ const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ @@ -205,33 +211,35 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS2_CGER_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ YViewType; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ AViewType; \ \ - static void 
ger(const typename AViewType::execution_space& space, \ + static void ger(const EXEC_SPACE & space, \ const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ @@ -278,25 +286,41 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) 
+KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas From a368dd3cb642739901c4f787f4a627b885f3cdd6 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 18 Apr 2023 11:37:20 -0600 Subject: [PATCH 301/442] Formatting --- .../KokkosBlas2_ger_tpl_spec_decl_blas.hpp | 136 ++-- .../KokkosBlas2_ger_tpl_spec_decl_cublas.hpp | 610 +++++++++--------- .../KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp | 76 ++- 3 files changed, 405 insertions(+), 417 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index 225b877bb3..3ba437a5a7 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -57,7 +57,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE & /* space */ \ + static void ger(const EXEC_SPACE& /* space */ \ , \ const char /*trans*/[], \ typename AViewType::const_value_type& alpha, \ @@ -103,7 +103,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE & /* space */ \ + static void ger(const EXEC_SPACE& /* space */ \ , \ const char /*trans*/[], \ typename 
AViewType::const_value_type& alpha, \ @@ -149,7 +149,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE & /* space */ \ + static void ger(const EXEC_SPACE& /* space */ \ , \ const char trans[], \ typename AViewType::const_value_type& alpha, \ @@ -218,7 +218,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE & /* space */ \ + static void ger(const EXEC_SPACE& /* space */ \ , \ const char trans[], \ typename AViewType::const_value_type& alpha, \ @@ -261,79 +261,79 @@ namespace Impl { }; #ifdef KOKKOS_ENABLE_SERIAL -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, 
+ false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial - , Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial - , Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, false) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, false) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) 
+KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, false) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, false) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP - , Kokkos::HostSpace, false) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, true) -KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP - , Kokkos::HostSpace, false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, 
Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CGER_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) #endif } // namespace Impl diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index 699c89b3eb..d05b09784e 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -30,330 +30,322 @@ namespace Impl { constexpr int one = 1; \ const int LDA = A_is_lr ? A.stride(0) : A.stride(1); -#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE & space, \ - const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), one, \ - A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), one, \ - 
A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), one, \ + A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), one, \ + A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - 
Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE & space, \ - const char /*trans*/[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - X.data(), one, Y.data(), one, \ - A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ - Y.data(), one, X.data(), one, \ - A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + 
cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + X.data(), one, Y.data(), one, \ + A.data(), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, \ + Y.data(), one, X.data(), one, \ + A.data(), LDA)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE & space, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - 
reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ - s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - throw std::runtime_error( \ - "Error: cublasZgerc() requires LayoutLeft views."); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } 
else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru( \ + s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: cublasZgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct GER*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE & space, \ - const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, \ - const AViewType& A) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - 
reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } \ - } else { \ - if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ - s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - } else { \ - throw std::runtime_error( \ - "Error: cublasCgerc() requires LayoutLeft views."); \ - } \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::ger[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + if (justTranspose) { 
\ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } \ + } else { \ + if (justTranspose) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru( \ + s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + } else { \ + throw std::runtime_error( \ + "Error: cublasCgerc() requires LayoutLeft views."); \ + } \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, 
Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) 
-KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, 
- Kokkos::CudaUVMSpace, false) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, true) -KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, - Kokkos::CudaUVMSpace, false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CGER_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index fdb47c50ea..c55d091516 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -58,8 +58,7 @@ namespace Impl { Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const EXEC_SPACE & space, \ - const char /*trans*/[], \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -111,8 +110,7 @@ namespace Impl { Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const EXEC_SPACE & space, \ - const char /*trans*/[], \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -164,8 +162,7 @@ namespace Impl { Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const EXEC_SPACE & space, \ - const char trans[], \ + static void ger(const EXEC_SPACE& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -239,8 +236,7 @@ namespace Impl { Kokkos::MemoryTraits > \ AViewType; \ \ - static void ger(const EXEC_SPACE & space, 
\ - const char trans[], \ + static void ger(const EXEC_SPACE& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -286,41 +282,41 @@ namespace Impl { } \ }; -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) 
+KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace, false) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, true) -KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace, false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CGER_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) } // namespace Impl } // namespace KokkosBlas From 792bd5fa82b2bb2bf38e171d22b5e08f60af58b7 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 24 Apr 2023 15:29:52 -0600 Subject: [PATCH 302/442] Correcting compilation errors on blake --- blas/unit_test/Test_Blas2_ger.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 4710730504..4a73e7b253 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -1393,6 +1393,8 @@ int test_ger(const std::string& caseName) { "---\n"); #endif +#if 0 // Compilation error "static assertion failed Layout is not constructible + // from extent arguments", Kokkos_View.hpp, circa line 1537 #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -1456,6 +1458,8 @@ int test_ger(const std::string& caseName) { 
"+-----------------------------------------------------------------------" "---\n"); #endif +#endif // Compilation error "static assertion failed Layout is not constructible + // from extent arguments", Kokkos_View.hpp, circa line 1537 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( From f27e4d03475b6549e699ac5c22a34c6001e512b9 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 24 Apr 2023 15:33:43 -0600 Subject: [PATCH 303/442] Formatting --- blas/unit_test/Test_Blas2_ger.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 4a73e7b253..0045fd48a6 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -1393,8 +1393,8 @@ int test_ger(const std::string& caseName) { "---\n"); #endif -#if 0 // Compilation error "static assertion failed Layout is not constructible - // from extent arguments", Kokkos_View.hpp, circa line 1537 +#if 0 // Compilation error "static assertion failed Layout is not constructible + // from extent arguments", Kokkos_View.hpp, circa line 1537 #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -1458,8 +1458,9 @@ int test_ger(const std::string& caseName) { "+-----------------------------------------------------------------------" "---\n"); #endif -#endif // Compilation error "static assertion failed Layout is not constructible - // from extent arguments", Kokkos_View.hpp, circa line 1537 +#endif // Compilation error "static assertion failed Layout is not + // constructible from extent arguments", Kokkos_View.hpp, circa line + // 1537 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( From 0453f0d02c951583a5eaada21967bcdd7bf65b29 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 24 Apr 2023 21:15:16 -0600 Subject: 
[PATCH 304/442] Forgot some spots that need a template parameter for the execution space --- blas/impl/KokkosBlas2_ger_spec.hpp | 7 +- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 151 +++++++++++-------- 2 files changed, 96 insertions(+), 62 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index d9f9dcd272..0ac2a7b673 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -27,7 +27,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct ger_eti_spec_avail { enum : bool { value = false }; }; @@ -43,6 +43,7 @@ struct ger_eti_spec_avail { #define KOKKOSBLAS2_GER_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct ger_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -69,9 +70,9 @@ namespace Impl { template ::value, + ger_tpl_spec_avail::value, bool eti_spec_avail = - ger_eti_spec_avail::value> + ger_eti_spec_avail::value> struct GER { static void ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 925aaac945..30ce6b8595 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct ger_tpl_spec_avail { enum : bool { value = false }; }; @@ -28,128 +28,161 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct ger_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 
Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, 
Kokkos::LayoutLeft, Kokkos::OpenMP + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP + Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP + Kokkos::HostSpace) +#endif #endif // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // We use the same layout for X, Y and Abecause the GER interface will // switch the layouts of X and Y to that of A. So this TPL version will // match any layout combination, as long as none are LayoutStride. 
-KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) + Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaSpace) + Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, 
+KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) + Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) + Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT) \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template <> \ struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ + Kokkos::Device >, \ Kokkos::View, \ + Kokkos::Device >, \ Kokkos::View, \ + Kokkos::Device > > { \ enum : bool { value = true }; \ }; -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft) + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutLeft) + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) 
KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight) + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, - Kokkos::LayoutRight) + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) #endif } // namespace Impl From 5cf9c3ea9f2378bf2e9910581c1ab047e01a7e90 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Mon, 24 Apr 2023 21:20:15 -0600 Subject: [PATCH 305/442] Formatting --- blas/impl/KokkosBlas2_ger_spec.hpp | 8 +- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 143 ++++++++++--------- 2 files changed, 83 insertions(+), 68 deletions(-) diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index 0ac2a7b673..8539893658 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -69,10 +69,10 @@ namespace Impl { // Implementation of KokkosBlas::ger. template ::value, - bool eti_spec_avail = - ger_eti_spec_avail::value> + bool tpl_spec_avail = ger_tpl_spec_avail::value, + bool eti_spec_avail = ger_eti_spec_avail::value> struct GER { static void ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 30ce6b8595..f01e1fd852 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -28,17 +28,20 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + 
MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -46,39 +49,41 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial, - Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::Serial, Kokkos::HostSpace) #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP - Kokkos::HostSpace) - 
-KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP - Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP - Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, + Kokkos::OpenMP Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, + Kokkos::OpenMP Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP Kokkos::HostSpace) + +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::OpenMP Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, + Kokkos::OpenMP Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::OpenMP Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP - Kokkos::HostSpace) + Kokkos::LayoutRight, + Kokkos::OpenMP Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, + Kokkos::OpenMP Kokkos::HostSpace) #endif #endif @@ -86,7 +91,8 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, // cuBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ template <> \ struct ger_tpl_spec_avail< \ EXEC_SPACE, \ @@ -110,79 +116,88 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) 
KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace) + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace) + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaUVMSpace) + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) #endif // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< 
\ - EXEC_SPACE, \ - Kokkos::View >, \ - Kokkos::View >, \ - Kokkos::View > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail < EXEC_SPACE, \ + Kokkos::View < const SCALAR*, LAYOUT, \ + Kokkos::Device >, \ + Kokkos::View < const SCALAR*, LAYOUT, \ + Kokkos::Device >, \ + Kokkos::View< \ + SCALAR**, LAYOUT, \ + Kokkos::Device > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIP, - Kokkos::HIPSpace) + Kokkos::HIPSpace) #endif } // namespace Impl From ac307232e3660bc11d5e769328b71def1a6cf75b Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 25 Apr 2023 12:05:06 -0600 Subject: [PATCH 306/442] Typo --- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 28 ++++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index f01e1fd852..cdaa35d23d 100644 --- 
a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -28,20 +28,20 @@ struct ger_tpl_spec_avail { // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS -#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL From ab59a34cfe10cc759a3d0f7e9b15fd5c2f025105 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 25 Apr 2023 17:00:14 -0600 Subject: [PATCH 307/442] Another typo --- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index cdaa35d23d..0f2cb55b8b 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -67,23 +67,23 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, #ifdef KOKKOS_ENABLE_OPENMP KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) 
KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, - Kokkos::OpenMP Kokkos::HostSpace) + Kokkos::OpenMP, Kokkos::HostSpace) #endif #endif From 215a006928bf9621b721a08062628ed40f0831fd Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 25 Apr 2023 17:02:01 -0600 Subject: [PATCH 308/442] Formatting --- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index 0f2cb55b8b..edcbeed72c 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -66,22 +66,22 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, #endif #ifdef KOKKOS_ENABLE_OPENMP -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP, 
Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) -KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, - Kokkos::LayoutRight, - Kokkos::OpenMP, Kokkos::HostSpace) + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace) #endif From 6e06af03c31a4a932d654b451763eb75b03fbef0 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Tue, 25 Apr 2023 23:30:19 -0600 Subject: [PATCH 309/442] Backup --- blas/unit_test/Test_Blas2_ger.hpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 0045fd48a6..a7b185bb45 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -226,9 +226,9 @@ void GerTester x("X", _M); + view_stride_adapater<_ViewTypeY> y("Y", _N); + view_stride_adapater<_ViewTypeA> A("A", _M, _N); typename _ViewTypeX::const_type c_x = x; typename _ViewTypeY::const_type c_y = y; @@ -1480,8 +1480,6 @@ TEST_F(TestCategory, ger_float) { } #endif -#if 1 - #if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -1534,5 +1532,3 @@ TEST_F(TestCategory, ger_double_int) { Kokkos::Profiling::popRegion(); } #endif - -#endif // if 1 From 961b6362af2f80c6f8c387be804528b6e2385b7b Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 26 Apr 2023 00:35:40 -0600 Subject: [PATCH 310/442] Changes for testing in blake --- blas/unit_test/Test_Blas2_ger.hpp | 41 
+++++++++++++------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index a7b185bb45..5acd758464 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -226,16 +226,9 @@ void GerTester x("X", _M); - view_stride_adapater<_ViewTypeY> y("Y", _N); - view_stride_adapater<_ViewTypeA> A("A", _M, _N); - - typename _ViewTypeX::const_type c_x = x; - typename _ViewTypeY::const_type c_y = y; - - _HostViewTypeX h_x = Kokkos::create_mirror(x); - _HostViewTypeY h_y = Kokkos::create_mirror(y); - _HostViewTypeA h_A = Kokkos::create_mirror(A); + view_stride_adapter<_ViewTypeX> x("X", _M); + view_stride_adapter<_ViewTypeY> y("Y", _N); + view_stride_adapter<_ViewTypeA> A("A", _M, _N); _ViewTypeExpected h_expected("expected A += alpha * x * y^{t,h}", _M, _N); bool expectedResultIsKnown = false; @@ -245,7 +238,7 @@ void GerTesterpopulateVariables(alpha, h_x, h_y, h_A, h_expected, x, y, A, + this->populateVariables(alpha, x.h_view, y.h_view, A.h_view, h_expected, x.d_view, y.d_view, A.d_view, expectedResultIsKnown); // ******************************************************************** @@ -255,7 +248,7 @@ void GerTesterpopulateVanillaValues(alpha, h_x, h_y, h_A, h_vanilla); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla); // ******************************************************************** // Step 4 of 9: use h_vanilla and h_expected as appropriate @@ -275,11 +268,11 @@ void GerTester org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A.d_base, A.d_base); if (test_x_y) { - this->callKkGerAndCompareAgainstExpected(alpha, x, y, A, h_A, h_expected, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view, A.d_view, A.h_view, h_expected, "non const {x,y}"); } @@ -287,9 +280,9 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, c_x, y, A, h_A, h_expected, + this->callKkGerAndCompareAgainstExpected(alpha, 
x.d_view_const, y.d_view, A.d_view, A.h_view, h_expected, "const x"); } @@ -297,9 +290,9 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x, c_y, A, h_A, h_expected, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, A.d_view, A.h_view, h_expected, "const y"); } @@ -307,18 +300,18 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, c_x, c_y, A, h_A, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view_const, A.d_view, A.h_view, h_expected, "const {x,y}"); } // ******************************************************************** // Step 9 of 9: tests with invalid values on the first input parameter // ******************************************************************** - EXPECT_ANY_THROW(KokkosBlas::ger(".", alpha, x, y, A)) + EXPECT_ANY_THROW(KokkosBlas::ger(".", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk ger should have thrown an exception for mode '.'"; - EXPECT_ANY_THROW(KokkosBlas::ger("", alpha, x, y, A)) + EXPECT_ANY_THROW(KokkosBlas::ger("", alpha, x.d_view, y.d_view, A.d_view)) << "Failed test: kk ger should have thrown an exception for mode ''"; std::cout << "Leaving GerTester::test() - - - - - - - - - - - - - - - - - - " @@ -1393,7 +1386,7 @@ int test_ger(const std::string& caseName) { "---\n"); #endif -#if 0 // Compilation error "static assertion failed Layout is not constructible +#if 1 // Compilation error "static assertion failed Layout is not constructible // from extent arguments", Kokkos_View.hpp, circa line 1537 #if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ From 89eab5240d44256e7dd3f9ac0b192aa91ab00012 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 26 Apr 2023 02:36:29 -0600 Subject: [PATCH 311/442] Changes made for compilation in blake --- blas/impl/KokkosBlas1_dot_spec.hpp | 6 +++--- blas/unit_test/Test_Blas2_ger.hpp | 8 ++++---- test_common/KokkosKernels_TestUtils.hpp | 10 +++++----- 3 files changed, 12 
insertions(+), 12 deletions(-) diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 430f357a36..510fa0fe3a 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -277,9 +277,9 @@ struct DotSpecialAccumulator { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: YV is not a Kokkos::View."); - static_assert(XV::rank == YV::rank, - "KokkosBlas::Impl::" - "DotSpecialAccumulator: X and Y have different ranks."); + //static_assert(XV::rank == YV::rank, + // "KokkosBlas::Impl::" + // "DotSpecialAccumulator: X and Y have different ranks."); static_assert(XV::rank == 1, "KokkosBlas::Impl::" "DotSpecialAccumulator: X and Y are not rank-1 Views."); diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 5acd758464..3bf1b5208a 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -226,9 +226,9 @@ void GerTester x("X", _M); - view_stride_adapter<_ViewTypeY> y("Y", _N); - view_stride_adapter<_ViewTypeA> A("A", _M, _N); + view_stride_adapter<_ViewTypeX, false> x("X", _M); + view_stride_adapter<_ViewTypeY, false> y("Y", _N); + view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); _ViewTypeExpected h_expected("expected A += alpha * x * y^{t,h}", _M, _N); bool expectedResultIsKnown = false; @@ -268,7 +268,7 @@ void GerTester org_A("Org_A", _M, _N); + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); Kokkos::deep_copy(org_A.d_base, A.d_base); if (test_x_y) { diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index cd0e651e1c..d945485a59 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -79,7 +79,7 @@ namespace Test { // deep-copied to each other. d_view aliases d_base, and h_view aliases h_base. // This means that copying between d_base and h_base // also copies between d_view and h_view. 
-template +template struct view_stride_adapter { static_assert(Kokkos::is_view_v, "view_stride_adapter: ViewType must be a Kokkos::View"); @@ -106,26 +106,26 @@ struct view_stride_adapter { if constexpr (rank == 1) { if constexpr (strided) { d_base = DViewBase(label, m, 2); - h_base = Kokkos::create_mirror_view(d_base); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); d_view = Kokkos::subview(d_base, Kokkos::ALL(), 0); h_view = Kokkos::subview(h_base, Kokkos::ALL(), 0); } else { d_base = DViewBase(label, m); - h_base = Kokkos::create_mirror_view(d_base); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); d_view = d_base; h_view = h_base; } } else { if constexpr (strided) { d_base = DViewBase(label, m, n, 2); - h_base = Kokkos::create_mirror_view(d_base); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); d_view = Kokkos::subview(d_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); h_view = Kokkos::subview(h_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); } else { d_base = DViewBase(label, m, n); - h_base = Kokkos::create_mirror_view(d_base); + h_base = createMirrorView ? 
Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); d_view = d_base; h_view = h_base; } From 4231677dbfaf417beeb74fb3ad038296fa220879 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 26 Apr 2023 02:40:57 -0600 Subject: [PATCH 312/442] Formatting --- blas/impl/KokkosBlas1_dot_spec.hpp | 2 +- blas/unit_test/Test_Blas2_ger.hpp | 16 ++++++++++------ test_common/KokkosKernels_TestUtils.hpp | 12 ++++++++---- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 510fa0fe3a..b385311d10 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -277,7 +277,7 @@ struct DotSpecialAccumulator { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: YV is not a Kokkos::View."); - //static_assert(XV::rank == YV::rank, + // static_assert(XV::rank == YV::rank, // "KokkosBlas::Impl::" // "DotSpecialAccumulator: X and Y have different ranks."); static_assert(XV::rank == 1, diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 3bf1b5208a..94fd07b85f 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -238,8 +238,8 @@ void GerTesterpopulateVariables(alpha, x.h_view, y.h_view, A.h_view, h_expected, x.d_view, y.d_view, A.d_view, - expectedResultIsKnown); + this->populateVariables(alpha, x.h_view, y.h_view, A.h_view, h_expected, + x.d_view, y.d_view, A.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 9: populate h_vanilla @@ -272,7 +272,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view, A.d_view, A.h_view, h_expected, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view, + A.d_view, A.h_view, h_expected, "non const {x,y}"); } @@ -282,7 +283,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, 
A.d_view, A.h_view, h_expected, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, + A.d_view, A.h_view, h_expected, "const x"); } @@ -292,7 +294,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, A.d_view, A.h_view, h_expected, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, + A.d_view, A.h_view, h_expected, "const y"); } @@ -302,7 +305,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view_const, A.d_view, A.h_view, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, + y.d_view_const, A.d_view, A.h_view, h_expected, "const {x,y}"); } diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index d945485a59..3a293ee54b 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -106,26 +106,30 @@ struct view_stride_adapter { if constexpr (rank == 1) { if constexpr (strided) { d_base = DViewBase(label, m, 2); - h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); d_view = Kokkos::subview(d_base, Kokkos::ALL(), 0); h_view = Kokkos::subview(h_base, Kokkos::ALL(), 0); } else { d_base = DViewBase(label, m); - h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); d_view = d_base; h_view = h_base; } } else { if constexpr (strided) { d_base = DViewBase(label, m, n, 2); - h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); + h_base = createMirrorView ? 
Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); d_view = Kokkos::subview(d_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); h_view = Kokkos::subview(h_base, Kokkos::ALL(), Kokkos::make_pair(0, n), 0); } else { d_base = DViewBase(label, m, n); - h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) : Kokkos::create_mirror(d_base); + h_base = createMirrorView ? Kokkos::create_mirror_view(d_base) + : Kokkos::create_mirror(d_base); d_view = d_base; h_view = h_base; } From 2087e70095695b99b9dfd28bf88a1108a3b8a4ca Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 26 Apr 2023 08:38:05 -0600 Subject: [PATCH 313/442] BLAS3: starting to add stream support for TPL code path of trmm/trsm --- blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp | 36 ++++--- blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp | 101 +++++++++++++------ 2 files changed, 96 insertions(+), 41 deletions(-) diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp index 44771da56f..64036ca3dc 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp @@ -27,7 +27,9 @@ namespace Impl { #define KOKKOSBLAS3_TRMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, \ MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ BViewType; \ \ - static void trmm(const char side[], const char uplo[], const char trans[], \ + static void trmm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -168,7 +171,9 @@ namespace Impl { #define KOKKOSBLAS3_TRMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, \ LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRMM, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ BViewType; \ \ - static void trmm(const char side[], const char uplo[], const 
char trans[], \ + static void trmm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -242,18 +248,24 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_layout_left) \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB); \ - else \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_layout_left) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA,\ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, N, M, \ reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, \ reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB); \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp index 0d2f38ed6e..fd8c2c31a0 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp @@ -26,6 +26,7 @@ namespace Impl { #define KOKKOSBLAS3_DTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -42,7 +43,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& /*space*/, const 
char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -94,6 +96,7 @@ namespace Impl { #define KOKKOSBLAS3_STRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -110,7 +113,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -161,7 +165,9 @@ namespace Impl { #define KOKKOSBLAS3_ZTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -178,7 +184,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -235,7 +242,9 @@ namespace Impl { #define KOKKOSBLAS3_CTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -252,7 +261,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename 
BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -357,6 +367,7 @@ namespace Impl { #define KOKKOSBLAS3_DTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -373,7 +384,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -427,13 +439,18 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ + A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB); \ - \ + A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -441,6 +458,7 @@ namespace Impl { #define KOKKOSBLAS3_STRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ struct TRSM< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -457,7 +475,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -511,12 +530,18 @@ 
namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB); \ - else \ + A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB); \ + A.data(), LDA, B.data(), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ @@ -524,7 +549,9 @@ namespace Impl { #define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -541,7 +568,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -596,16 +624,22 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - else \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ 
reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ @@ -613,7 +647,9 @@ namespace Impl { #define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM**, LAYOUTA, \ + struct TRSM< \ + ExecSpace, \ + Kokkos::View**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -630,7 +666,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const char side[], const char uplo[], const char trans[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], \ const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ @@ -685,16 +722,22 @@ namespace Impl { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (A_is_ll) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ - else \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB); \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ From 29034f31a8f8cd16e42cb285dba01c691416e647 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 26 Apr 2023 15:19:40 -0700 Subject: [PATCH 314/442] Minor changes to match L solve and U solve implementations --- 
sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 583b3e8ab9..93b6ad6844 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -398,7 +398,7 @@ struct LowerTriLvlSchedRPSolverFunctor { const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, - NGBLType nodes_grouped_by_level_) + const NGBLType &nodes_grouped_by_level_) : row_map(row_map_), entries(entries_), values(values_), @@ -412,11 +412,11 @@ struct LowerTriLvlSchedRPSolverFunctor { // Assuming indices are sorted per row, diag entry is final index in the // list - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); auto rhs_rowid = rhs(rowid); - for (auto ptr = soffset; ptr < eoffset; ++ptr) { + for (long ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { @@ -430,12 +430,12 @@ struct LowerTriLvlSchedRPSolverFunctor { KOKKOS_INLINE_FUNCTION void operator()(const UnsortedTag &, const lno_t i) const { auto rowid = nodes_grouped_by_level(i); - auto soffset = row_map(rowid); - auto eoffset = row_map(rowid + 1); + long soffset = row_map(rowid); + long eoffset = row_map(rowid + 1); auto rhs_rowid = rhs(rowid); auto diag = -1; - for (auto ptr = soffset; ptr < eoffset; ++ptr) { + for (long ptr = soffset; ptr < eoffset; ++ptr) { auto colid = entries(ptr); auto val = values(ptr); if (colid != rowid) { From 9cee1a3d7693c914415514b567f79fe7fdc3706c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Apr 2023 16:32:08 -0600 Subject: [PATCH 315/442] sparse/unit_test: Check last entry of col_map. Improve readability. 
--- .../unit_test/Test_Sparse_TestUtils_RandCsMat.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp index 4e2aa7695c..7a88e6071c 100644 --- a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp +++ b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp @@ -30,13 +30,14 @@ void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { auto map = Kokkos::create_mirror_view(map_d); Kokkos::deep_copy(map, map_d); + // Here we treat 'cm' as a Ccs matrix for (int64_t j = 0; j < cm.get_dim1(); ++j) { - int64_t row_len = j < static_cast(m) ? (map(j + 1) - map(j)) : 0; - for (int64_t i = 0; i < row_len; ++i) { - int64_t row_start = j < static_cast(m) ? map(j) : 0; - ASSERT_FLOAT_EQ(cm(row_start + i), cm(expected_nnz + i)) << cm.info; + int64_t col_len = j < static_cast(m) ? (map(j + 1) - map(j)) : 0; + for (int64_t i = 0; i < col_len; ++i) { + int64_t col_start = j < static_cast(m) ? map(j) : 0; + ASSERT_FLOAT_EQ(cm(col_start + i), cm(expected_nnz + i)) << cm.info; } - expected_nnz += row_len; + expected_nnz += col_len; } ASSERT_EQ(cm.get_nnz(), expected_nnz) << cm.info; @@ -49,6 +50,8 @@ void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { auto col_map = cm.get_map(); ASSERT_EQ(col_map.extent(0), cm.get_dim1() + 1); + + ASSERT_EQ(col_map(cm.get_dim1()), expected_nnz) << cm.info; } template From 5a5a2946ca7314ae29512aae21cd7076690f0b85 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Apr 2023 16:40:23 -0600 Subject: [PATCH 316/442] sparse: Encapsulate CooMatrix. Cleanup coo2crs TODO. 
--- sparse/src/KokkosSparse_CooMatrix.hpp | 36 ++++++++++++++---------- sparse/src/KokkosSparse_coo2crs.hpp | 9 ------ sparse/unit_test/Test_Sparse_coo2crs.hpp | 3 +- sparse/unit_test/Test_Sparse_crs2coo.hpp | 2 +- 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/sparse/src/KokkosSparse_CooMatrix.hpp b/sparse/src/KokkosSparse_CooMatrix.hpp index ccedc85530..30a41ba11c 100644 --- a/sparse/src/KokkosSparse_CooMatrix.hpp +++ b/sparse/src/KokkosSparse_CooMatrix.hpp @@ -91,15 +91,11 @@ class CooMatrix { private: size_type m_num_rows, m_num_cols; + row_view m_row; + column_view m_col; + scalar_view m_data; public: - //! The row indexes of the matrix - row_view row; - //! The column indexes of the matrix - column_view col; - //! The scalar values of the matrix - scalar_view data; - /// \brief Default constructor; constructs an empty sparse matrix. KOKKOS_INLINE_FUNCTION CooMatrix() : m_num_rows(0), m_num_cols(0) {} @@ -121,14 +117,15 @@ class CooMatrix { column_view col_in, scalar_view data_in) : m_num_rows(nrows), m_num_cols(ncols), - row(row_in), - col(col_in), - data(data_in) { - if (data.extent(0) != row.extent(0) || row.extent(0) != col.extent(0)) { + m_row(row_in), + m_col(col_in), + m_data(data_in) { + if (m_data.extent(0) != m_row.extent(0) || + m_row.extent(0) != m_col.extent(0)) { std::ostringstream os; - os << "data.extent(0): " << data.extent(0) << " != " - << "row.extent(0): " << row.extent(0) << " != " - << "col.extent(0): " << col.extent(0) << "."; + os << "data.extent(0): " << m_data.extent(0) << " != " + << "row.extent(0): " << m_row.extent(0) << " != " + << "col.extent(0): " << m_col.extent(0) << "."; KokkosKernels::Impl::throw_runtime_exception(os.str()); } } @@ -140,7 +137,16 @@ class CooMatrix { KOKKOS_INLINE_FUNCTION size_type numRows() const { return m_num_rows; } //! The number of stored entries in the sparse matrix, including zeros. 
- KOKKOS_INLINE_FUNCTION size_type nnz() const { return data.extent(0); } + KOKKOS_INLINE_FUNCTION size_type nnz() const { return m_data.extent(0); } + + //! The row indexes of the matrix + KOKKOS_INLINE_FUNCTION row_view row() const { return m_row; } + + //! The column indexes of the matrix + KOKKOS_INLINE_FUNCTION column_view col() const { return m_col; } + + //! The scalar values of the matrix + KOKKOS_INLINE_FUNCTION scalar_view data() const { return m_data; } }; /// \class is_coo_matrix diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index ef3f57fd36..90752c4c69 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -110,15 +110,6 @@ class Coo2Crs { } } - // TODO: umap.size cannot be called in a kernel. - // Requires updating Kokkos::BitSet::count() to be - // a host device function. - /* KOKKOS_INLINE_FUNCTION - void operator()(const rowmapRp1 &, const int &row_idx) const { - auto i = row_idx - 1; - m_crs_row_map(row_idx) = m_crs_row_map(i) + m_umaps[i].ptr->size(); - } */ - KOKKOS_INLINE_FUNCTION void operator()(const copyRp1 &, const int &i) const { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) diff --git a/sparse/unit_test/Test_Sparse_coo2crs.hpp b/sparse/unit_test/Test_Sparse_coo2crs.hpp index e185512d93..8a52a39220 100644 --- a/sparse/unit_test/Test_Sparse_coo2crs.hpp +++ b/sparse/unit_test/Test_Sparse_coo2crs.hpp @@ -264,7 +264,8 @@ TEST_F(TestCategory, sparse_coo2crs) { cooMat.get_col(), cooMat.get_data()); auto cooMatrix = KokkosSparse::crs2coo(crsMatrix); - check_crs_matrix(crsMatrix, cooMatrix.row, cooMatrix.col, cooMatrix.data); + check_crs_matrix(crsMatrix, cooMatrix.row(), cooMatrix.col(), + cooMatrix.data()); } TEST_F(TestCategory, sparse_coo2crs_staticMatrix_edgeCases) { diff --git a/sparse/unit_test/Test_Sparse_crs2coo.hpp b/sparse/unit_test/Test_Sparse_crs2coo.hpp index 71d01e6005..5f7cc64074 100644 --- a/sparse/unit_test/Test_Sparse_crs2coo.hpp +++ 
b/sparse/unit_test/Test_Sparse_crs2coo.hpp @@ -96,7 +96,7 @@ void doCrs2Coo(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { crsMat.get_nnz(), crsMat.get_vals(), map, ids); auto cooMat = KokkosSparse::crs2coo(crsMatrix); - check_coo_matrix(crsMatrix, cooMat.row, cooMat.col, cooMat.data); + check_coo_matrix(crsMatrix, cooMat.row(), cooMat.col(), cooMat.data()); } template From 31f2b05558530730ef64b0878c51c44040c23c75 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 26 Apr 2023 19:21:57 -0600 Subject: [PATCH 317/442] Fix name mismatch with rocblas tpl spec layer --- blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index f4ca77ba69..5c5a6008ec 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -107,10 +107,10 @@ KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, template <> \ struct scal_tpl_spec_avail< \ EXECSPACE, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ From 99cbf779da5f8c7f61acd4cf77a02f9d731b14ff Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Wed, 26 Apr 2023 19:43:38 -0600 Subject: [PATCH 318/442] Possible corrections for test on blake --- blas/unit_test/Test_Blas2_ger.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 94fd07b85f..3c2c4b0f67 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -42,7 +42,7 @@ class GerTester { typedef typename _ViewTypeX::HostMirror _HostViewTypeX; typedef typename _ViewTypeY::HostMirror _HostViewTypeY; typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View 
_ViewTypeExpected; + typedef Kokkos::View _ViewTypeExpected; typedef Kokkos::ArithTraits _KAT_A; typedef typename _KAT_A::mag_type _AuxType; @@ -230,7 +230,7 @@ void GerTester y("Y", _N); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - _ViewTypeExpected h_expected("expected A += alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * y^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(0.); @@ -238,17 +238,17 @@ void GerTesterpopulateVariables(alpha, x.h_view, y.h_view, A.h_view, h_expected, + this->populateVariables(alpha, x.h_view, y.h_view, A.h_view, h_expected.d_view, x.d_view, y.d_view, A.d_view, expectedResultIsKnown); // ******************************************************************** // Step 3 of 9: populate h_vanilla // ******************************************************************** - _ViewTypeExpected h_vanilla("vanilla = A + alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * y^{t,h}", _M, _N); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); - this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); // ******************************************************************** // Step 4 of 9: use h_vanilla and h_expected as appropriate @@ -257,12 +257,12 @@ void GerTestercompareVanillaExpected(alpha, h_vanilla, h_expected); + this->compareVanillaExpected(alpha, h_vanilla.d_view, h_expected.d_view); } else { // ****************************************************************** // Copy h_vanilla to h_expected // ****************************************************************** - Kokkos::deep_copy(h_expected, h_vanilla); + Kokkos::deep_copy(h_expected.h_base, h_vanilla.h_base); } // 
******************************************************************** @@ -273,7 +273,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view, - A.d_view, A.h_view, h_expected, + A.d_view, A.h_view, h_expected.d_view, "non const {x,y}"); } @@ -284,7 +284,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, - A.d_view, A.h_view, h_expected, + A.d_view, A.h_view, h_expected.d_view, "const x"); } @@ -295,7 +295,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, - A.d_view, A.h_view, h_expected, + A.d_view, A.h_view, h_expected.d_view, "const y"); } @@ -307,7 +307,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view_const, A.d_view, A.h_view, - h_expected, "const {x,y}"); + h_expected.d_view, "const {x,y}"); } // ******************************************************************** From 0e26dd1a1a79eed45add26d53767edbff999bc75 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 27 Apr 2023 11:51:54 -0600 Subject: [PATCH 319/442] Tests passing now at blake --- blas/unit_test/Test_Blas2_ger.hpp | 91 +++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 24 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 3c2c4b0f67..54d97f1915 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -262,7 +262,7 @@ void GerTester::value + || std::is_same::value + || std::is_same>::value + || std::is_same>::value; + bool yBool = std::is_same::value + || std::is_same::value + || std::is_same>::value + || std::is_same>::value; + bool aBool = std::is_same::value + || std::is_same::value + || std::is_same>::value + || std::is_same>::value; + bool useAnalyticalResults = xBool && yBool && aBool; + #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -1338,13 +1352,23 @@ int 
test_ger(const std::string& caseName) { tester.test(1, 2, 0); tester.test(13, 13, 0); tester.test(13, 1024, 0); - tester.test(13, 1024, 0, true, false); - tester.test(13, 1024, 0, true, true); + if (useAnalyticalResults) { + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + } + else { + tester.test(13, 1024, 0, false, true); + } tester.test(50, 40, 4); tester.test(1024, 1024, 0); tester.test(2131, 2131, 0); - tester.test(2131, 2131, 0, true, false); - tester.test(2131, 2131, 0, true, true); + if (useAnalyticalResults) { + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } + else { + tester.test(2131, 2131, 0, false, true); + } } KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTLEFT\n", @@ -1374,13 +1398,23 @@ int test_ger(const std::string& caseName) { tester.test(1, 2, 0); tester.test(13, 13, 0); tester.test(13, 1024, 0); - tester.test(13, 1024, 0, true, false); - tester.test(13, 1024, 0, true, true); + if (useAnalyticalResults) { + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + } + else { + tester.test(13, 1024, 0, false, true); + } tester.test(50, 40, 4); tester.test(1024, 1024, 0); tester.test(2131, 2131, 0); - tester.test(2131, 2131, 0, true, false); - tester.test(2131, 2131, 0, true, true); + if (useAnalyticalResults) { + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } + else { + tester.test(2131, 2131, 0, false, true); + } } KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTRIGHT\n", @@ -1390,10 +1424,7 @@ int test_ger(const std::string& caseName) { "---\n"); #endif -#if 1 // Compilation error "static assertion failed Layout is not constructible - // from extent arguments", Kokkos_View.hpp, circa line 1537 -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1409,13 +1440,23 @@ int test_ger(const std::string& caseName) { tester.test(1024, 0, 0); tester.test(13, 13, 0); tester.test(13, 1024, 0); - tester.test(13, 1024, 0, true, false); - tester.test(13, 1024, 0, true, true); + if (useAnalyticalResults) { + tester.test(13, 1024, 0, true, false); + tester.test(13, 1024, 0, true, true); + } + else { + tester.test(13, 1024, 0, false, true); + } tester.test(50, 40, 4); tester.test(1024, 1024, 0); tester.test(2131, 2131, 0); - tester.test(2131, 2131, 0, true, false); - tester.test(2131, 2131, 0, true, true); + if (useAnalyticalResults) { + tester.test(2131, 2131, 0, true, false); + tester.test(2131, 2131, 0, true, true); + } + else { + tester.test(2131, 2131, 0, false, true); + } } KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s for LAYOUTSTRIDE\n", @@ -1438,8 +1479,13 @@ int test_ger(const std::string& caseName) { ScalarA, Kokkos::LayoutRight, Device> tester; tester.test(1024, 1024, 0); - tester.test(1024, 1024, 0, true, false); - tester.test(1024, 1024, 0, true, true); + if (useAnalyticalResults) { + tester.test(1024, 1024, 0, true, false); + tester.test(1024, 1024, 0, true, true); + } + else { + tester.test(1024, 1024, 0, false, true); + } } if (true) { @@ -1455,9 +1501,6 @@ int test_ger(const std::string& caseName) { "+-----------------------------------------------------------------------" "---\n"); #endif -#endif // Compilation error "static assertion failed Layout is not - // constructible from extent arguments", Kokkos_View.hpp, circa line - // 1537 KOKKOS_IMPL_DO_NOT_USE_PRINTF("Finished %s\n", caseName.c_str()); KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -1523,8 +1566,8 @@ TEST_F(TestCategory, ger_int) { #if !defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, ger_double_int) { - Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int"); +TEST_F(TestCategory, 
ger_double_int_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::ger_double_int_float"); test_ger("test case ger_mixed_types"); Kokkos::Profiling::popRegion(); } From 7720c8199dba26b26d9aa46cc7a9f31bad911ce8 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 27 Apr 2023 11:59:21 -0600 Subject: [PATCH 320/442] Added explanations --- test_common/KokkosKernels_TestUtils.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 3a293ee54b..fc4fbed204 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -79,6 +79,11 @@ namespace Test { // deep-copied to each other. d_view aliases d_base, and h_view aliases h_base. // This means that copying between d_base and h_base // also copies between d_view and h_view. +// +// If the Boolean template parameter 'createMirrorView' is: +// - 'true' (default value), then this utility class will use +// Kokkos::create_mirror_view(); +// - 'false', then this utility class will use Kokkos::create_mirror() template struct view_stride_adapter { static_assert(Kokkos::is_view_v, From 19903279d3184a74d9eafe3f9750d460877a3508 Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 27 Apr 2023 12:03:54 -0600 Subject: [PATCH 321/442] Formatting --- blas/unit_test/Test_Blas2_ger.hpp | 78 +++++++++++++++---------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 54d97f1915..7e9ed08d88 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -42,7 +42,8 @@ class GerTester { typedef typename _ViewTypeX::HostMirror _HostViewTypeX; typedef typename _ViewTypeY::HostMirror _HostViewTypeY; typedef typename _ViewTypeA::HostMirror _HostViewTypeA; - typedef Kokkos::View _ViewTypeExpected; + typedef Kokkos::View + _ViewTypeExpected; typedef Kokkos::ArithTraits _KAT_A; 
typedef typename _KAT_A::mag_type _AuxType; @@ -230,7 +231,8 @@ void GerTester y("Y", _N); view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); - view_stride_adapter<_ViewTypeExpected, true> h_expected("expected A += alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_expected( + "expected A += alpha * x * y^{t,h}", _M, _N); bool expectedResultIsKnown = false; ScalarA alpha(0.); @@ -238,17 +240,20 @@ void GerTesterpopulateVariables(alpha, x.h_view, y.h_view, A.h_view, h_expected.d_view, - x.d_view, y.d_view, A.d_view, expectedResultIsKnown); + this->populateVariables(alpha, x.h_view, y.h_view, A.h_view, + h_expected.d_view, x.d_view, y.d_view, A.d_view, + expectedResultIsKnown); // ******************************************************************** // Step 3 of 9: populate h_vanilla // ******************************************************************** - view_stride_adapter<_ViewTypeExpected, true> h_vanilla("vanilla = A + alpha * x * y^{t,h}", _M, _N); + view_stride_adapter<_ViewTypeExpected, true> h_vanilla( + "vanilla = A + alpha * x * y^{t,h}", _M, _N); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "In Test_Blas2_ger.hpp, computing vanilla A with alpha type = %s\n", typeid(alpha).name()); - this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, h_vanilla.d_view); + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, + h_vanilla.d_view); // ******************************************************************** // Step 4 of 9: use h_vanilla and h_expected as appropriate @@ -272,9 +277,9 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view, - A.d_view, A.h_view, h_expected.d_view, - "non const {x,y}"); + this->callKkGerAndCompareAgainstExpected( + alpha, x.d_view, y.d_view, A.d_view, A.h_view, h_expected.d_view, + "non const {x,y}"); } // ******************************************************************** @@ -284,8 +289,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, 
y.d_view, - A.d_view, A.h_view, h_expected.d_view, - "const x"); + A.d_view, A.h_view, + h_expected.d_view, "const x"); } // ******************************************************************** @@ -295,8 +300,8 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, - A.d_view, A.h_view, h_expected.d_view, - "const y"); + A.d_view, A.h_view, + h_expected.d_view, "const y"); } // ******************************************************************** @@ -1318,18 +1323,18 @@ int test_ger(const std::string& caseName) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Starting %s, device = %s ...\n", caseName.c_str(), typeid(Device).name()); - bool xBool = std::is_same::value - || std::is_same::value - || std::is_same>::value - || std::is_same>::value; - bool yBool = std::is_same::value - || std::is_same::value - || std::is_same>::value - || std::is_same>::value; - bool aBool = std::is_same::value - || std::is_same::value - || std::is_same>::value - || std::is_same>::value; + bool xBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool yBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool aBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; bool useAnalyticalResults = xBool && yBool && aBool; #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ @@ -1355,8 +1360,7 @@ int test_ger(const std::string& caseName) { if (useAnalyticalResults) { tester.test(13, 1024, 0, true, false); tester.test(13, 1024, 0, true, true); - } - else { + } else { tester.test(13, 1024, 0, false, true); } tester.test(50, 40, 4); @@ -1365,8 +1369,7 @@ int test_ger(const std::string& caseName) { if (useAnalyticalResults) { tester.test(2131, 2131, 0, true, false); tester.test(2131, 2131, 0, true, true); - } - else { + } else { tester.test(2131, 2131, 0, false, true); } } @@ -1401,8 +1404,7 @@ int test_ger(const std::string& 
caseName) { if (useAnalyticalResults) { tester.test(13, 1024, 0, true, false); tester.test(13, 1024, 0, true, true); - } - else { + } else { tester.test(13, 1024, 0, false, true); } tester.test(50, 40, 4); @@ -1411,8 +1413,7 @@ int test_ger(const std::string& caseName) { if (useAnalyticalResults) { tester.test(2131, 2131, 0, true, false); tester.test(2131, 2131, 0, true, true); - } - else { + } else { tester.test(2131, 2131, 0, false, true); } } @@ -1424,7 +1425,7 @@ int test_ger(const std::string& caseName) { "---\n"); #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) KOKKOS_IMPL_DO_NOT_USE_PRINTF( "+-----------------------------------------------------------------------" @@ -1443,8 +1444,7 @@ int test_ger(const std::string& caseName) { if (useAnalyticalResults) { tester.test(13, 1024, 0, true, false); tester.test(13, 1024, 0, true, true); - } - else { + } else { tester.test(13, 1024, 0, false, true); } tester.test(50, 40, 4); @@ -1453,8 +1453,7 @@ int test_ger(const std::string& caseName) { if (useAnalyticalResults) { tester.test(2131, 2131, 0, true, false); tester.test(2131, 2131, 0, true, true); - } - else { + } else { tester.test(2131, 2131, 0, false, true); } } @@ -1482,8 +1481,7 @@ int test_ger(const std::string& caseName) { if (useAnalyticalResults) { tester.test(1024, 1024, 0, true, false); tester.test(1024, 1024, 0, true, true); - } - else { + } else { tester.test(1024, 1024, 0, false, true); } } From 0055303541c3a379999227702fafe2009bb4c32d Mon Sep 17 00:00:00 2001 From: Ernesto Prudencio Date: Thu, 27 Apr 2023 13:36:03 -0600 Subject: [PATCH 322/442] Minor compilation error. Thanks to Luc for the proper suggestion. 
--- blas/impl/KokkosBlas1_dot_spec.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index b385311d10..f16d4a55fe 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -277,9 +277,9 @@ struct DotSpecialAccumulator { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "DotSpecialAccumulator: YV is not a Kokkos::View."); - // static_assert(XV::rank == YV::rank, - // "KokkosBlas::Impl::" - // "DotSpecialAccumulator: X and Y have different ranks."); + static_assert(static_cast(XV::rank) == static_cast(YV::rank), + "KokkosBlas::Impl::" + "DotSpecialAccumulator: X and Y have different ranks."); static_assert(XV::rank == 1, "KokkosBlas::Impl::" "DotSpecialAccumulator: X and Y are not rank-1 Views."); From e65f6114724f334e402c38933b810a5ebdcd3f1d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 1 May 2023 09:16:41 -0600 Subject: [PATCH 323/442] sparse/unit_test: Use host mirror of RandCsMatrix map --- sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp index 7a88e6071c..aa6d938684 100644 --- a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp +++ b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp @@ -51,7 +51,7 @@ void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { auto col_map = cm.get_map(); ASSERT_EQ(col_map.extent(0), cm.get_dim1() + 1); - ASSERT_EQ(col_map(cm.get_dim1()), expected_nnz) << cm.info; + ASSERT_EQ(map(cm.get_dim1()), expected_nnz) << cm.info; } template From a725974a3207ca0982b94a80f0e560bdb064afc9 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 2 May 2023 11:58:49 -0700 Subject: [PATCH 324/442] Minor fixes for sptrsv cuSPARSE --- sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index b3d8753526..00861727bc 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -446,7 +446,7 @@ template void sptrsvcuSPARSE_solve_streams( const std::vector &execspace_v, - const std::vector &handle_v, + std::vector &handle_v, const std::vector &row_map_v, const std::vector &entries_v, const std::vector &values_v, @@ -459,7 +459,7 @@ void sptrsvcuSPARSE_solve_streams( using scalar_type = typename KernelHandle::nnz_scalar_t; using memory_space = typename KernelHandle::HandlePersistentMemorySpace; using sptrsvHandleType = typename KernelHandle::SPTRSVHandleType; - usinf sptrsvCuSparseHandleType = typename sptrsvHandleType::SPTRSVcuSparseHandleType; + using sptrsvCuSparseHandleType = typename sptrsvHandleType::SPTRSVcuSparseHandleType; int nstreams = execspace_v.size(); #if (CUDA_VERSION >= 11030) From c00c8a6e32f56f6e37f5e039d049ccb8a886bdfb Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 2 May 2023 14:33:56 -0600 Subject: [PATCH 325/442] BLAS2/3: fixing some TPLs issues with execution space code path --- blas/impl/KokkosBlas3_trmm_spec.hpp | 19 ++++---- blas/impl/KokkosBlas3_trsm_spec.hpp | 19 ++++---- blas/src/KokkosBlas3_gemm.hpp | 2 +- blas/src/KokkosBlas3_trmm.hpp | 44 +++++++++++++++++-- blas/src/KokkosBlas3_trsm.hpp | 44 +++++++++++++++++-- blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp | 4 +- blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp | 4 +- ...kosSparse_spgemm_numeric_tpl_spec_decl.hpp | 2 +- 8 files changed, 111 insertions(+), 27 deletions(-) diff --git a/blas/impl/KokkosBlas3_trmm_spec.hpp b/blas/impl/KokkosBlas3_trmm_spec.hpp index 50d74b659f..26f918c652 100644 --- a/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -26,7 +26,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct 
which defines whether a specialization exists -template +template struct trmm_eti_spec_avail { enum : bool { value = false }; }; @@ -40,6 +40,7 @@ struct trmm_eti_spec_avail { EXEC_SPACE, MEM_SPACE) \ template <> \ struct trmm_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -67,19 +68,19 @@ namespace Impl { // // Unification layer -template ::value, - bool eti_spec_avail = trmm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = trmm_eti_spec_avail::value> struct TRMM { - static void trmm(const char side[], const char uplo[], const char trans[], + static void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BVIT::const_value_type& alpha, const AVIT& A, const BVIT& B); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct TRMM { - static void trmm(const char side[], const char uplo[], const char trans[], +template +struct TRMM { + static void trmm(const execution_space& /*space*/, const char side[], const char uplo[], const char trans[], const char diag[], typename BVIT::const_value_type& alpha, const AVIT& A, const BVIT& B) { static_assert(Kokkos::is_view::value, "AVIT must be a Kokkos::View."); @@ -121,6 +122,7 @@ struct TRMM { #define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ extern template struct TRMM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -131,6 +133,7 @@ struct TRMM { #define KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ template struct TRMM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/impl/KokkosBlas3_trsm_spec.hpp b/blas/impl/KokkosBlas3_trsm_spec.hpp index d05dad2275..9d7a5bdc14 100644 --- a/blas/impl/KokkosBlas3_trsm_spec.hpp +++ b/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosBlas { namespace Impl { // Specialization 
struct which defines whether a specialization exists -template +template struct trsm_eti_spec_avail { enum : bool { value = false }; }; @@ -46,6 +46,7 @@ struct trsm_eti_spec_avail { EXEC_SPACE, MEM_SPACE) \ template <> \ struct trsm_eti_spec_avail< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -71,11 +72,11 @@ namespace Impl { // Unification layer template < - class AViewType, class BViewType, - bool tpl_spec_avail = trsm_tpl_spec_avail::value, - bool eti_spec_avail = trsm_eti_spec_avail::value> + class execution_space, class AViewType, class BViewType, + bool tpl_spec_avail = trsm_tpl_spec_avail::value, + bool eti_spec_avail = trsm_eti_spec_avail::value> struct TRSM { - static void trsm(const char side[], const char uplo[], const char trans[], + static void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B); @@ -83,9 +84,9 @@ struct TRSM { // Implementation of KokkosBlas::trsm. 
#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct TRSM { - static void trsm(const char side[], const char uplo[], const char trans[], +template +struct TRSM { + static void trsm(const execution_space& /*space*/, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { @@ -134,6 +135,7 @@ struct TRSM { #define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ extern template struct TRSM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -144,6 +146,7 @@ struct TRSM { #define KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUTA, LAYOUTB, \ EXEC_SPACE, MEM_SPACE) \ template struct TRSM< \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index 5b7eccb222..808fe8fb88 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -209,7 +209,7 @@ void gemm(const execution_space& space, const char transA[], typename CViewType::device_type, Kokkos::MemoryTraits> CVT; - typedef Impl::GEMM impl_type; + typedef Impl::GEMM impl_type; impl_type::gemm(space, transA, transB, alpha, A, B, beta, C); } diff --git a/blas/src/KokkosBlas3_trmm.hpp b/blas/src/KokkosBlas3_trmm.hpp index 7e2cbd5b88..741c415a9d 100644 --- a/blas/src/KokkosBlas3_trmm.hpp +++ b/blas/src/KokkosBlas3_trmm.hpp @@ -56,8 +56,8 @@ namespace KokkosBlas { /// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View /// On entry, M-by-N matrix /// On exit, overwritten with the solution -template -void trmm(const char side[], const char uplo[], const char trans[], +template +void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { 
static_assert(Kokkos::is_view::value, @@ -143,8 +143,44 @@ void trmm(const char side[], const char uplo[], const char trans[], typename BViewType::device_type, Kokkos::MemoryTraits >; - KokkosBlas::Impl::TRMM::trmm( - side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRMM::trmm( + space, side, uplo, trans, diag, alpha, A, B); +} + +/// \brief Solve triangular linear system with multiple RHSs: +/// B = alpha * op(A) * B if side == "L" or "l" +/// B = alpha * B * op(A) if side == "R" or "r" +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D +/// Kokkos::View +/// +/// \param side [in] "L" or "l" indicates matrix A is on the left of B +/// "R" or "r" indicates matrix A is on the right of B +/// \param uplo [in] "U" or "u" indicates matrix A is an upper triangular +/// matrix +/// "L" or "l" indicates matrix A is a lower triangular matrix +/// \param trans [in] Specifies what op does to A: +// "N" or "n" for non-transpose, +// "T" or "t" for transpose, +// "C" or "c" for conjugate transpose. 
+/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit +// "N" or "n" indicates the diagonal of A is assumed to be +// non-unit +/// \param alpha [in] Input coefficient used for +// multiplication with either A or B +/// \param A [in] Input matrix, as a 2-D Kokkos::View +/// If side == "L" or "l", matrix A is a M-by-M triangular +/// matrix; otherwise, matrix A is a N-by-N triangular matrix +/// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View +/// On entry, M-by-N matrix +/// On exit, overwritten with the solution +template +void trmm(const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B) { + trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 2e8d2f4cfa..42b5e7de7d 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -30,11 +30,16 @@ namespace KokkosBlas { /// \brief Solve triangular linear system with multiple RHSs: /// op(A)*X = alpha*B if side == "L" or "l" /// X*op(A) = alpha*B if side == "R" or "r" +/// This function is currently blocking when running the native implementation +/// which only has a serial implementation. /// +/// \tparam execution_space a Kokkos execution space to run the kernels on. /// \tparam AViewType Input matrix, as a 2-D Kokkos::View /// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D /// Kokkos::View /// +/// \param space [in] an execution space instance that may contain a stream +/// or a queue to execute the kernel on, this only works with TPLs at the moment. 
/// \param side [in] "L" or "l" indicates matrix A is on the left of X /// "R" or "r" indicates matrix A is on the right of X /// \param uplo [in] "U" or "u" indicates matrix A upper part is stored, the @@ -54,8 +59,8 @@ namespace KokkosBlas { /// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View /// On entry, M-by-N matrix of multile RHS /// On exit, overwritten with the solution X -template -void trsm(const char side[], const char uplo[], const char trans[], +template +void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { static_assert(Kokkos::is_view::value, @@ -141,9 +146,42 @@ void trsm(const char side[], const char uplo[], const char trans[], typename BViewType::device_type, Kokkos::MemoryTraits >; - KokkosBlas::Impl::TRSM::trsm(side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRSM::trsm(space, side, uplo, trans, diag, alpha, A, B); } +/// \brief Solve triangular linear system with multiple RHSs: +/// op(A)*X = alpha*B if side == "L" or "l" +/// X*op(A) = alpha*B if side == "R" or "r" +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D +/// Kokkos::View +/// +/// \param side [in] "L" or "l" indicates matrix A is on the left of X +/// "R" or "r" indicates matrix A is on the right of X +/// \param uplo [in] "U" or "u" indicates matrix A upper part is stored, the +/// other part is not referenced +/// "L" or "l" indicates matrix A lower part is stored, the +/// other part is not referenced +/// \param trans [in] "N" or "n" for non-transpose, "T" or "t" for transpose, +/// "C" or "c" for conjugate transpose. 
+/// \param diag [in] "U" or "u" indicates the diagonal of A is assumed to be +/// unit +// "N" or "n" indicated the diagonal of A is assumed to be +// non-unit +/// \param alpha [in] Input coefficient used for multiplication with B +/// \param A [in] Input matrix, as a 2-D Kokkos::View +/// If side == "L" or "l", matrix A is a M-by-M triangular +/// matrix; otherwise, matrix A is a N-by-N triangular matrix +/// \param B [in,out] Input/Output matrix, as a 2-D Kokkos::View +/// On entry, M-by-N matrix of multile RHS +/// On exit, overwritten with the solution X +template +void trsm(const char side[], const char uplo[], const char trans[], + const char diag[], typename BViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B) { + trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); +} } // namespace KokkosBlas #endif // KOKKOS_BLAS3_TRSM_HPP_ diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index e44f3a9db7..9e40ba2e89 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -21,7 +21,7 @@ namespace KokkosBlas { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct trmm_tpl_spec_avail { enum : bool { value = false }; }; @@ -33,6 +33,7 @@ struct trmm_tpl_spec_avail { MEMSPACE) \ template \ struct trmm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -71,6 +72,7 @@ KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, MEMSPACE) \ template \ struct trmm_tpl_spec_avail< \ + Kokkos::Cuda, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp index 2af72d4950..d1836809ec 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp @@ -21,7 +21,7 @@ namespace KokkosBlas { 
namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct trsm_tpl_spec_avail { enum : bool { value = false }; }; @@ -33,6 +33,7 @@ struct trsm_tpl_spec_avail { MEMSPACE) \ template \ struct trsm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -71,6 +72,7 @@ KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, MEMSPACE) \ template \ struct trsm_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 5f555f926e..cda083a6b5 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -129,7 +129,7 @@ template void spgemm_numeric_cusparse( - KernelHandle *handle, lno_t m, lno_t n, lno_t k, + KernelHandle *handle, lno_t /*m*/, lno_t /*n*/, lno_t /*k*/, const ConstRowMapType &row_mapA, const ConstEntriesType &entriesA, const ConstValuesType &valuesA, const ConstRowMapType &row_mapB, const ConstEntriesType &entriesB, const ConstValuesType &valuesB, From db991036eebabc51b1a37c82e8a0d3cc8dc961f9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 2 May 2023 15:03:20 -0600 Subject: [PATCH 326/442] BLAS2/3: applying clang-format --- blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp | 2 +- blas/impl/KokkosBlas3_gemm_spec.hpp | 88 +++++------ blas/impl/KokkosBlas3_trmm_spec.hpp | 23 +-- blas/impl/KokkosBlas3_trsm_spec.hpp | 26 ++-- blas/src/KokkosBlas2_gemv.hpp | 36 +++-- blas/src/KokkosBlas3_gemm.hpp | 50 +++--- blas/src/KokkosBlas3_trmm.hpp | 22 ++- blas/src/KokkosBlas3_trsm.hpp | 17 ++- blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 77 ++++------ blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp | 40 +++-- blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp | 144 ++++++++---------- 12 files changed, 262 
insertions(+), 265 deletions(-) diff --git a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 84963d1d2e..26c4c9624a 100644 --- a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -77,7 +77,7 @@ struct DotBasedGEMM { // Initialize C matrix if beta != 1 if (beta == CVT::zero()) { Kokkos::MDRangePolicy> policyInit( - space, {0, 0}, {numCrows, numCcols}); + space, {0, 0}, {numCrows, numCcols}); Kokkos::parallel_for("Initialize C for Dot Product Based GEMM", policyInit, *this); } else if (beta != CVT::one()) { diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index 08c274bf10..5329ec1a9d 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -85,14 +85,15 @@ namespace Impl { // // Implementation of KokkosBlas::gemm. -template ::value, - bool eti_spec_avail = - gemm_eti_spec_avail::value> +template < + class execution_space, class AViewType, class BViewType, class CViewType, + bool tpl_spec_avail = gemm_tpl_spec_avail::value, + bool eti_spec_avail = gemm_eti_spec_avail::value> struct GEMM { - static void gemm(const execution_space& space, - const char transA[], const char transB[], + static void gemm(const execution_space& space, const char transA[], + const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, @@ -145,8 +146,8 @@ struct GEMM { // call dot-based GEMM, only for C := beta * C + alpha * A^T * B, on // device bool A_is_conj = ((transA[0] == 'C') || (transA[0] == 'c')); - DotBasedGEMM dotBasedGemm( - alpha, A, B, beta, C); + DotBasedGEMM + dotBasedGemm(alpha, A, B, beta, C); dotBasedGemm.run(space, A_is_conj); } else { @@ -168,15 +169,15 @@ struct GEMM { 24000) ? 
4 : 16; - int vector_length = blockB1 / 4; - int max_vector_length = KokkosKernels::Impl::kk_get_max_vector_size< - execution_space>(); + int vector_length = blockB1 / 4; + int max_vector_length = + KokkosKernels::Impl::kk_get_max_vector_size(); if (vector_length > max_vector_length) vector_length = max_vector_length; // Compute scratch space size - typedef KokkosBlas::Impl::GEMMImpl + typedef KokkosBlas::Impl::GEMMImpl gemm_dummy_type; const int scratch_memory_size = gemm_dummy_type::ViewTypeAScratch::required_allocation_size() + @@ -187,96 +188,83 @@ struct GEMM { // Figure out Team Sizes int team_size = 1; #if defined(KOKKOS_ENABLE_CUDA) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_HIP) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_ROCM) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif #if defined(KOKKOS_ENABLE_SYCL) - if (std::is_same::value) + if (std::is_same::value) team_size = blockA0; #endif // Call the correct kernel if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'N' || transB[0] == 'n')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, 
team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'T' || transB[0] == 't')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'C' || transB[0] == 'c')) { - KokkosBlas::Impl::GEMMImpl + KokkosBlas::Impl::GEMMImpl gemm(alpha, A, B, beta, C); gemm.run(space, team_size, vector_length, scratch_level); } diff --git a/blas/impl/KokkosBlas3_trmm_spec.hpp b/blas/impl/KokkosBlas3_trmm_spec.hpp index 26f918c652..fe3096957a 100644 --- a/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -69,20 +69,25 @@ namespace Impl { // Unification layer template ::value, - bool eti_spec_avail = trmm_eti_spec_avail::value> + bool tpl_spec_avail = + trmm_tpl_spec_avail::value, + bool eti_spec_avail = + trmm_eti_spec_avail::value> struct TRMM { - static void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], - const char diag[], typename BVIT::const_value_type& alpha, - const AVIT& A, const BVIT& B); + static void trmm(const execution_space& space, const char side[], + const char uplo[], const char trans[], const char diag[], 
+ typename BVIT::const_value_type& alpha, const AVIT& A, + const BVIT& B); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct TRMM { - static void trmm(const execution_space& /*space*/, const char side[], const char uplo[], const char trans[], - const char diag[], typename BVIT::const_value_type& alpha, - const AVIT& A, const BVIT& B) { +struct TRMM { + static void trmm(const execution_space& /*space*/, const char side[], + const char uplo[], const char trans[], const char diag[], + typename BVIT::const_value_type& alpha, const AVIT& A, + const BVIT& B) { static_assert(Kokkos::is_view::value, "AVIT must be a Kokkos::View."); static_assert(Kokkos::is_view::value, "BVIT must be a Kokkos::View."); static_assert(static_cast(AVIT::rank) == 2, "AVIT must have rank 2."); diff --git a/blas/impl/KokkosBlas3_trsm_spec.hpp b/blas/impl/KokkosBlas3_trsm_spec.hpp index 9d7a5bdc14..08e1edd0de 100644 --- a/blas/impl/KokkosBlas3_trsm_spec.hpp +++ b/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -46,7 +46,7 @@ struct trsm_eti_spec_avail { EXEC_SPACE, MEM_SPACE) \ template <> \ struct trsm_eti_spec_avail< \ - EXEC_SPACE, \ + EXEC_SPACE, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -71,13 +71,14 @@ namespace Impl { // // Unification layer -template < - class execution_space, class AViewType, class BViewType, - bool tpl_spec_avail = trsm_tpl_spec_avail::value, - bool eti_spec_avail = trsm_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + trsm_eti_spec_avail::value> struct TRSM { - static void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], - const char diag[], + static void trsm(const execution_space& space, const char side[], + const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B); }; @@ -85,9 +86,10 @@ struct TRSM { // Implementation of KokkosBlas::trsm. 
#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY template -struct TRSM { - static void trsm(const execution_space& /*space*/, const char side[], const char uplo[], const char trans[], - const char diag[], +struct TRSM { + static void trsm(const execution_space& /*space*/, const char side[], + const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { static_assert(Kokkos::is_view::value, @@ -135,7 +137,7 @@ struct TRSM, \ Kokkos::MemoryTraits >, \ @@ -146,7 +148,7 @@ struct TRSM, \ Kokkos::MemoryTraits >, \ diff --git a/blas/src/KokkosBlas2_gemv.hpp b/blas/src/KokkosBlas2_gemv.hpp index 11d31a741c..a8ebf02ca3 100644 --- a/blas/src/KokkosBlas2_gemv.hpp +++ b/blas/src/KokkosBlas2_gemv.hpp @@ -49,13 +49,15 @@ namespace KokkosBlas { /// \param x [in] Input vector, as a 1-D Kokkos::View /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View -template +template void gemv(const execution_space& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::gemv: execution_space must be a valid Kokkos execution space."); + "KokkosBlas::gemv: execution_space must be a valid Kokkos " + "execution space."); static_assert(Kokkos::is_view::value, "KokkosBlas::gemv: AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -68,16 +70,26 @@ void gemv(const execution_space& space, const char trans[], "KokkosBlas::gemv: XViewType must have rank 1."); static_assert(static_cast(YViewType::rank) == 1, "KokkosBlas::gemv: YViewType must have rank 1."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: AViewType must be accessible from execution_space"); - 
static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: XViewType must be accessible from execution_space"); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemv: YViewType must be accessible from execution_space"); - static_assert(Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemv: AViewType must be assignable to YViewType"); - static_assert(Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemv: XViewType must be assignable to YViewType"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: AViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: XViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemv: YViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemv: AViewType must be assignable to YViewType"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemv: XViewType must be assignable to YViewType"); // Check compatibility of dimensions at run time. if (trans[0] == 'N' || trans[0] == 'n') { diff --git a/blas/src/KokkosBlas3_gemm.hpp b/blas/src/KokkosBlas3_gemm.hpp index 808fe8fb88..0cb00c8493 100644 --- a/blas/src/KokkosBlas3_gemm.hpp +++ b/blas/src/KokkosBlas3_gemm.hpp @@ -38,12 +38,13 @@ namespace Impl { // This case must be intercepted here rather than impl in order to call TPL // GEMV instead of TPL GEMM. This codepath was measured to be profitable with // cuBLAS. 
-template +template bool gemv_based_gemm( - const execution_space& space, const char transA[], - const char transB[], typename AViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B, - typename CViewType::const_value_type& beta, const CViewType& C, + const execution_space& space, const char transA[], const char transB[], + typename AViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B, typename CViewType::const_value_type& beta, + const CViewType& C, typename std::enable_if::value && !std::is_same +template void gemm(const execution_space& space, const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::gemm: execution_space must be a valid Kokkos execution space"); + "KokkosBlas::gemm: execution_space must be a valid Kokkos " + "execution space"); static_assert(Kokkos::is_view::value, "KokkosBlas::gemm: AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -127,16 +130,26 @@ void gemm(const execution_space& space, const char transA[], "KokkosBlas::gemm: BViewType must have rank 2."); static_assert(static_cast(CViewType::rank) == 2, "KokkosBlas::gemm: CViewType must have rank 2."); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: AViewType must be accessible from execution_space"); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: BViewType must be accessible from execution_space"); - static_assert(Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::gemm: CViewType must be accessible from execution_space"); - static_assert(Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemm: CViewType must be assignable by AViewType"); - static_assert(Kokkos::SpaceAccessibility::assignable, - "KokkosBlas::gemm: CViewType 
must be assignable by BViewType"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: AViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: BViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::gemm: CViewType must be accessible from execution_space"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemm: CViewType must be assignable by AViewType"); + static_assert( + Kokkos::SpaceAccessibility::assignable, + "KokkosBlas::gemm: CViewType must be assignable by BViewType"); // Check validity of transpose argument bool valid_transA = (transA[0] == 'N') || (transA[0] == 'n') || @@ -235,7 +248,8 @@ void gemm(const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, const CViewType& C) { - gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, C); + gemm(typename CViewType::execution_space{}, transA, transB, alpha, A, B, beta, + C); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trmm.hpp b/blas/src/KokkosBlas3_trmm.hpp index 741c415a9d..eb45da0f90 100644 --- a/blas/src/KokkosBlas3_trmm.hpp +++ b/blas/src/KokkosBlas3_trmm.hpp @@ -30,12 +30,17 @@ namespace KokkosBlas { /// \brief Solve triangular linear system with multiple RHSs: /// B = alpha * op(A) * B if side == "L" or "l" /// B = alpha * B * op(A) if side == "R" or "r" +/// This function is currently blocking when running the native implementation +/// which only has a serial implementation. /// +/// \tparam execution_space a Kokkos execution space to run the kernels on. 
/// \tparam AViewType Input matrix, as a 2-D Kokkos::View /// \tparam BViewType Input(RHS)/Output(solution) M-by-N matrix, as a 2-D /// Kokkos::View /// -/// \param side [in] "L" or "l" indicates matrix A is on the left of B +/// \param space [in] an execution space instance that may contain a stream +/// or a queue to execute the kernel on, this only works with TPLs at the +/// moment. \param side [in] "L" or "l" indicates matrix A is on the left of B /// "R" or "r" indicates matrix A is on the right of B /// \param uplo [in] "U" or "u" indicates matrix A is an upper triangular /// matrix @@ -57,9 +62,10 @@ namespace KokkosBlas { /// On entry, M-by-N matrix /// On exit, overwritten with the solution template -void trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { +void trmm(const execution_space& space, const char side[], const char uplo[], + const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B) { static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -143,8 +149,9 @@ void trmm(const execution_space& space, const char side[], const char uplo[], co typename BViewType::device_type, Kokkos::MemoryTraits >; - KokkosBlas::Impl::TRMM::trmm( - space, side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRMM::trmm(space, side, uplo, trans, + diag, alpha, A, B); } /// \brief Solve triangular linear system with multiple RHSs: @@ -180,7 +187,8 @@ template void trmm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { - trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); + trmm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, 
A, + B); } } // namespace KokkosBlas diff --git a/blas/src/KokkosBlas3_trsm.hpp b/blas/src/KokkosBlas3_trsm.hpp index 42b5e7de7d..890b2ff6aa 100644 --- a/blas/src/KokkosBlas3_trsm.hpp +++ b/blas/src/KokkosBlas3_trsm.hpp @@ -39,8 +39,8 @@ namespace KokkosBlas { /// Kokkos::View /// /// \param space [in] an execution space instance that may contain a stream -/// or a queue to execute the kernel on, this only works with TPLs at the moment. -/// \param side [in] "L" or "l" indicates matrix A is on the left of X +/// or a queue to execute the kernel on, this only works with TPLs at the +/// moment. \param side [in] "L" or "l" indicates matrix A is on the left of X /// "R" or "r" indicates matrix A is on the right of X /// \param uplo [in] "U" or "u" indicates matrix A upper part is stored, the /// other part is not referenced @@ -60,9 +60,10 @@ namespace KokkosBlas { /// On entry, M-by-N matrix of multile RHS /// On exit, overwritten with the solution X template -void trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], - const char diag[], typename BViewType::const_value_type& alpha, - const AViewType& A, const BViewType& B) { +void trsm(const execution_space& space, const char side[], const char uplo[], + const char trans[], const char diag[], + typename BViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B) { static_assert(Kokkos::is_view::value, "AViewType must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -146,7 +147,8 @@ void trsm(const execution_space& space, const char side[], const char uplo[], co typename BViewType::device_type, Kokkos::MemoryTraits >; - KokkosBlas::Impl::TRSM::trsm(space, side, uplo, trans, diag, alpha, A, B); + KokkosBlas::Impl::TRSM::trsm( + space, side, uplo, trans, diag, alpha, A, B); } /// \brief Solve triangular linear system with multiple RHSs: @@ -180,7 +182,8 @@ template void trsm(const char side[], const char uplo[], const char trans[], const char 
diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) { - trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, B); + trsm(typename AViewType::execution_space{}, side, uplo, trans, diag, alpha, A, + B); } } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index d1e0cc26c6..66177e28a6 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -26,9 +26,8 @@ namespace Impl { #define KOKKOSBLAS3_XGEMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ LAYOUTB, LAYOUTC, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct GEMM< \ - ExecSpace, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ CViewType; \ \ - static void gemm(const ExecSpace& /* space*/, \ - const char transA[], const char transB[], \ + static void gemm(const ExecSpace& /* space*/, const char transA[], \ + const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -165,9 +164,8 @@ namespace Impl { LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM< \ - ExecSpace, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ CViewType; \ \ - static void gemm(const ExecSpace& space, \ - const char transA[], const char transB[], \ + static void gemm(const ExecSpace& space, const char transA[], \ + const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -368,9 +366,8 @@ namespace Impl { ROCBLAS_FN, LAYOUT, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM< \ - ExecSpace, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, rocblas_float_complex, \ rocblas_cgemm, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, 
- true) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) - -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) - -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) - -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - true) -KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, - false) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_DGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_SGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_ZGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) + 
+KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, false) +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, true) +KOKKOSBLAS3_CGEMM_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) } // namespace Impl } // namespace KokkosBlas diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index 9e40ba2e89..010b44a154 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -33,7 +33,7 @@ struct trmm_tpl_spec_avail { MEMSPACE) \ template \ struct trmm_tpl_spec_avail< \ - ExecSpace, \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp index 64036ca3dc..53c73f7416 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp @@ -27,9 +27,8 @@ namespace Impl { #define KOKKOSBLAS3_TRMM_BLAS(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, LAYOUTB, \ MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRMM< \ - ExecSpace, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View > \ BViewType; \ \ - static void trmm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trmm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trmm[TPL_BLAS," #SCALAR_TYPE \ @@ -171,9 +169,8 @@ namespace Impl { #define KOKKOSBLAS3_TRMM_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, \ LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRMM< \ - ExecSpace, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View(&alpha), \ - reinterpret_cast(A.data()), LDA,\ - 
reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), \ + LDA, reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(B.data()), LDB)); \ + CUBLAS_FN(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), \ + LDA, reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(B.data()), LDB)); \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp index fd8c2c31a0..ec36388094 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp @@ -43,9 +43,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,double]"); \ @@ -113,9 +112,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ 
Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_BLAS,float]"); \ @@ -165,9 +163,8 @@ namespace Impl { #define KOKKOSBLAS3_ZTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -184,9 +181,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -242,9 +238,8 @@ namespace Impl { #define KOKKOSBLAS3_CTRSM_BLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -261,9 +256,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& /*space*/, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& /*space*/, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -384,9 +378,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ 
Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,double]"); \ @@ -441,15 +434,15 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ + if (A_is_ll) { \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } else { \ + cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } else { \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } \ + cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ @@ -475,9 +468,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,float]"); \ @@ -532,15 +524,15 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ - A.data(), LDA, B.data(), LDB)); \ - } \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, side_, uplo_, trans_, diag_, 
M, N, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, \ + A.data(), LDA, B.data(), LDB)); \ + } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ @@ -549,9 +541,8 @@ namespace Impl { #define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -568,9 +559,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -626,19 +616,19 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ + s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm( \ + s.handle, side_, uplo_, trans_, diag_, N, M, \ + 
reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ @@ -647,9 +637,8 @@ namespace Impl { #define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRSM< \ - ExecSpace, \ - Kokkos::View**, LAYOUTA, \ + struct TRSM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View**, LAYOUTB, \ @@ -666,9 +655,8 @@ namespace Impl { Kokkos::MemoryTraits > \ BViewType; \ \ - static void trsm(const ExecSpace& space, const char side[], \ - const char uplo[], const char trans[], \ - const char diag[], \ + static void trsm(const ExecSpace& space, const char side[], \ + const char uplo[], const char trans[], const char diag[], \ typename BViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B) { \ Kokkos::Profiling::pushRegion( \ @@ -724,19 +712,19 @@ namespace Impl { KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } \ + if (A_is_ll) { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), 
LDB)); \ + } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ From 5d027ccecf0e13552e316177a908506a7a0736c4 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 2 May 2023 15:40:37 -0700 Subject: [PATCH 327/442] Add sptrsv_solve_streams for cuSPARSE < 11.3 --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 136 +++++++++--------- 1 file changed, 64 insertions(+), 72 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 00861727bc..5b294ad5bc 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -463,6 +463,7 @@ void sptrsvcuSPARSE_solve_streams( int nstreams = execspace_v.size(); #if (CUDA_VERSION >= 11030) + printf("CUSPARSE VERSION >= 11030\n"); (void)row_map_v; (void)entries_v; (void)values_v; @@ -516,78 +517,69 @@ void sptrsvcuSPARSE_solve_streams( } } #else // CUDA_VERSION < 11030 -// if (std::is_same::value) { -// cusparseStatus_t status; -// -// typename KernelHandle::SPTRSVcuSparseHandleType* h = -// sptrsv_handle->get_cuSparseHandle(); -// -// int nnz = entries.extent_int(0); -// -// const int* rm = !std::is_same::value -// ? 
sptrsv_handle->get_int_rowmap_ptr() -// : (const int*)row_map.data(); -// const int* ent = (const int*)entries.data(); -// const scalar_type* vals = values.data(); -// const scalar_type* bv = rhs.data(); -// scalar_type* xv = lhs.data(); -// -// if (std::is_same::value) { -// if (h->pBuffer == nullptr) { -// std::cout << " pBuffer invalid" << std::endl; -// } -// const double alpha = double(1); -// -// status = cusparseDcsrsv2_solve(h->handle, h->transpose, nrows, nnz, -// &alpha, h->descr, (double*)vals, (int*)rm, -// (int*)ent, h->info, (double*)bv, -// (double*)xv, h->policy, h->pBuffer); -// -// if (CUSPARSE_STATUS_SUCCESS != status) -// std::cout << "solve status error name " << (status) << std::endl; -// } else if (std::is_same::value) { -// if (h->pBuffer == nullptr) { -// std::cout << " pBuffer invalid" << std::endl; -// } -// const float alpha = float(1); -// -// status = cusparseScsrsv2_solve(h->handle, h->transpose, nrows, nnz, -// &alpha, h->descr, (float*)vals, (int*)rm, -// (int*)ent, h->info, (float*)bv, (float*)xv, -// h->policy, h->pBuffer); -// -// if (CUSPARSE_STATUS_SUCCESS != status) -// std::cout << "solve status error name " << (status) << std::endl; -// } else if (std::is_same >::value) { -// cuDoubleComplex cualpha; -// cualpha.x = 1.0; -// cualpha.y = 0.0; -// status = cusparseZcsrsv2_solve( -// h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, -// (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, -// (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h->policy, h->pBuffer); -// -// if (CUSPARSE_STATUS_SUCCESS != status) -// std::cout << "solve status error name " << (status) << std::endl; -// } else if (std::is_same >::value) { -// cuComplex cualpha; -// cualpha.x = 1.0; -// cualpha.y = 0.0; -// status = cusparseCcsrsv2_solve( -// h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, -// (cuComplex*)vals, (int*)rm, (int*)ent, h->info, (cuComplex*)bv, -// (cuComplex*)xv, h->policy, h->pBuffer); -// -// if (CUSPARSE_STATUS_SUCCESS != 
status) -// std::cout << "solve status error name " << (status) << std::endl; -// } else { -// throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); -// } -// -// } else { -// throw std::runtime_error( -// "CUSPARSE requires local ordinals to be integer.\n"); -// } + printf("CUSPARSE VERSION < 11030\n"); + if (!std::is_same::value) { + throw std::runtime_error( + "CUSPARSE requires local ordinals to be integer.\n"); + } + else { + cusparseStatus_t status; + std::vector h_v(nstreams); + + for (int i = 0; i < nstreams; i++) { + sptrsvHandleType *sptrsv_handle = handle_v[i].get_sptrsv_handle(); + h_v[i] = sptrsv_handle->get_cuSparseHandle(); + + int nnz = entries_v[i].extent_int(0); + int nrows = static_cast(sptrsv_handle->get_nrows()); + + const int* rm = !std::is_same::value + ? sptrsv_handle->get_int_rowmap_ptr() + : (const int*)row_map_v[i].data(); + const int* ent = (const int*)entries_v[i].data(); + const scalar_type* vals = values_v[i].data(); + const scalar_type* bv = rhs_v[i].data(); + scalar_type* xv = lhs_v[i].data(); + + if (h_v[i]->pBuffer == nullptr) { + std::cout << " pBuffer invalid on stream " << i << std::endl; + } + + if (std::is_same::value) { + const double alpha = double(1); + + status = cusparseDcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, &alpha, h_v[i]->descr, (double*)vals, (int*)rm, (int*)ent, h_v[i]->info, (double*)bv, (double*)xv, h_v[i]->policy, h_v[i]->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + } else if (std::is_same::value) { + const float alpha = float(1); + + status = cusparseScsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, &alpha, h_v[i]->descr, (float*)vals, (int*)rm, (int*)ent, h_v[i]->info, (float*)bv, (float*)xv, h_v[i]->policy, h_v[i]->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + } 
else if (std::is_same >::value) { + cuDoubleComplex cualpha; + cualpha.x = 1.0; + cualpha.y = 0.0; + status = cusparseZcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, &cualpha, h_v[i]->descr, (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h_v[i]->info, (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h_v[i]->policy, h_v[i]->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + } else if (std::is_same >::value) { + cuComplex cualpha; + cualpha.x = 1.0; + cualpha.y = 0.0; + status = cusparseCcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, &cualpha, h_v[i]->descr, (cuComplex*)vals, (int*)rm, (int*)ent, h_v[i]->info, (cuComplex*)bv, (cuComplex*)xv, h_v[i]->policy, h_v[i]->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + } else { + throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); + } + } + } #endif #else (void)execspace_v; From 2f78417b7016cd634578a7b57b128a7260ad84ac Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 3 May 2023 09:57:17 -0700 Subject: [PATCH 328/442] Some changes in sptrsv_solve_streams for cuSPARSE < 11.3 --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 67 +++++++------------ 1 file changed, 25 insertions(+), 42 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 5b294ad5bc..eb150e2949 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -463,7 +463,6 @@ void sptrsvcuSPARSE_solve_streams( int nstreams = execspace_v.size(); #if (CUDA_VERSION >= 11030) - printf("CUSPARSE VERSION >= 11030\n"); (void)row_map_v; (void)entries_v; (void)values_v; @@ -517,64 +516,48 @@ void sptrsvcuSPARSE_solve_streams( } } #else // CUDA_VERSION < 11030 - printf("CUSPARSE VERSION < 11030\n"); if 
(!std::is_same::value) { throw std::runtime_error( "CUSPARSE requires local ordinals to be integer.\n"); - } + } else { - cusparseStatus_t status; + const scalar_type alpha = scalar_type(1.0); + std::vector sptrsv_handle_v(nstreams); std::vector h_v(nstreams); + std::vector rm_v(nstreams); + std::vector ent_v(nstreams); + std::vector vals_v(nstreams); + std::vector bv_v(nstreams); + std::vector xv_v(nstreams); for (int i = 0; i < nstreams; i++) { - sptrsvHandleType *sptrsv_handle = handle_v[i].get_sptrsv_handle(); - h_v[i] = sptrsv_handle->get_cuSparseHandle(); - - int nnz = entries_v[i].extent_int(0); - int nrows = static_cast(sptrsv_handle->get_nrows()); + sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); + h_v[i] = sptrsv_handle_v[i]->get_cuSparseHandle(); - const int* rm = !std::is_same::value - ? sptrsv_handle->get_int_rowmap_ptr() - : (const int*)row_map_v[i].data(); - const int* ent = (const int*)entries_v[i].data(); - const scalar_type* vals = values_v[i].data(); - const scalar_type* bv = rhs_v[i].data(); - scalar_type* xv = lhs_v[i].data(); + // Bind cuspare handle to a stream + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); if (h_v[i]->pBuffer == nullptr) { std::cout << " pBuffer invalid on stream " << i << std::endl; } + rm_v[i] = !std::is_same::value ? 
sptrsv_handle_v[i]->get_int_rowmap_ptr() : reinterpret_cast(row_map_v[i].data()); + ent_v[i]= reinterpret_cast(entries_v[i].data()); + vals_v[i] = values_v[i].data(); + bv_v[i] = rhs_v[i].data(); + xv_v[i] = lhs_v[i].data(); + } + for (int i = 0; i < nstreams; i++) { + int nnz = entries_v[i].extent_int(0); + int nrows = static_cast(sptrsv_handle_v[i]->get_nrows()); if (std::is_same::value) { - const double alpha = double(1); - - status = cusparseDcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, &alpha, h_v[i]->descr, (double*)vals, (int*)rm, (int*)ent, h_v[i]->info, (double*)bv, (double*)xv, h_v[i]->policy, h_v[i]->pBuffer); - - if (CUSPARSE_STATUS_SUCCESS != status) - std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same::value) { - const float alpha = float(1); - - status = cusparseScsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, &alpha, h_v[i]->descr, (float*)vals, (int*)rm, (int*)ent, h_v[i]->info, (float*)bv, (float*)xv, h_v[i]->policy, h_v[i]->pBuffer); - - if (CUSPARSE_STATUS_SUCCESS != status) - std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { - cuDoubleComplex cualpha; - cualpha.x = 1.0; - cualpha.y = 0.0; - status = cusparseZcsrsv2_solve(h_v[i]->handle, 
h_v[i]->transpose, nrows, nnz, &cualpha, h_v[i]->descr, (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h_v[i]->info, (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h_v[i]->policy, h_v[i]->pBuffer); - - if (CUSPARSE_STATUS_SUCCESS != status) - std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { - cuComplex cualpha; - cualpha.x = 1.0; - cualpha.y = 0.0; - status = cusparseCcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, &cualpha, h_v[i]->descr, (cuComplex*)vals, (int*)rm, (int*)ent, h_v[i]->info, (cuComplex*)bv, (cuComplex*)xv, h_v[i]->policy, h_v[i]->pBuffer); - - if (CUSPARSE_STATUS_SUCCESS != status) - std::cout << "solve status error name " << (status) << " on stream " << i << std::endl; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else { throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); } From 4fc4831fbefb5164d03c6ca95976988cdd1f1d43 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 28 Mar 2023 10:57:15 -0600 Subject: [PATCH 329/442] Update changelog --- CHANGELOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa19491753..db56ed4f1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,23 @@ # Change Log +## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-28-03) +[Full 
Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.0...4.0.01) + +### Bug Fixes: +- Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES [\#1667](https://github.com/kokkos/kokkos-kernels/pull/1667) +- Introduce KOKKOSKERNELS_ALL_COMPONENTS_ENABLED variable [\#1691](https://github.com/kokkos/kokkos-kernels/pull/1691) +- Kokkos Kernels version: need to use upper case variables [\#1707](https://github.com/kokkos/kokkos-kernels/pull/1707) +- GMRES: fixing some type issues related to memory space instantiation (partial) [\#1719](https://github.com/kokkos/kokkos-kernels/pull/1719) +- CUSPARSE_MM_ALG_DEFAULT deprecated by cuSparse 11.1 [\#1698](https://github.com/kokkos/kokkos-kernels/pull/1698) +- blas1: Fix a couple documentation typos [\#1704](https://github.com/kokkos/kokkos-kernels/pull/1704) +- CUDA 11.4: fixing some -Werror [\#1727](https://github.com/kokkos/kokkos-kernels/pull/1727) +- Remove unused variable in KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp [\#1734](https://github.com/kokkos/kokkos-kernels/pull/1734) +- ParIlut: create and destroy spgemm handle for each usage [\#1736](https://github.com/kokkos/kokkos-kernels/pull/1736) +- Reduce BatchedGemm test coverage time [\#1737](https://github.com/kokkos/kokkos-kernels/pull/1737) +- Fix kk_generate_diagonally_dominant_sparse_matrix hang [\#1689](https://github.com/kokkos/kokkos-kernels/pull/1689) +- Temporary spgemm workaround matching Trilinos 11663 [\#1757](https://github.com/kokkos/kokkos-kernels/pull/1757) + + ## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-21-02) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.0) From 1ae83cf166daef83b1be39278e4c442697c0cb56 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 4 Apr 2023 13:10:48 -0600 Subject: [PATCH 330/442] Update changelog --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db56ed4f1c..ca5aaf046e 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Change Log -## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-28-03) +## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-03-04) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.0...4.0.01) ### Bug Fixes: @@ -16,6 +16,9 @@ - Reduce BatchedGemm test coverage time [\#1737](https://github.com/kokkos/kokkos-kernels/pull/1737) - Fix kk_generate_diagonally_dominant_sparse_matrix hang [\#1689](https://github.com/kokkos/kokkos-kernels/pull/1689) - Temporary spgemm workaround matching Trilinos 11663 [\#1757](https://github.com/kokkos/kokkos-kernels/pull/1757) +- MDF: Minor changes to interface for ifpack2 impl [\#1759](https://github.com/kokkos/kokkos-kernels/pull/1759) +- Rocm TPL support upgrade [\#1763](https://github.com/kokkos/kokkos-kernels/pull/1763) +- Fix BLAS cmake check for complex types [\#1762](https://github.com/kokkos/kokkos-kernels/pull/1762) ## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-21-02) From 415deb091758ce00a7edcd370cd5d71ddcf0ba5c Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 20 Apr 2023 12:23:15 -0600 Subject: [PATCH 331/442] Update changelog Add and reorder parilut entries Fix broken 4.0.0 changelog url --- CHANGELOG.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca5aaf046e..d582fc354f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,28 +1,31 @@ # Change Log -## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-03-04) -[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.0...4.0.01) +## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-04-19) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.00...4.0.01) ### Bug Fixes: - Use the options ENABLE_PERFTEST, ENABLE_EXAMPLES [\#1667](https://github.com/kokkos/kokkos-kernels/pull/1667) - Introduce 
KOKKOSKERNELS_ALL_COMPONENTS_ENABLED variable [\#1691](https://github.com/kokkos/kokkos-kernels/pull/1691) - Kokkos Kernels version: need to use upper case variables [\#1707](https://github.com/kokkos/kokkos-kernels/pull/1707) -- GMRES: fixing some type issues related to memory space instantiation (partial) [\#1719](https://github.com/kokkos/kokkos-kernels/pull/1719) - CUSPARSE_MM_ALG_DEFAULT deprecated by cuSparse 11.1 [\#1698](https://github.com/kokkos/kokkos-kernels/pull/1698) - blas1: Fix a couple documentation typos [\#1704](https://github.com/kokkos/kokkos-kernels/pull/1704) - CUDA 11.4: fixing some -Werror [\#1727](https://github.com/kokkos/kokkos-kernels/pull/1727) - Remove unused variable in KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp [\#1734](https://github.com/kokkos/kokkos-kernels/pull/1734) -- ParIlut: create and destroy spgemm handle for each usage [\#1736](https://github.com/kokkos/kokkos-kernels/pull/1736) - Reduce BatchedGemm test coverage time [\#1737](https://github.com/kokkos/kokkos-kernels/pull/1737) - Fix kk_generate_diagonally_dominant_sparse_matrix hang [\#1689](https://github.com/kokkos/kokkos-kernels/pull/1689) - Temporary spgemm workaround matching Trilinos 11663 [\#1757](https://github.com/kokkos/kokkos-kernels/pull/1757) - MDF: Minor changes to interface for ifpack2 impl [\#1759](https://github.com/kokkos/kokkos-kernels/pull/1759) - Rocm TPL support upgrade [\#1763](https://github.com/kokkos/kokkos-kernels/pull/1763) - Fix BLAS cmake check for complex types [\#1762](https://github.com/kokkos/kokkos-kernels/pull/1762) +- ParIlut: Adds a better parilut test with gmres [\#1661](https://github.com/kokkos/kokkos-kernels/pull/1661) +- GMRES: fixing some type issues related to memory space instantiation (partial) [\#1719](https://github.com/kokkos/kokkos-kernels/pull/1719) +- ParIlut: create and destroy spgemm handle for each usage [\#1736](https://github.com/kokkos/kokkos-kernels/pull/1736) +- ParIlut: remove par ilut limitations 
[\#1755](https://github.com/kokkos/kokkos-kernels/pull/1755) +- ParIlut: make Ut_values view atomic in compute_l_u_factors [\#1781](https://github.com/kokkos/kokkos-kernels/pull/1781) -## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.0) (2023-21-02) -[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.0) +## [4.0.0](https://github.com/kokkos/kokkos-kernels/tree/4.0.00) (2023-21-02) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.7.01...4.0.00) ### Features: - Copyright update 4.0 [\#1657](https://github.com/kokkos/kokkos-kernels/pull/1657) From 166716a879e1a688231ef1e749f6074b4e2ecab9 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 3 May 2023 13:27:21 -0700 Subject: [PATCH 332/442] No need to fence after each level --- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 93b6ad6844..2680dd34c5 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2962,6 +2962,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, Kokkos::Timer sptrsv_timer; sptrsv_timer.reset(); #endif + for (size_type lvl = 0; lvl < nlevels; ++lvl) { { size_type lvl_nodes = hnodes_per_level(lvl); @@ -3252,6 +3253,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } // scope for if-block } // end for lvl + #ifdef profile_supernodal_etree Kokkos::fence(); double sptrsv_time_seconds = sptrsv_timer.seconds(); @@ -4085,9 +4087,10 @@ void lower_tri_solve_streams(const std::vector &execspace_v, } // end for streams // 2. 
Wait for all streams finished - for (int i = 0; i < nstreams; i++) { - execspace_v[i].fence(); - } // end for streams + // note: not needed here unlike in the spiluk case + //for (int i = 0; i < nstreams; i++) { + // execspace_v[i].fence(); + //} // end for streams } // end for lvl } // end lower_tri_solve_streams @@ -4152,9 +4155,10 @@ void upper_tri_solve_streams(const std::vector &execspace_v, } // end for streams // 2. Wait for all streams finished - for (int i = 0; i < nstreams; i++) { - execspace_v[i].fence(); - } // end for streams + // note: not needed here unlike in the spiluk case + //for (int i = 0; i < nstreams; i++) { + // execspace_v[i].fence(); + //} // end for streams } // end for lvl } // end upper_tri_solve_streams From 28254863f73cc8e25e1f5b574bba2aa9df15c67d Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Wed, 3 May 2023 14:45:41 -0600 Subject: [PATCH 333/442] Apply clang format --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 127 ++++++++----- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 117 ++++++++---- .../impl/KokkosSparse_sptrsv_solve_spec.hpp | 31 ++-- sparse/src/KokkosSparse_sptrsv.hpp | 169 ++++++++++++------ 4 files changed, 302 insertions(+), 142 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index eb150e2949..a1a1c85e35 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -25,7 +25,7 @@ namespace Impl { template -void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, +void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -58,28 +58,28 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); - typename KernelHandle::SPTRSVcuSparseHandleType* h = + 
typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); int64_t nnz = static_cast(entries.extent(0)); size_t pBufferSize; - void* rm; + void *rm; // NOTE (Oct-29-2022): // cusparseCreateCsr only supports the same sizes (either 32 bits or 64 // bits) for row_map_type and entries_type if (std::is_same::value) { if (!std::is_same::value) { sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); - rm = (void*)sptrsv_handle->get_int_rowmap_ptr_copy(row_map); + rm = (void *)sptrsv_handle->get_int_rowmap_ptr_copy(row_map); } else { - rm = (void*)row_map.data(); + rm = (void *)row_map.data(); } } else { // idx_type has 64 bits if (!std::is_same::value) { sptrsv_handle->allocate_tmp_int64_rowmap(row_map.extent(0)); - rm = (void*)sptrsv_handle->get_int64_rowmap_ptr_copy(row_map); + rm = (void *)sptrsv_handle->get_int64_rowmap_ptr_copy(row_map); } else { - rm = (void*)row_map.data(); + rm = (void *)row_map.data(); } } const scalar_type alpha = scalar_type(1.0); @@ -93,8 +93,8 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, // Create sparse matrix in CSR format KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( &(h->matDescr), static_cast(nrows), - static_cast(nrows), nnz, rm, (void*)entries.data(), - (void*)values.data(), cudaCsrRowMapType, cudaCsrColIndType, + static_cast(nrows), nnz, rm, (void *)entries.data(), + (void *)values.data(), cudaCsrRowMapType, cudaCsrColIndType, CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); // Create dummy dense vector B (RHS) @@ -132,7 +132,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle* sptrsv_handle, h->spsvDescr, &pBufferSize)); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. 
- KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void**)&(h->pBuffer), pBufferSize)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc((void **)&(h->pBuffer), pBufferSize)); // Run analysis KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_analysis( @@ -284,7 +284,7 @@ template < typename KernelHandle, typename ain_row_index_view_type, typename ain_nonzero_index_view_type, typename ain_values_scalar_view_type, typename b_values_scalar_view_type, typename x_values_scalar_view_type> -void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, +void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -320,7 +320,7 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, "CUSPARSE requires local ordinals to be integer (32 bits or 64 " "bits).\n"); } else { - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); const scalar_type alpha = scalar_type(1.0); @@ -330,12 +330,12 @@ void sptrsvcuSPARSE_solve(KernelHandle* sptrsv_handle, // Create dense vector B (RHS) KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecBDescr), static_cast(nrows), - (void*)rhs.data(), cudaValueType)); + (void *)rhs.data(), cudaValueType)); // Create dense vector X (LHS) KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecXDescr), static_cast(nrows), - (void*)lhs.data(), cudaValueType)); + (void *)lhs.data(), cudaValueType)); // Solve KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve( @@ -454,12 +454,13 @@ void sptrsvcuSPARSE_solve_streams( std::vector &lhs_v, bool /*trans*/ ) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - using idx_type = typename KernelHandle::nnz_lno_t; - using size_type = typename KernelHandle::size_type; - using scalar_type = typename KernelHandle::nnz_scalar_t; - using memory_space = typename KernelHandle::HandlePersistentMemorySpace; - using sptrsvHandleType = typename KernelHandle::SPTRSVHandleType; - 
using sptrsvCuSparseHandleType = typename sptrsvHandleType::SPTRSVcuSparseHandleType; + using idx_type = typename KernelHandle::nnz_lno_t; + using size_type = typename KernelHandle::size_type; + using scalar_type = typename KernelHandle::nnz_scalar_t; + using memory_space = typename KernelHandle::HandlePersistentMemorySpace; + using sptrsvHandleType = typename KernelHandle::SPTRSVHandleType; + using sptrsvCuSparseHandleType = + typename sptrsvHandleType::SPTRSVcuSparseHandleType; int nstreams = execspace_v.size(); #if (CUDA_VERSION >= 11030) @@ -477,36 +478,44 @@ void sptrsvcuSPARSE_solve_streams( if (!is_cuda_space) { throw std::runtime_error( - "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED IN GPU DEVICE for CUSPARSE\n"); + "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED " + "IN GPU DEVICE for CUSPARSE\n"); } else if (!is_idx_type_supported) { throw std::runtime_error( - "CUSPARSE requires local ordinals to be integer (32 bits or 64 bits).\n"); + "CUSPARSE requires local ordinals to be integer (32 bits or 64 " + "bits).\n"); } else { const scalar_type alpha = scalar_type(1.0); cudaDataType cudaValueType = cuda_data_type_from(); - + std::vector h_v(nstreams); - + for (int i = 0; i < nstreams; i++) { sptrsvHandleType *sptrsv_handle = handle_v[i].get_sptrsv_handle(); - h_v[i] = sptrsv_handle->get_cuSparseHandle(); + h_v[i] = sptrsv_handle->get_cuSparseHandle(); // Bind cuspare handle to a stream - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); int64_t nrows = static_cast(sptrsv_handle->get_nrows()); // Create dense vector B (RHS) - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&(h_v[i]->vecBDescr), nrows, (void*)rhs_v[i].data(), cudaValueType)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( + &(h_v[i]->vecBDescr), nrows, (void *)rhs_v[i].data(), cudaValueType)); // Create dense 
vector X (LHS) - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec(&(h_v[i]->vecXDescr), nrows, (void*)lhs_v[i].data(), cudaValueType)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( + &(h_v[i]->vecXDescr), nrows, (void *)lhs_v[i].data(), cudaValueType)); } // Solve for (int i = 0; i < nstreams; i++) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve(h_v[i]->handle, h_v[i]->transpose, &alpha, h_v[i]->matDescr, h_v[i]->vecBDescr, h_v[i]->vecXDescr, cudaValueType, CUSPARSE_SPSV_ALG_DEFAULT, h_v[i]->spsvDescr)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpSV_solve( + h_v[i]->handle, h_v[i]->transpose, &alpha, h_v[i]->matDescr, + h_v[i]->vecBDescr, h_v[i]->vecXDescr, cudaValueType, + CUSPARSE_SPSV_ALG_DEFAULT, h_v[i]->spsvDescr)); } // Destroy dense vector descriptors @@ -519,11 +528,10 @@ void sptrsvcuSPARSE_solve_streams( if (!std::is_same::value) { throw std::runtime_error( "CUSPARSE requires local ordinals to be integer.\n"); - } - else { + } else { const scalar_type alpha = scalar_type(1.0); - std::vector sptrsv_handle_v(nstreams); - std::vector h_v(nstreams); + std::vector sptrsv_handle_v(nstreams); + std::vector h_v(nstreams); std::vector rm_v(nstreams); std::vector ent_v(nstreams); std::vector vals_v(nstreams); @@ -532,32 +540,67 @@ void sptrsvcuSPARSE_solve_streams( for (int i = 0; i < nstreams; i++) { sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); - h_v[i] = sptrsv_handle_v[i]->get_cuSparseHandle(); + h_v[i] = sptrsv_handle_v[i]->get_cuSparseHandle(); // Bind cuspare handle to a stream - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h_v[i]->handle, execspace_v[i].cuda_stream())); if (h_v[i]->pBuffer == nullptr) { std::cout << " pBuffer invalid on stream " << i << std::endl; } - rm_v[i] = !std::is_same::value ? 
sptrsv_handle_v[i]->get_int_rowmap_ptr() : reinterpret_cast(row_map_v[i].data()); - ent_v[i]= reinterpret_cast(entries_v[i].data()); + rm_v[i] = !std::is_same::value + ? sptrsv_handle_v[i]->get_int_rowmap_ptr() + : reinterpret_cast(row_map_v[i].data()); + ent_v[i] = reinterpret_cast(entries_v[i].data()); vals_v[i] = values_v[i].data(); bv_v[i] = rhs_v[i].data(); xv_v[i] = lhs_v[i].data(); } for (int i = 0; i < nstreams; i++) { - int nnz = entries_v[i].extent_int(0); + int nnz = entries_v[i].extent_int(0); int nrows = static_cast(sptrsv_handle_v[i]->get_nrows()); if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); } else if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); } else if (std::is_same >::value) { 
- KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); } else if (std::is_same >::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve(h_v[i]->handle, h_v[i]->transpose, nrows, nnz, reinterpret_cast(&alpha), h_v[i]->descr, reinterpret_cast(vals_v[i]), reinterpret_cast(rm_v[i]), reinterpret_cast(ent_v[i]), h_v[i]->info, reinterpret_cast(bv_v[i]), reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve( + h_v[i]->handle, h_v[i]->transpose, nrows, nnz, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, + h_v[i]->pBuffer)); } else { throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); } diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 2680dd34c5..4bed672f61 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -4021,7 +4021,6 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, } // end tri_solve_chain - // -------------------------------- // Stream interfaces // -------------------------------- @@ -4034,11 +4033,12 @@ void lower_tri_solve_streams(const std::vector 
&execspace_v, const std::vector &entries_v, const std::vector &values_v, const std::vector &rhs_v, - std::vector &lhs_v) { + std::vector &lhs_v) { // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using size_type = typename TriSolveHandle::size_type; using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; // Create vectors for handles' data in streams @@ -4051,14 +4051,14 @@ void lower_tri_solve_streams(const std::vector &execspace_v, // Retrieve data from handles and find max. number of levels among streams size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; + node_count_v[i] = 0; if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; } - // Main loop must be performed sequential + // Main loop must be performed sequential for (size_type lvl = 0; lvl < nlevels_max; lvl++) { // 1. 
Launch work on all streams for (int i = 0; i < nstreams; i++) { @@ -4066,29 +4066,52 @@ void lower_tri_solve_streams(const std::vector &execspace_v, if (lvl < nlevels_v[i]) { size_type lvl_nodes = hnodes_per_level_v[i](lvl); if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), LowerTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::RangePolicy( + execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + LowerTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + int team_size = thandle_v[i]->get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], true, node_count_v[i]); + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], true, + node_count_v[i]); #else - LowerTriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + LowerTriLvlSchedTP1SolverFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], 
nodes_grouped_by_level_v[i], node_count_v[i]); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); else - Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, team_size), tstf); + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, team_size), tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams + } // end if (lvl < nlevels_v[i]) + } // end for streams // 2. Wait for all streams finished // note: not needed here unlike in the spiluk case - //for (int i = 0; i < nstreams; i++) { + // for (int i = 0; i < nstreams; i++) { // execspace_v[i].fence(); //} // end for streams } // end for lvl @@ -4102,11 +4125,12 @@ void upper_tri_solve_streams(const std::vector &execspace_v, const std::vector &entries_v, const std::vector &values_v, const std::vector &rhs_v, - std::vector &lhs_v) { + std::vector &lhs_v) { // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment using size_type = typename TriSolveHandle::size_type; using NGBLType = typename TriSolveHandle::nnz_lno_view_t; - using nodes_per_level_type = typename TriSolveHandle::hostspace_nnz_lno_view_t; + using nodes_per_level_type = + typename TriSolveHandle::hostspace_nnz_lno_view_t; using nodes_grouped_by_level_type = typename TriSolveHandle::nnz_lno_view_t; // Create vectors for handles' data in streams @@ -4119,14 +4143,14 @@ void upper_tri_solve_streams(const std::vector &execspace_v, // Retrieve data from handles and find max. 
number of levels among streams size_type nlevels_max = 0; for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); + nlevels_v[i] = thandle_v[i]->get_num_levels(); + hnodes_per_level_v[i] = thandle_v[i]->get_host_nodes_per_level(); nodes_grouped_by_level_v[i] = thandle_v[i]->get_nodes_grouped_by_level(); - node_count_v[i] = 0; + node_count_v[i] = 0; if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; } - // Main loop must be performed sequential + // Main loop must be performed sequential for (size_type lvl = 0; lvl < nlevels_max; lvl++) { // 1. Launch work on all streams for (int i = 0; i < nstreams; i++) { @@ -4134,29 +4158,52 @@ void upper_tri_solve_streams(const std::vector &execspace_v, if (lvl < nlevels_v[i]) { size_type lvl_nodes = hnodes_per_level_v[i](lvl); if (lvl_nodes != 0) { - if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for("parfor_fixed_lvl", Kokkos::RangePolicy(execspace_v[i], node_count_v[i], node_count_v[i] + lvl_nodes), UpperTriLvlSchedRPSolverFunctor(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i])); - } else if (thandle_v[i]->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { + if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::RangePolicy( + execspace_v[i], node_count_v[i], + node_count_v[i] + lvl_nodes), + UpperTriLvlSchedRPSolverFunctor( + row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], + nodes_grouped_by_level_v[i])); + } else if (thandle_v[i]->get_algorithm() == + KokkosSparse::Experimental::SPTRSVAlgorithm:: + SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; - int team_size = thandle_v[i]->get_team_size(); + int team_size = thandle_v[i]->get_team_size(); #ifdef 
KOKKOSKERNELS_SPTRSV_TRILVLSCHED - TriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], false, node_count_v[i]); + TriLvlSchedTP1SolverFunctor + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], false, + node_count_v[i]); #else - UpperTriLvlSchedTP1SolverFunctor tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); + UpperTriLvlSchedTP1SolverFunctor< + RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> + tstf(row_map_v[i], entries_v[i], values_v[i], lhs_v[i], + rhs_v[i], nodes_grouped_by_level_v[i], node_count_v[i]); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, Kokkos::AUTO), tstf); else - Kokkos::parallel_for("parfor_l_team", policy_type(execspace_v[i], lvl_nodes, team_size), tstf); + Kokkos::parallel_for( + "parfor_l_team", + policy_type(execspace_v[i], lvl_nodes, team_size), tstf); } node_count_v[i] += lvl_nodes; } // end if (lvl_nodes != 0) - } // end if (lvl < nlevels_v[i]) - } // end for streams + } // end if (lvl < nlevels_v[i]) + } // end for streams // 2. 
Wait for all streams finished // note: not needed here unlike in the spiluk case - //for (int i = 0; i < nstreams; i++) { + // for (int i = 0; i < nstreams; i++) { // execspace_v[i].fence(); //} // end for streams } // end for lvl diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 0004c565ce..a575557034 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -90,11 +90,11 @@ namespace Impl { template ::value, + ExecutionSpace, KernelHandle, RowMapType, EntriesType, ValuesType, + BType, XType>::value, bool eti_spec_avail = sptrsv_solve_eti_spec_avail< - ExecutionSpace, KernelHandle, RowMapType, EntriesType, - ValuesType, BType, XType>::value> + ExecutionSpace, KernelHandle, RowMapType, EntriesType, ValuesType, + BType, XType>::value> struct SPTRSV_SOLVE { static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, const EntriesType entries, const ValuesType values, @@ -105,8 +105,7 @@ struct SPTRSV_SOLVE { std::vector &handle_v, const std::vector &row_map_v, const std::vector &entries_v, - const std::vector &values_v, - const std::vector &b_v, + const std::vector &values_v, const std::vector &b_v, std::vector &x_v); }; @@ -173,13 +172,13 @@ struct SPTRSV_SOLVE &handle_v, const std::vector &row_map_v, const std::vector &entries_v, - const std::vector &values_v, - const std::vector &b_v, + const std::vector &values_v, const std::vector &b_v, std::vector &x_v) { // Call specific algorithm type // NOTE: Only support SEQLVLSCHD_RP and SEQLVLSCHD_TP1 at this moment // Assume streams have the same either lower or upper matrix type - std::vector sptrsv_handle_v(execspace_v.size()); + std::vector sptrsv_handle_v( + execspace_v.size()); for (int i = 0; i < static_cast(execspace_v.size()); i++) { sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); } @@ -189,17 +188,23 @@ struct SPTRSV_SOLVEis_lower_tri()) { for (int i = 0; i < 
static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); + Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + entries_v[i]); } } - Experimental::lower_tri_solve_streams(execspace_v, sptrsv_handle_v, row_map_v, entries_v, values_v, b_v, x_v); + Experimental::lower_tri_solve_streams(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, b_v, + x_v); } else { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); + Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + entries_v[i]); } } - Experimental::upper_tri_solve_streams(execspace_v, sptrsv_handle_v, row_map_v, entries_v, values_v, b_v, x_v); + Experimental::upper_tri_solve_streams(execspace_v, sptrsv_handle_v, + row_map_v, entries_v, values_v, b_v, + x_v); } Kokkos::Profiling::popRegion(); } diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 6bc2c04678..fe227ac9b7 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -312,11 +312,10 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, } else { KokkosSparse::Impl::SPTRSV_SOLVE< - typename scalar_nnz_view_t_::execution_space, - const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, - BType_Internal, XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, - entries_i, values_i, b_i, - x_i); + typename scalar_nnz_view_t_::execution_space, const_handle_type, + RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, entries_i, + values_i, b_i, x_i); } } // sptrsv_solve @@ -370,39 +369,95 @@ void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, } #endif 
-template -void sptrsv_solve_streams(const std::vector& execspace_v, - const std::vector& handle_v, - const std::vector& rowmap_v, - const std::vector& entries_v, - const std::vector& values_v, - const std::vector& b_v, - std::vector& x_v) { - using size_type = typename KernelHandle::size_type; +template +void sptrsv_solve_streams(const std::vector &execspace_v, + const std::vector &handle_v, + const std::vector &rowmap_v, + const std::vector &entries_v, + const std::vector &values_v, + const std::vector &b_v, + std::vector &x_v) { + using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - using scalar_type = typename KernelHandle::nnz_scalar_t; - - static_assert(Kokkos::is_execution_space::value, "ExecutionSpace is not valid"); - static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in lno_row_view_t_"); - static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in lno_nnz_view_t_"); - static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in scalar_nnz_view_t_"); - static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in BType"); - static_assert(Kokkos::SpaceAccessibility::accessible, "sptrsv_solve_streams: ExecutionSpace cannot access data in XType"); - - static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_row_view_t_::non_const_value_type, size_type), "sptrsv_solve_streams: A size_type must match KernelHandle size_type (const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_nnz_view_t_::non_const_value_type, ordinal_type), "sptrsv_solve_streams: A entry type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)"); - static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename scalar_nnz_view_t_::value_type, scalar_type), 
"sptrsv_solve_streams: A scalar type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)"); - - static_assert(Kokkos::is_view::value, "sptrsv_solve_streams: b is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, "sptrsv_solve_streams: x is not a Kokkos::View."); - static_assert((int)BType::rank == (int)XType::rank, "sptrsv_solve_streams: The ranks of b and x do not match."); - static_assert(BType::rank == 1, "sptrsv_solve_streams: b and x must both either have rank 1."); - static_assert(std::is_same::value, "sptrsv_solve_streams: The output x must be nonconst."); - static_assert(std::is_same::value, "sptrsv_solve_streams: Views BType and XType have different device_types."); - static_assert(std::is_same::value, "sptrsv_solve_streams: KernelHandle's execution space is different from ExecutionSpace."); - static_assert(std::is_same::value, "sptrsv_solve_streams: KernelHandle and Views have different execution spaces."); - static_assert(std::is_same::value, "sptrsv_solve_streams: rowmap and entries have different device types."); - static_assert(std::is_same::value, "sptrsv_solve_streams: rowmap and values have different device types."); + using scalar_type = typename KernelHandle::nnz_scalar_t; + + static_assert(Kokkos::is_execution_space::value, + "ExecutionSpace is not valid"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename lno_row_view_t_::memory_space>::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in " + "lno_row_view_t_"); + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename lno_nnz_view_t_::memory_space>::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in " + "lno_nnz_view_t_"); + static_assert(Kokkos::SpaceAccessibility< + ExecutionSpace, + typename scalar_nnz_view_t_::memory_space>::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in " + "scalar_nnz_view_t_"); + static_assert( + 
Kokkos::SpaceAccessibility::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in BType"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "sptrsv_solve_streams: ExecutionSpace cannot access data in XType"); + + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( + typename lno_row_view_t_::non_const_value_type, size_type), + "sptrsv_solve_streams: A size_type must match KernelHandle " + "size_type (const doesn't matter)"); + static_assert( + KOKKOSKERNELS_SPTRSV_SAME_TYPE( + typename lno_nnz_view_t_::non_const_value_type, ordinal_type), + "sptrsv_solve_streams: A entry type must match KernelHandle entry type " + "(aka nnz_lno_t, and const doesn't matter)"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( + typename scalar_nnz_view_t_::value_type, scalar_type), + "sptrsv_solve_streams: A scalar type must match KernelHandle " + "entry type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert(Kokkos::is_view::value, + "sptrsv_solve_streams: b is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "sptrsv_solve_streams: x is not a Kokkos::View."); + static_assert((int)BType::rank == (int)XType::rank, + "sptrsv_solve_streams: The ranks of b and x do not match."); + static_assert(BType::rank == 1, + "sptrsv_solve_streams: b and x must both either have rank 1."); + static_assert(std::is_same::value, + "sptrsv_solve_streams: The output x must be nonconst."); + static_assert(std::is_same::value, + "sptrsv_solve_streams: Views BType and XType have different " + "device_types."); + static_assert( + std::is_same< + ExecutionSpace, + typename KernelHandle::SPTRSVHandleType::execution_space>::value, + "sptrsv_solve_streams: KernelHandle's execution space is different from " + "ExecutionSpace."); + static_assert( + std::is_same< + typename BType::device_type::execution_space, + typename KernelHandle::SPTRSVHandleType::execution_space>::value, + "sptrsv_solve_streams: KernelHandle and Views have different execution " + "spaces."); 
+ static_assert( + std::is_same::value, + "sptrsv_solve_streams: rowmap and entries have different device types."); + static_assert( + std::is_same::value, + "sptrsv_solve_streams: rowmap and values have different device types."); // Check sizes of vectors if (execspace_v.size() != handle_v.size()) { @@ -460,23 +515,28 @@ void sptrsv_solve_streams(const std::vector& execspace_v, using c_temp_t = typename KernelHandle::HandleTempMemorySpace; using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; - using const_handle_type = typename KokkosKernels::Experimental::KokkosKernelsHandle; + using const_handle_type = + typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, c_exec_t, c_temp_t, c_persist_t>; using RowMap_Internal = Kokkos::View< typename lno_row_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_row_view_t_>::array_layout, typename lno_row_view_t_::device_type, Kokkos::MemoryTraits >; using Entries_Internal = Kokkos::View< typename lno_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_nnz_view_t_>::array_layout, typename lno_nnz_view_t_::device_type, Kokkos::MemoryTraits >; using Values_Internal = Kokkos::View< typename scalar_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename KokkosKernels::Impl::GetUnifiedLayout< + scalar_nnz_view_t_>::array_layout, typename scalar_nnz_view_t_::device_type, Kokkos::MemoryTraits >; @@ -491,20 +551,20 @@ void sptrsv_solve_streams(const std::vector& execspace_v, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename XType::device_type, Kokkos::MemoryTraits >; - std::vector handle_i_v (execspace_v.size()); - std::vector rowmap_i_v (execspace_v.size()); - std::vector entries_i_v(execspace_v.size()); - 
std::vector values_i_v (execspace_v.size()); - std::vector b_i_v(execspace_v.size()); - std::vector x_i_v(execspace_v.size()); + std::vector handle_i_v(execspace_v.size()); + std::vector rowmap_i_v(execspace_v.size()); + std::vector entries_i_v(execspace_v.size()); + std::vector values_i_v(execspace_v.size()); + std::vector b_i_v(execspace_v.size()); + std::vector x_i_v(execspace_v.size()); for (int i = 0; i < static_cast(execspace_v.size()); i++) { handle_i_v[i] = const_handle_type(*(handle_v[i])); rowmap_i_v[i] = rowmap_v[i]; entries_i_v[i] = entries_v[i]; values_i_v[i] = values_v[i]; - b_i_v[i] = b_v[i]; - x_i_v[i] = x_v[i]; + b_i_v[i] = b_v[i]; + x_i_v[i] = x_v[i]; } auto sptrsv_handle = handle_v[0]->get_sptrsv_handle(); @@ -513,12 +573,17 @@ void sptrsv_solve_streams(const std::vector& execspace_v, // NOTE: assume all streams use the same SPTRSV_CUSPARSE algo. KokkosSparse::Impl::sptrsvcuSPARSE_solve_streams< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, - Values_Internal, BType_Internal, XType_Internal>(execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, x_i_v, false); - + Values_Internal, BType_Internal, XType_Internal>( + execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, + x_i_v, false); + } else { KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, - Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, x_i_v); + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, + rowmap_i_v, entries_i_v, + values_i_v, b_i_v, x_i_v); } } // sptrsv_solve_streams From ff664866d2fde88da22a05115319a012e009e167 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 4 May 2023 11:27:16 -0600 Subject: [PATCH 334/442] cm_test_all_sandia: load openblas/0.3.20/rocm/5.2.0 for TPL spot check on caraway --- scripts/cm_test_all_sandia | 23 
++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 7e5c135d7f..3a9a79b11d 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -692,17 +692,26 @@ elif [ "$MACHINE" = "caraway" ]; then # output description and success based only on build succes; build time output (no run-time) BASE_MODULE_LIST="cmake/3.19.3,/" + ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - ) + if [ "$SPOT_CHECK_TPLS" = "True" ]; then + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.2.0 $ROCM520_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + ) + else + # Format: (compiler module-list build-list exe-name warning-flag) + COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + ) + fi + + if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=VEGA908" From 9d95d49d13dad3e1670566497753e2effe79f403 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Thu, 4 May 2023 13:10:18 -0600 Subject: [PATCH 335/442] only enable KokkosBlas gesv test for CUDA+MAGMA and HOST+BLAS --- 
blas/unit_test/Test_Blas_gesv.hpp | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/blas/unit_test/Test_Blas_gesv.hpp b/blas/unit_test/Test_Blas_gesv.hpp index 81c94b9109..710102137e 100644 --- a/blas/unit_test/Test_Blas_gesv.hpp +++ b/blas/unit_test/Test_Blas_gesv.hpp @@ -13,13 +13,14 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -// Note: Luc Berger-Vergiat 04/15/21 -// This test should only be included -// in the CUDA backend if TPL MAGMA -// has been enabled. -#if !defined(TEST_CUDA_BLAS_CPP) || \ - (defined(TEST_CUDA_BLAS_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) +// only enable this test where KokkosBlas supports gesv: +// CUDA+MAGMA and HOST+BLAS +#if (defined(TEST_CUDA_BLAS_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) && \ + (defined(TEST_OPENMP_BLAS_CPP) || defined(TEST_OPENMPTARGET_BLAS_CPP) || \ + defined(TEST_SERIAL_BLAS_CPP) || defined(TEST_THREADS_BLAS_CPP))) #include #include @@ -128,9 +129,9 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { if (ats::abs(h_B(i) - h_X0(i)) > eps) { test_flag = false; // printf( " Error %d, pivot %c, padding %c: result( %.15lf ) != - // solution( %.15lf ) at (%ld)\n", N, mode[0], padding[0], - // ats::abs(h_B(i)), ats::abs(h_X0(i)), i ); - break; + // solution( %.15lf ) at (%d)\n", N, mode[0], padding[0], + // ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i) ); + // break; } } ASSERT_EQ(test_flag, true); @@ -337,9 +338,6 @@ int test_gesv_mrhs(const char* mode) { return 1; } -#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || \ - defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) - #if defined(KOKKOSKERNELS_INST_FLOAT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) @@ -414,6 +412,4 @@ TEST_F(TestCategory, gesv_mrhs_complex_float) { } #endif -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA || KOKKOSKERNELS_ENABLE_TPL_BLAS - -#endif // Check for TPL 
MAGMA when compiling the CUDA tests +#endif // CUDA+MAGMA or BLAS+HOST From 099d05784801bd3dc59cdbda158a02e2d293c71c Mon Sep 17 00:00:00 2001 From: "Roscoe A. Bartlett" Date: Wed, 19 Apr 2023 09:53:08 -0600 Subject: [PATCH 336/442] Run script remove_kokkos_subpackages_from_trilinos_packages_r.sh (#11545) This is the result of running the script: cd Trilinos/ ../commonTools/refactoring/refactors/remove_kokkos_subpackages_from_trilinos_packages_r.sh which calls remove_kokkos_subpackages_r.sh to absorb the refactoring of Kokkos to remove the usage of TriBITS subpackages. Manual changes may need to be made after this to remove duplication in various logic (see comments in the scripts). --- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index d3b393ddde..3c58d5b318 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms + LIB_REQUIRED_PACKAGES Kokkos Kokkos LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) From 0ba9eaa3a3a299fdd3658f90431924dfd07d9ea8 Mon Sep 17 00:00:00 2001 From: "Roscoe A. Bartlett" Date: Tue, 2 May 2023 17:01:15 -0600 Subject: [PATCH 337/442] Manually remove redundant Kokkos dep (#11545) Somehow, the script remove_kokkos_subpackages_r.sh did not remove this redundancy. (This is likely because Kokkos was at the end of the line with no space and no ')' char.) But this appears to be the only case in Trilinos where the redundant 'Kokkos Kokkos' was not removed by that script.
--- cmake/Dependencies.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 3c58d5b318..777d4445b3 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_REQUIRED_PACKAGES Kokkos Kokkos + LIB_REQUIRED_PACKAGES Kokkos LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) From cd242ba2f360d33faf56ce9a085666bc5fb2d587 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 5 May 2023 11:11:31 -0600 Subject: [PATCH 338/442] New performance test for par_ilut, ginkgo::par_ilut, and spill (#1799) par_ilut perf testing #1799 * par_ilut: make Ut_values view atomic in compute_l_u_factors ... to fix the race issues when async updates are on. * With Ut atomic, no need to avoid async updates on GPU * Remove unnecessary header * Adjust async update views; default it to off * Fix UtValuesSafeType * Forgot to set spiluk team size * Add test selection arg * Allow for ginkgo exec selection on GPU * Allow bitwise test selection * Big improvement in ginkgo link in cmake * Use reference executor as parent executor for cuda executor * Remove old template metafunc for atomic Ut type * New defaults * Update perf_test/sparse/KokkosSparse_par_ilut.cpp * Add support for reading in files in par_ilut perf test * Integrate par_ilut perf test with google benchmark * Change return type to auto so client can add settings * Add missing returns --- perf_test/Benchmark_Context.hpp | 31 +- perf_test/sparse/CMakeLists.txt | 17 + perf_test/sparse/KokkosSparse_par_ilut.cpp | 488 +++++++++++++++++++++ sparse/unit_test/Test_Sparse_par_ilut.hpp | 2 +- 4 files changed, 532 insertions(+), 6 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_par_ilut.cpp diff --git a/perf_test/Benchmark_Context.hpp b/perf_test/Benchmark_Context.hpp index 275a9a9ab2..adfc336576 100644 --- 
a/perf_test/Benchmark_Context.hpp +++ b/perf_test/Benchmark_Context.hpp @@ -117,26 +117,47 @@ inline void add_benchmark_context(bool verbose = false) { } template -inline void register_benchmark(const char* name, FuncType func, +inline auto register_benchmark(const char* name, FuncType func, std::vector arg_names, std::vector args, int repeat, ArgsToCallOp&&... func_args) { if (repeat > 0) { - benchmark::RegisterBenchmark(name, func, - std::forward(func_args)...) + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime() ->Iterations(repeat); } else { - benchmark::RegisterBenchmark(name, func, - std::forward(func_args)...) + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) ->ArgNames(arg_names) ->Args(args) ->UseManualTime(); } } +template +inline auto register_benchmark_real_time(const char* name, FuncType func, + std::vector arg_names, + std::vector args, int repeat, + ArgsToCallOp&&... func_args) { + if (repeat > 0) { + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) + ->ArgNames(arg_names) + ->Args(args) + ->UseRealTime() + ->Iterations(repeat); + } else { + return benchmark::RegisterBenchmark( + name, func, std::forward(func_args)...) + ->ArgNames(arg_names) + ->Args(args) + ->UseRealTime(); + } +} + } // namespace KokkosKernelsBenchmark #endif diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 6eac716aca..1fd965205a 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -115,3 +115,20 @@ KOKKOSKERNELS_ADD_EXECUTABLE( sparse_mdf SOURCES KokkosSparse_mdf.cpp ) + +if (KokkosKernels_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + sparse_par_ilut + SOURCES KokkosSparse_par_ilut.cpp + ) + + # Provide -DGinkgo_DIR to cmake to enable the ginkgo test in sparse_par_ilut. 
Ginkgo_DIR should + # point to the dir in the ginkgo install area that contains the GinkgoConfig.cmake file. + # For me, this was $gingko_install_dir/lib64/cmake/Ginkgo + if (Ginkgo_DIR) + find_package(Ginkgo REQUIRED) + + target_compile_definitions(sparse_par_ilut PRIVATE "USE_GINKGO") + target_link_libraries(sparse_par_ilut PRIVATE Ginkgo::ginkgo) + endif() +endif() diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp new file mode 100644 index 0000000000..15715fe5a5 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -0,0 +1,488 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include +#include +#include +#include +#include // std::setprecision + +#include + +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_spiluk.hpp" +#include "KokkosSparse_par_ilut.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosBlas1_nrm2.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_default_types.hpp" +#include +#include + +#include "Benchmark_Context.hpp" +#include + +#ifdef USE_GINKGO +#include +#endif + +namespace { + +using KokkosSparse::Experimental::par_ilut_numeric; +using KokkosSparse::Experimental::par_ilut_symbolic; + +using KokkosSparse::Experimental::spiluk_numeric; +using KokkosSparse::Experimental::spiluk_symbolic; +using KokkosSparse::Experimental::SPILUKAlgorithm; + +// Build up useful types +using scalar_t = default_scalar; +using lno_t = default_lno_t; +using size_type = default_size_type; +using exe_space = Kokkos::DefaultExecutionSpace; +using mem_space = typename exe_space::memory_space; +using device = Kokkos::Device; + +using RowMapType = Kokkos::View; +using EntriesType = Kokkos::View; +using ValuesType = Kokkos::View; + +using sp_matrix_type = + KokkosSparse::CrsMatrix; +using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; +using float_t = typename Kokkos::ArithTraits::mag_type; + +/////////////////////////////////////////////////////////////////////////////// +void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, + const sp_matrix_type& A, int& num_iters) +/////////////////////////////////////////////////////////////////////////////// +{ + const int rows = state.range(0); + + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Pull out views from CRS + auto A_row_map = A.graph.row_map; + auto A_entries = A.graph.entries; + auto A_values = A.values; + + // Allocate L and U CRS views as outputs + 
RowMapType L_row_map("L_row_map", rows + 1); + RowMapType U_row_map("U_row_map", rows + 1); + + // Initial L/U approximations for A + EntriesType L_entries("L_entries", 0); + ValuesType L_values("L_values", 0); + EntriesType U_entries("U_entries", 0); + ValuesType U_values("U_values", 0); + + size_type nnzL = 0; + size_type nnzU = 0; + for (auto _ : state) { + // Run par_ilut + state.ResumeTiming(); + par_ilut_symbolic(&kh, A_row_map, A_entries, L_row_map, U_row_map); + + nnzL = par_ilut_handle->get_nnzL(); + nnzU = par_ilut_handle->get_nnzU(); + + Kokkos::resize(L_entries, nnzL); + Kokkos::resize(U_entries, nnzU); + Kokkos::resize(L_values, nnzL); + Kokkos::resize(U_values, nnzU); + Kokkos::deep_copy(L_entries, 0); + Kokkos::deep_copy(U_entries, 0); + Kokkos::deep_copy(L_values, 0); + Kokkos::deep_copy(U_values, 0); + + par_ilut_numeric(&kh, A_row_map, A_entries, A_values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values); + Kokkos::fence(); + + state.PauseTiming(); + + // Check worked + num_iters = par_ilut_handle->get_num_iters(); + KK_REQUIRE_MSG(num_iters < par_ilut_handle->get_max_iter(), + "par_ilut hit max iters"); + + // Reset inputs + Kokkos::deep_copy(L_row_map, 0); + Kokkos::deep_copy(U_row_map, 0); + + std::cout << "Finished par_ilut run" << std::endl; + } +} + +#ifdef USE_GINKGO +/////////////////////////////////////////////////////////////////////////////// +using ginkgo_exec = + std::conditional_t(), + gko::CudaExecutor, gko::OmpExecutor>; + +template +std::shared_ptr get_ginkgo_exec() { + return GinkgoT::create(); +} + +template <> +std::shared_ptr get_ginkgo_exec() { + auto ref_exec = gko::ReferenceExecutor::create(); + return gko::CudaExecutor::create(0 /*device id*/, ref_exec); +} + +/////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////// +void run_par_ilut_test_ginkgo(benchmark::State& state, KernelHandle& kh, + const 
sp_matrix_type& A, const int& num_iters) +/////////////////////////////////////////////////////////////////////////////// +{ + auto par_ilut_handle = kh.get_par_ilut_handle(); + + // Pull out views from CRS + auto A_row_map = A.graph.row_map; + auto A_entries = A.graph.entries; + auto A_values = A.values; + + using mtx = gko::matrix::Csr; + + auto exec = get_ginkgo_exec(); + + // ginkgo does not differentiate between index type and size type. We need + // to convert A_row_map to lno_t. + EntriesType A_row_map_cp("A_row_map_cp", rows + 1); + Kokkos::deep_copy(A_row_map_cp, A_row_map); + + // Populate mtx + auto a_mtx_uniq = + mtx::create_const(exec, gko::dim<2>(rows, rows), + gko::array::const_view( + exec, A_values.extent(0), A_values.data()), + gko::array::const_view(exec, A_entries.extent(0), + A_entries.data()), + gko::array::const_view( + exec, A_row_map_cp.extent(0), A_row_map_cp.data())); + + std::shared_ptr a_mtx = std::move(a_mtx_uniq); + + for (auto _ : state) { + auto fact = gko::factorization::ParIlut::build() + .with_fill_in_limit(par_ilut_handle->get_fill_in_limit()) + .with_approximate_select(false) + .with_iterations(num_iters) + .on(exec) + ->generate(a_mtx); + + // Report run so user knows something is happening + std::cout << "GINKGO Finished a run " << std::endl; + } +} +#endif + +/////////////////////////////////////////////////////////////////////////////// +void run_spiluk_test(benchmark::State& state, KernelHandle& kh, + const sp_matrix_type& A, const int& team_size, + const bool measure_symbolic) +/////////////////////////////////////////////////////////////////////////////// +{ + const int rows = state.range(0); + + constexpr int EXPAND_FACT = 10; + const lno_t fill_lev = 2; + const size_type handle_nnz = EXPAND_FACT * A.nnz() * (fill_lev + 1); + kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, rows, handle_nnz, + handle_nnz); + auto spiluk_handle = kh.get_spiluk_handle(); + spiluk_handle->set_team_size(team_size); + + // Pull out 
views from CRS + auto A_row_map = A.graph.row_map; + auto A_entries = A.graph.entries; + auto A_values = A.values; + + // Allocate L and U CRS views as outputs + RowMapType L_row_map("L_row_map", rows + 1); + RowMapType U_row_map("U_row_map", rows + 1); + + // Initial L/U approximations for A + EntriesType L_entries("L_entries", handle_nnz); + ValuesType L_values("L_values", handle_nnz); + EntriesType U_entries("U_entries", handle_nnz); + ValuesType U_values("U_values", handle_nnz); + + for (auto _ : state) { + state.PauseTiming(); + + if (measure_symbolic) { + state.ResumeTiming(); + } + spiluk_symbolic(&kh, fill_lev, A_row_map, A_entries, L_row_map, L_entries, + U_row_map, U_entries); + Kokkos::fence(); + state.PauseTiming(); + + const size_type nnzL = spiluk_handle->get_nnzL(); + const size_type nnzU = spiluk_handle->get_nnzU(); + + Kokkos::resize(L_entries, nnzL); + Kokkos::resize(U_entries, nnzU); + Kokkos::resize(L_values, nnzL); + Kokkos::resize(U_values, nnzU); + + if (!measure_symbolic) { + state.ResumeTiming(); + } + spiluk_numeric(&kh, fill_lev, A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, U_row_map, U_entries, U_values); + Kokkos::fence(); + state.PauseTiming(); + + // Reset inputs + Kokkos::deep_copy(L_row_map, 0); + Kokkos::deep_copy(U_row_map, 0); + Kokkos::deep_copy(L_entries, 0); + Kokkos::deep_copy(U_entries, 0); + Kokkos::deep_copy(L_values, 0); + Kokkos::deep_copy(U_values, 0); + Kokkos::resize(L_entries, handle_nnz); + Kokkos::resize(U_entries, handle_nnz); + + spiluk_handle->reset_handle(rows, handle_nnz, handle_nnz); + + std::cout << "Finished spiluk run" << std::endl; + } +} + +/////////////////////////////////////////////////////////////////////////////// +int test_par_ilut_perf(const std::string& matrix_file, int rows, + const int nnz_per_row, const int bandwidth, + const int team_size, const int loop, const int test) +/////////////////////////////////////////////////////////////////////////////// +{ + KernelHandle kh; 
+ kh.create_par_ilut_handle(); + + // Generate or read A + sp_matrix_type A; + if (matrix_file == "") { + size_type nnz = rows * nnz_per_row; + const lno_t row_size_variance = 0; + const scalar_t diag_dominance = 1; + A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + sp_matrix_type>(rows, rows, nnz, row_size_variance, bandwidth, + diag_dominance); + } else { + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + matrix_file.c_str()); + rows = A.numRows(); + } + + KokkosSparse::sort_crs_matrix(A); + + // Make handles + auto par_ilut_handle = kh.get_par_ilut_handle(); + par_ilut_handle->set_team_size(team_size); + par_ilut_handle->set_nrows(rows); + + const auto default_policy = par_ilut_handle->get_default_team_policy(); + + // Report test config to user + if (matrix_file == "") { + std::cout << "Testing par_ilut with rows=" << rows + << "\n nnz_per_row=" << nnz_per_row + << "\n bandwidth=" << bandwidth; + } else { + std::cout << "Testing par_ilut with input matrix=" << matrix_file; + } + std::cout << "\n total nnz=" << A.nnz() + << "\n league_size=" << default_policy.league_size() + << "\n team_size=" << default_policy.team_size() + << "\n concurrent teams=" + << exe_space().concurrency() / default_policy.team_size() + << "\n loop=" << loop << std::endl; + + std::string name = "KokkosSparse_par_ilut"; + int num_iters = 6; + const auto arg_names = std::vector{"rows"}; + const auto args = std::vector{rows}; + + if (test & 1) { + auto plambda = [&](benchmark::State& state) { + run_par_ilut_test(state, kh, A, num_iters); + }; + KokkosKernelsBenchmark::register_benchmark((name + "_par_ilut").c_str(), + plambda, arg_names, args, loop) + ->UseRealTime(); + } + +#ifdef USE_GINKGO + if (test & 2) { + auto glambda = [&](benchmark::State& state) { + run_par_ilut_test_ginkgo(state, kh, A, num_iters); + }; + KokkosKernelsBenchmark::register_benchmark((name + "_gingko").c_str(), + glambda, arg_names, args, loop) + ->UseRealTime(); + } +#endif + + if (test & 
4) { + auto s1lambda = [&](benchmark::State& state) { + run_spiluk_test(state, kh, A, team_size, true); + }; + auto s2lambda = [&](benchmark::State& state) { + run_spiluk_test(state, kh, A, team_size, false); + }; + KokkosKernelsBenchmark::register_benchmark( + (name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop) + ->UseRealTime(); + + KokkosKernelsBenchmark::register_benchmark( + (name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop) + ->UseRealTime(); + } + + // Need to run before vars used by lambdas go out of scope + benchmark::RunSpecifiedBenchmarks(); + + return 0; +} + +} // namespace + +/////////////////////////////////////////////////////////////////////////////// +void print_help_par_ilut() +/////////////////////////////////////////////////////////////////////////////// +{ + printf("Options:\n"); + printf(" -f [F] : Read in Matrix Market formatted text file.\n"); + printf(" -n [N] : generate a semi-random banded NxN matrix.\n"); + printf(" -z [Z] : number nnz per row. Default is min(1%% of N, 50).\n"); + printf(" -b [B] : bandwidth per row. Default is max(2 * n^(1/2), nnz).\n"); + printf( + " -ts [T] : Number of threads per team. Default is 1 on OpenMP, " + "nnz_per_row on CUDA\n"); + // printf(" -vl [V] : Vector-length (i.e. how many Cuda threads are a Kokkos + // 'thread').\n"); + printf( + " -l [L] : How many runs to aggregate average time. Default is 4\n\n"); + printf( + " -t [T] : Which tests to run. Bitwise. e.g. 7 => run all, 1 => " + "par_ilut, 2 => ginkgo, 4 => spiluk,. 
Default is 7\n\n"); +} + +/////////////////////////////////////////////////////////////////////////////// +void handle_int_arg(int argc, char** argv, int& i, + std::map option_map) +/////////////////////////////////////////////////////////////////////////////// +{ + std::string arg = argv[i]; + auto it = option_map.find(arg); + if (it == option_map.end()) { + throw std::runtime_error(std::string("Unknown option: ") + arg); + } + if (i + 1 == argc) { + throw std::runtime_error(std::string("Missing option value for option: ") + + arg); + } + *(it->second) = atoi(argv[++i]); +} + +/////////////////////////////////////////////////////////////////////////////// +int main(int argc, char** argv) +/////////////////////////////////////////////////////////////////////////////// +{ + std::string mfile = ""; + int rows = -1; + int nnz_per_row = + -1; // depends on other options, so don't set to default yet + int bandwidth = -1; + int team_size = -1; + int loop = 4; + int test = 7; + + std::map option_map = { + {"-n", &rows}, {"-z", &nnz_per_row}, {"-b", &bandwidth}, + {"-ts", &team_size}, {"-l", &loop}, {"-t", &test}}; + + if (argc == 1) { + print_help_par_ilut(); + return 0; + } + + // Handle user options + for (int i = 1; i < argc; i++) { + if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { + print_help_par_ilut(); + return 0; + } else if ((strcmp(argv[i], "-f") == 0)) { + mfile = argv[++i]; + } else { + handle_int_arg(argc, argv, i, option_map); + } + } + + // Determine where A is coming from + if (rows != -1) { + // We are randomly generating the input A + if (rows < 100) { + throw std::runtime_error("Need to have at least 100 rows"); + } + if (mfile != "") { + throw std::runtime_error( + "Need provide either -n or -f argument to this program, not both"); + } + } else { + // We are reading A from a file + if (mfile == "") { + throw std::runtime_error( + "Need provide either -n or -f argument to this program"); + } + } + + // Set dependent defaults + if 
(nnz_per_row == -1) { + nnz_per_row = std::min(rows / 100, 50); + } + if (bandwidth == -1) { + bandwidth = std::max(2 * (int)std::sqrt(rows), 2 * nnz_per_row); + } + if (team_size == -1) { + team_size = KokkosKernels::Impl::kk_is_gpu_exec_space() + ? nnz_per_row + : 1; + } + + Kokkos::initialize(argc, argv); + { + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + test_par_ilut_perf(mfile, rows, nnz_per_row, bandwidth, team_size, loop, + test); + + benchmark::Shutdown(); + } + Kokkos::finalize(); + return 0; +} diff --git a/sparse/unit_test/Test_Sparse_par_ilut.hpp b/sparse/unit_test/Test_Sparse_par_ilut.hpp index 9b99c1000d..4370ebe37e 100644 --- a/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -304,7 +304,7 @@ void run_test_par_ilut_precond() { constexpr auto diagDominance = 1; constexpr bool verbose = false; - typename sp_matrix_type::non_const_size_type nnz = 10 * numRows; + size_type nnz = 10 * numRows; auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), diagDominance); From 4e6c85c39da395c72647fa235320a7a3b9807ba6 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 5 May 2023 12:31:38 -0600 Subject: [PATCH 339/442] Docs: adding stubs for trsm and trmm and updating gemv and gemm --- docs/developer/apidocs/blas2.rst | 2 +- docs/developer/apidocs/blas3.rst | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst index 1d9a3f3fa7..0947eeadda 100644 --- a/docs/developer/apidocs/blas2.rst +++ b/docs/developer/apidocs/blas2.rst @@ -4,4 +4,4 @@ BLAS2 -- KokkosKernels blas2 interfaces gemv ---- .. 
doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) -.. doxygenfunction:: KokkosBlas::gemv(const typename AViewType::execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) +.. doxygenfunction:: KokkosBlas::gemv(const execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst index 3fa4e3e9c7..8303c9f17e 100644 --- a/docs/developer/apidocs/blas3.rst +++ b/docs/developer/apidocs/blas3.rst @@ -3,5 +3,9 @@ BLAS3 -- KokkosKernels blas3 interfaces gemm ---- +.. doxygenfunction:: KokkosBlas::gemm(const execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) .. doxygenfunction:: KokkosBlas::gemm(const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) -.. doxygenfunction:: KokkosBlas::gemm(const typename CViewType::execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) +.. doxygenfunction:: KokkosBlas::trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) +.. 
doxygenfunction:: KokkosBlas::trmm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) +.. doxygenfunction:: KokkosBlas::trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) +.. doxygenfunction:: KokkosBlas::trsm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) From ab0f774cda176b265c8e3abb22bfe09e1b58eee6 Mon Sep 17 00:00:00 2001 From: brian-kelley Date: Fri, 5 May 2023 14:46:36 -0600 Subject: [PATCH 340/442] Workaround for #1777 - cusparse spgemm test hang (#1811) Disable issue 1738 test in spgemm, if in cuda 11.0-11.3 and cusparse is enabled. For some reason (that appears to be a compiler bug?) _other_ spgemm tests hang after this particular unit test is run. --- sparse/unit_test/Test_Sparse_spgemm.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sparse/unit_test/Test_Sparse_spgemm.hpp b/sparse/unit_test/Test_Sparse_spgemm.hpp index bd1e68c370..7e655d4c0c 100644 --- a/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -486,6 +486,16 @@ void test_issue402() { template void test_issue1738() { +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUDA_VERSION >= 11000) && \ + (CUDA_VERSION < 11040) + { + std::cerr + << "TEST SKIPPED: See " + "https://github.com/kokkos/kokkos-kernels/issues/1777 for details." + << std::endl; + return; + } +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL // Make sure that std::invalid_argument is thrown if you: // - call numeric where an input matrix's entries have changed. 
// - try to reuse an spgemm handle by calling symbolic with new input From b60e681daedf7df61cbc453c526960ff9706045f Mon Sep 17 00:00:00 2001 From: James Foucar Date: Fri, 5 May 2023 12:37:12 -0600 Subject: [PATCH 341/442] Reorganize par_ilut performance test I saw a big (~2s) discrepency between the times google benchmark was measuring for me with UseRealTime and the times I was getting with a manual Kokkos timer, so I went back to my previous approach of manual timings and itegrated them with google benchmark via UseManualTime. To reduce some code duplication I added a generic time_call function to time a lambda call. --- perf_test/sparse/CMakeLists.txt | 4 +- perf_test/sparse/KokkosSparse_par_ilut.cpp | 129 ++++++++++++++------- 2 files changed, 86 insertions(+), 47 deletions(-) diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 1fd965205a..f63560e0f4 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -128,7 +128,7 @@ if (KokkosKernels_ENABLE_BENCHMARK) if (Ginkgo_DIR) find_package(Ginkgo REQUIRED) - target_compile_definitions(sparse_par_ilut PRIVATE "USE_GINKGO") - target_link_libraries(sparse_par_ilut PRIVATE Ginkgo::ginkgo) + target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO") + target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo) endif() endif() diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp index 15715fe5a5..e133c9ade7 100644 --- a/perf_test/sparse/KokkosSparse_par_ilut.cpp +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -70,6 +70,35 @@ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; using float_t = typename Kokkos::ArithTraits::mag_type; +/////////////////////////////////////////////////////////////////////////////// +template +void time_call(L& lam, State& state, const std::string& name) 
+/////////////////////////////////////////////////////////////////////////////// +{ + Kokkos::Timer timer; + double min_time = std::numeric_limits::infinity(); + double max_time = 0.0; + double ave_time = 0.0; + + for (auto _ : state) { + // Run timable thing + double time = lam(); + + // Record time + ave_time += time; + if (time > max_time) max_time = time; + if (time < min_time) min_time = time; + state.SetIterationTime(time); + + // Report run so user knows something is happening + std::cout << name << " Finished a run in: " << time << " seconds" << std::endl; + } + + std::cout << name << " LOOP_AVG_TIME: " << ave_time / state.iterations() << std::endl; + std::cout << name << " LOOP_MAX_TIME: " << max_time << std::endl; + std::cout << name << " LOOP_MIN_TIME: " << min_time << std::endl; +} + /////////////////////////////////////////////////////////////////////////////// void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, const sp_matrix_type& A, int& num_iters) @@ -94,15 +123,13 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, EntriesType U_entries("U_entries", 0); ValuesType U_values("U_values", 0); - size_type nnzL = 0; - size_type nnzU = 0; - for (auto _ : state) { - // Run par_ilut - state.ResumeTiming(); + auto plambda = [&]() { + Kokkos::Timer timer; + timer.reset(); par_ilut_symbolic(&kh, A_row_map, A_entries, L_row_map, U_row_map); - nnzL = par_ilut_handle->get_nnzL(); - nnzU = par_ilut_handle->get_nnzU(); + size_type nnzL = par_ilut_handle->get_nnzL(); + size_type nnzU = par_ilut_handle->get_nnzU(); Kokkos::resize(L_entries, nnzL); Kokkos::resize(U_entries, nnzU); @@ -116,8 +143,7 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, par_ilut_numeric(&kh, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); Kokkos::fence(); - - state.PauseTiming(); + const double time = timer.seconds(); // Check worked num_iters = par_ilut_handle->get_num_iters(); @@ -128,8 
+154,11 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, Kokkos::deep_copy(L_row_map, 0); Kokkos::deep_copy(U_row_map, 0); - std::cout << "Finished par_ilut run" << std::endl; - } + // Return time + return time; + }; + + time_call(plambda, state, "PAR_ILUT"); } #ifdef USE_GINKGO @@ -156,6 +185,8 @@ void run_par_ilut_test_ginkgo(benchmark::State& state, KernelHandle& kh, const sp_matrix_type& A, const int& num_iters) /////////////////////////////////////////////////////////////////////////////// { + const int rows = state.range(0); + auto par_ilut_handle = kh.get_par_ilut_handle(); // Pull out views from CRS @@ -184,7 +215,10 @@ void run_par_ilut_test_ginkgo(benchmark::State& state, KernelHandle& kh, std::shared_ptr a_mtx = std::move(a_mtx_uniq); - for (auto _ : state) { + auto plambda = [&]() { + Kokkos::Timer timer; + timer.reset(); + auto fact = gko::factorization::ParIlut::build() .with_fill_in_limit(par_ilut_handle->get_fill_in_limit()) .with_approximate_select(false) @@ -192,9 +226,11 @@ void run_par_ilut_test_ginkgo(benchmark::State& state, KernelHandle& kh, .on(exec) ->generate(a_mtx); - // Report run so user knows something is happening - std::cout << "GINKGO Finished a run " << std::endl; - } + // Return time + return timer.seconds(); + }; + + time_call(plambda, state, "GINKGO"); } #endif @@ -229,16 +265,16 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh, EntriesType U_entries("U_entries", handle_nnz); ValuesType U_values("U_values", handle_nnz); - for (auto _ : state) { - state.PauseTiming(); - - if (measure_symbolic) { - state.ResumeTiming(); - } + auto plambda = [&]() { + Kokkos::Timer timer; + double time; + timer.reset(); spiluk_symbolic(&kh, fill_lev, A_row_map, A_entries, L_row_map, L_entries, U_row_map, U_entries); Kokkos::fence(); - state.PauseTiming(); + if (measure_symbolic) { + time = timer.seconds(); + } const size_type nnzL = spiluk_handle->get_nnzL(); const size_type nnzU = spiluk_handle->get_nnzU(); @@ 
-249,12 +285,12 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh, Kokkos::resize(U_values, nnzU); if (!measure_symbolic) { - state.ResumeTiming(); + timer.reset(); + spiluk_numeric(&kh, fill_lev, A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, U_row_map, U_entries, U_values); + Kokkos::fence(); + time = timer.seconds(); } - spiluk_numeric(&kh, fill_lev, A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, U_values); - Kokkos::fence(); - state.PauseTiming(); // Reset inputs Kokkos::deep_copy(L_row_map, 0); @@ -268,14 +304,17 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh, spiluk_handle->reset_handle(rows, handle_nnz, handle_nnz); - std::cout << "Finished spiluk run" << std::endl; - } + return time; + }; + + std::string name = std::string("SPILUK_") + (measure_symbolic ? "SYM" : "NUM"); + time_call(plambda, state, name); } /////////////////////////////////////////////////////////////////////////////// int test_par_ilut_perf(const std::string& matrix_file, int rows, - const int nnz_per_row, const int bandwidth, - const int team_size, const int loop, const int test) + int nnz_per_row, const int bandwidth, + int team_size, const int loop, const int test) /////////////////////////////////////////////////////////////////////////////// { KernelHandle kh; @@ -294,6 +333,14 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, A = KokkosSparse::Impl::read_kokkos_crst_matrix( matrix_file.c_str()); rows = A.numRows(); + nnz_per_row = A.nnz() / rows; + } + + // Now that we have A, we can set team_size + if (team_size == -1) { + team_size = KokkosKernels::Impl::kk_is_gpu_exec_space() + ? 
nnz_per_row + : 1; } KokkosSparse::sort_crs_matrix(A); @@ -330,8 +377,7 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, run_par_ilut_test(state, kh, A, num_iters); }; KokkosKernelsBenchmark::register_benchmark((name + "_par_ilut").c_str(), - plambda, arg_names, args, loop) - ->UseRealTime(); + plambda, arg_names, args, loop); } #ifdef USE_GINKGO @@ -340,8 +386,7 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, run_par_ilut_test_ginkgo(state, kh, A, num_iters); }; KokkosKernelsBenchmark::register_benchmark((name + "_gingko").c_str(), - glambda, arg_names, args, loop) - ->UseRealTime(); + glambda, arg_names, args, loop); } #endif @@ -353,12 +398,10 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, run_spiluk_test(state, kh, A, team_size, false); }; KokkosKernelsBenchmark::register_benchmark( - (name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop) - ->UseRealTime(); + (name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop); KokkosKernelsBenchmark::register_benchmark( - (name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop) - ->UseRealTime(); + (name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop); } // Need to run before vars used by lambdas go out of scope @@ -459,18 +502,14 @@ int main(int argc, char** argv) } } - // Set dependent defaults + // Set dependent defaults. Default team_size cannot be set + // until we know more about A if (nnz_per_row == -1) { nnz_per_row = std::min(rows / 100, 50); } if (bandwidth == -1) { bandwidth = std::max(2 * (int)std::sqrt(rows), 2 * nnz_per_row); } - if (team_size == -1) { - team_size = KokkosKernels::Impl::kk_is_gpu_exec_space() - ? 
nnz_per_row - : 1; - } Kokkos::initialize(argc, argv); { From 6e80b37f96a1d5e6669419f060012a578e9b3957 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Sun, 7 May 2023 15:03:29 -0600 Subject: [PATCH 342/442] formatting --- perf_test/sparse/KokkosSparse_par_ilut.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp index e133c9ade7..44557a5a51 100644 --- a/perf_test/sparse/KokkosSparse_par_ilut.cpp +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -91,10 +91,12 @@ void time_call(L& lam, State& state, const std::string& name) state.SetIterationTime(time); // Report run so user knows something is happening - std::cout << name << " Finished a run in: " << time << " seconds" << std::endl; + std::cout << name << " Finished a run in: " << time << " seconds" + << std::endl; } - std::cout << name << " LOOP_AVG_TIME: " << ave_time / state.iterations() << std::endl; + std::cout << name << " LOOP_AVG_TIME: " << ave_time / state.iterations() + << std::endl; std::cout << name << " LOOP_MAX_TIME: " << max_time << std::endl; std::cout << name << " LOOP_MIN_TIME: " << min_time << std::endl; } @@ -307,14 +309,15 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh, return time; }; - std::string name = std::string("SPILUK_") + (measure_symbolic ? "SYM" : "NUM"); + std::string name = + std::string("SPILUK_") + (measure_symbolic ? 
"SYM" : "NUM"); time_call(plambda, state, name); } /////////////////////////////////////////////////////////////////////////////// int test_par_ilut_perf(const std::string& matrix_file, int rows, - int nnz_per_row, const int bandwidth, - int team_size, const int loop, const int test) + int nnz_per_row, const int bandwidth, int team_size, + const int loop, const int test) /////////////////////////////////////////////////////////////////////////////// { KernelHandle kh; @@ -332,7 +335,7 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, } else { A = KokkosSparse::Impl::read_kokkos_crst_matrix( matrix_file.c_str()); - rows = A.numRows(); + rows = A.numRows(); nnz_per_row = A.nnz() / rows; } @@ -398,10 +401,10 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, run_spiluk_test(state, kh, A, team_size, false); }; KokkosKernelsBenchmark::register_benchmark( - (name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop); + (name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop); KokkosKernelsBenchmark::register_benchmark( - (name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop); + (name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop); } // Need to run before vars used by lambdas go out of scope From 954750d0c09075cb5386e6bec00c8b6e0ca4633f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 8 May 2023 10:58:28 -0600 Subject: [PATCH 343/442] rocblas tpl spec: add missing comma separating vars in some macros --- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 9fb67e726b..33e18ab843 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -258,7 +258,7 @@ namespace Impl { #define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, 
ROCBLAS_FN, LAYOUT, EXECSPACE, \ - MEMSPACE ETI_SPEC_AVAIL) \ + MEMSPACE, ETI_SPEC_AVAIL) \ template <> \ struct Scal< \ EXECSPACE, \ @@ -319,18 +319,18 @@ namespace Impl { ETI_SPEC_AVAIL) #define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ - MEMSPACE ETI_SPEC_AVAIL) \ + MEMSPACE, ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ - MEMSPACE ETI_SPEC_AVAIL) \ + MEMSPACE, ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) #define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ - MEMSPACE ETI_SPEC_AVAIL) \ + MEMSPACE, ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) From 4f1abd7942803fb7bac72f0ee56693d332c35c44 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 8 May 2023 11:02:12 -0600 Subject: [PATCH 344/442] apply clang-format --- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 33e18ab843..c09839edb2 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -257,8 +257,8 @@ namespace KokkosBlas { namespace Impl { #define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ + SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ template <> \ struct Scal< \ EXECSPACE, \ @@ -318,21 +318,21 @@ namespace Impl { LAYOUT, EXECSPACE, MEMSPACE, \ ETI_SPEC_AVAIL) -#define 
KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_SSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(float, float, rocblas_sscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ Kokkos::complex, rocblas_double_complex, rocblas_zscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) -#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ - Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ +#define KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::complex, rocblas_float_complex, rocblas_cscal, LAYOUT, \ EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, From 146ce522f74021d9e5506f943e893682c90f2a06 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 8 May 2023 12:12:24 -0600 Subject: [PATCH 345/442] blas: various rocblas execspace fixes --- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 14 ++++++++------ blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 10 +++++----- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 082bec8135..7c1e1a7de2 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -102,7 +102,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, enum : bool { value = true }; \ }; 
-KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCUBLAS(double, Kokkos::LayoutLeft, +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 96b704321f..5510e4c08b 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -415,6 +415,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -444,7 +445,7 @@ namespace Impl { rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -454,6 +455,7 @@ namespace Impl { ETI_SPEC_AVAIL) \ template \ struct Nrm1< \ + ExecSpace, \ Kokkos::View >, \ Kokkos::View, \ @@ -483,7 +485,7 @@ namespace Impl { rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -492,7 +494,7 @@ namespace Impl { #define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) \ template \ - struct Nrm1 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -525,7 +527,7 @@ namespace Impl { R.data())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -534,7 +536,7 @@ namespace Impl { #define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ ETI_SPEC_AVAIL) \ template \ - struct Nrm1 >, \ Kokkos::View*, LAYOUT, \ Kokkos::Device, \ @@ -567,7 +569,7 @@ namespace Impl { R.data())); \ 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index c09839edb2..9a8ef052fa 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -262,21 +262,21 @@ namespace Impl { template <> \ struct Scal< \ EXECSPACE, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ SCALAR_TYPE, \ Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits >, \ 1, true, ETI_SPEC_AVAIL> { \ using execution_space = EXECSPACE; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ RV; \ typedef SCALAR_TYPE AS; \ typedef Kokkos::View, \ + Kokkos::Device, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ @@ -306,7 +306,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_pointer_mode(s.handle, pointer_mode)); \ } else { \ - Scal::scal(R, alpha, X); \ + Scal::scal(space, R, alpha, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ From b8a22cc6c30b4a4c58f099010cfce228de78ac09 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 8 May 2023 12:22:40 -0600 Subject: [PATCH 346/442] blas: fixups for ger exec space instances --- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index edcbeed72c..c8bfdd438f 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -163,16 +163,17 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, #define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ template <> \ - struct ger_tpl_spec_avail < EXEC_SPACE, \ + struct ger_tpl_spec_avail < \ + EXEC_SPACE, \ 
Kokkos::View < const SCALAR*, LAYOUT, \ - Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View < const SCALAR*, LAYOUT, \ - Kokkos::Device, \ Kokkos::MemoryTraits >, \ Kokkos::View< \ SCALAR**, LAYOUT, \ - Kokkos::Device, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; From 725b46b89cff09ae71f6793c8326fac419a39ec2 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 8 May 2023 12:24:28 -0600 Subject: [PATCH 347/442] apply clang-format --- blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 2 +- blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp | 166 +++++++++--------- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 3 +- blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp | 30 ++-- 4 files changed, 101 insertions(+), 100 deletions(-) diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 7c1e1a7de2..04ec811990 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -103,7 +103,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, }; KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, - Kokkos::HIPSpace) + Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 5510e4c08b..b5b6e061ec 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -445,7 +445,7 @@ namespace Impl { rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -485,94 +485,96 @@ namespace Impl { rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::nrm1(space, R, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( \ - s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < 
static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( \ + s.handle, N, \ + reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( \ - s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Nrm1 >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ 
+ 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ + RV; \ + typedef Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + nrm1_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( \ + s.handle, N, \ + reinterpret_cast(X.data()), one, \ + R.data())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + Nrm1::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 9a8ef052fa..4771f0862e 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -306,7 +306,8 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_set_pointer_mode(s.handle, pointer_mode)); \ } else { \ - Scal::scal(space, R, alpha, X); \ + Scal::scal(space, R, \ + alpha, X); \ } \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp index c8bfdd438f..b672c690d5 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_avail.hpp @@ -160,22 +160,20 @@ KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS -#define 
KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - template <> \ - struct ger_tpl_spec_avail < \ - EXEC_SPACE, \ - Kokkos::View < const SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View < const SCALAR*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View< \ - SCALAR**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct ger_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSBLAS2_GER_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, From 924cdee42bc9961511fbd653c3e020a8756fd2d4 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 8 May 2023 13:03:26 -0700 Subject: [PATCH 348/442] Add unit test for sptrsv via streams --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 15 +- sparse/unit_test/Test_Sparse_sptrsv.hpp | 264 ++++++++++++++++++ 2 files changed, 276 insertions(+), 3 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index a1a1c85e35..6469bfe9a2 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -476,11 +476,11 @@ void sptrsvcuSPARSE_solve_streams( const bool is_idx_type_supported = std::is_same::value || std::is_same::value; - if (!is_cuda_space) { + if constexpr(!is_cuda_space) { throw std::runtime_error( "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED " "IN GPU DEVICE for CUSPARSE\n"); - } else if (!is_idx_type_supported) { + } else if constexpr(!is_idx_type_supported) { throw std::runtime_error( "CUSPARSE requires local ordinals to be integer (32 bits or 64 " "bits).\n"); 
@@ -525,7 +525,16 @@ void sptrsvcuSPARSE_solve_streams( } } #else // CUDA_VERSION < 11030 - if (!std::is_same::value) { + const bool is_cuda_space = + std::is_same::value || + std::is_same::value || + std::is_same::value; + + if constexpr(!is_cuda_space) { + throw std::runtime_error( + "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED " + "IN GPU DEVICE for CUSPARSE\n"); + } else if constexpr(!std::is_same::value) { throw std::runtime_error( "CUSPARSE requires local ordinals to be integer.\n"); } else { diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 8b3e64ecae..7b4cc3181a 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -1049,6 +1049,234 @@ void run_test_sptrsv() { } } +template +void run_test_sptrsv_streams(int test_algo, int nstreams) { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; + using crsMat_t = CrsMatrix; + + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); + + const size_type nrows = 5; + const size_type nnz = 10; + + std::vector instances; + if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else // (nstreams == 4) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector 
kh_ptr_v(nstreams); + std::vector row_map_v(nstreams); + std::vector entries_v(nstreams); + std::vector values_v(nstreams); + std::vector rhs_v(nstreams); + std::vector lhs_v(nstreams); + + RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + ValuesType_hostmirror hvalues("hvalues", nnz); + + // Upper tri + { + hrow_map(0) = 0; + hrow_map(1) = 2; + hrow_map(2) = 4; + hrow_map(3) = 7; + hrow_map(4) = 9; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 2; + hentries(2) = 1; + hentries(3) = 4; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 4; + hentries(7) = 3; + hentries(8) = 4; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + for (int i = 0; i < nstreams; i++) { + // Allocate U + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = false; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + 
kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic( kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i] ); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams( instances, kh_ptr_v, row_map_v, entries_v, values_v, rhs_v, lhs_v ); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, lhs_v[i].extent(0)), ReductionCheck(lhs_v[i]), sum); + if (sum != lhs_v[i].extent(0)) { + std::cout << "Upper Tri Solve FAILURE on stream " << i << std::endl; + kh_v[i].get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + + kh_v[i].destroy_sptrsv_handle(); + } + } + + // Lower tri + { + hrow_map(0) = 0; + hrow_map(1) = 1; + hrow_map(2) = 2; + hrow_map(3) = 4; + hrow_map(4) = 6; + hrow_map(5) = 10; + + hentries(0) = 0; + hentries(1) = 1; + hentries(2) = 0; + hentries(3) = 2; + hentries(4) = 2; + hentries(5) = 3; + hentries(6) = 1; + hentries(7) = 2; + hentries(8) = 3; + hentries(9) = 4; + + for (size_type i = 0; i < nnz; ++i) { + hvalues(i) = ONE; + } + + for (int i = 0; i < nstreams; i++) { + // Allocate L + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + 
rhs_v[i] = ValuesType("rhs", nrows); + + crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = true; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic( kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i] ); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams( instances, kh_ptr_v, row_map_v, entries_v, values_v, rhs_v, lhs_v ); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0, lhs_v[i].extent(0)), ReductionCheck(lhs_v[i]), sum); + if (sum != lhs_v[i].extent(0)) { + std::cout << "Lower Tri Solve FAILURE on stream " << i << std::endl; + kh_v[i].get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + + kh_v[i].destroy_sptrsv_handle(); + } + } +} + } // namespace Test template (); } +template +void test_sptrsv_streams() { + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 2); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 3); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(0, 4); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 2); + + std::cout << 
"SPTRSVAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 3); + + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(1, 4); + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + if (std::is_same::value && std::is_same::value) { + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 2); + + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 3 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 3); + + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 4 streams" << std::endl; + Test::run_test_sptrsv_streams(2, 4); + } +#endif +} + #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##sptrsv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_sptrsv(); \ + test_sptrsv_streams(); \ } #include From 89d67ff14ec9641a09c9a2438a56a1d815a43552 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Mon, 8 May 2023 14:10:45 -0600 Subject: [PATCH 349/442] Apply clang format --- .../KokkosSparse_sptrsv_cuSPARSE_impl.hpp | 8 ++-- sparse/unit_test/Test_Sparse_sptrsv.hpp | 45 +++++++++++-------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 6469bfe9a2..7605f03fa2 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -476,11 +476,11 @@ void sptrsvcuSPARSE_solve_streams( const bool is_idx_type_supported = std::is_same::value || std::is_same::value; - if constexpr(!is_cuda_space) { + if constexpr (!is_cuda_space) { throw std::runtime_error( "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED " "IN GPU DEVICE for CUSPARSE\n"); - } else if constexpr(!is_idx_type_supported) { + } else if constexpr (!is_idx_type_supported) { throw std::runtime_error( 
"CUSPARSE requires local ordinals to be integer (32 bits or 64 " "bits).\n"); @@ -530,11 +530,11 @@ void sptrsvcuSPARSE_solve_streams( std::is_same::value || std::is_same::value; - if constexpr(!is_cuda_space) { + if constexpr (!is_cuda_space) { throw std::runtime_error( "KokkosKernels sptrsvcuSPARSE_solve_streams: MEMORY IS NOT ALLOCATED " "IN GPU DEVICE for CUSPARSE\n"); - } else if constexpr(!std::is_same::value) { + } else if constexpr (!std::is_same::value) { throw std::runtime_error( "CUSPARSE requires local ordinals to be integer.\n"); } else { diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 7b4cc3181a..ec929a064b 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -1052,9 +1052,9 @@ void run_test_sptrsv() { template void run_test_sptrsv_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; using RowMapType_hostmirror = typename RowMapType::HostMirror; using EntriesType_hostmirror = typename EntriesType::HostMirror; using ValuesType_hostmirror = typename ValuesType::HostMirror; @@ -1076,12 +1076,12 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { else if (nstreams == 3) instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else // (nstreams == 4) + else // (nstreams == 4) instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); + std::vector kh_ptr_v(nstreams); std::vector row_map_v(nstreams); std::vector entries_v(nstreams); std::vector values_v(nstreams); @@ -1134,10 +1134,10 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { Kokkos::deep_copy(known_lhs, ONE); // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); + 
lhs_v[i] = ValuesType("lhs", nrows); // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); + rhs_v[i] = ValuesType("rhs", nrows); crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], entries_v[i]); @@ -1146,7 +1146,7 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { Kokkos::fence(); // Create handle - kh_v[i] = KernelHandle(); + kh_v[i] = KernelHandle(); bool is_lower_tri = false; if (test_algo == 0) kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, @@ -1161,19 +1161,23 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { kh_ptr_v[i] = &kh_v[i]; // Symbolic phase - sptrsv_symbolic( kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i] ); + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); Kokkos::fence(); } // Done handle creation and sptrsv_symbolic on all streams // Solve phase - sptrsv_solve_streams( instances, kh_ptr_v, row_map_v, entries_v, values_v, rhs_v, lhs_v ); + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); for (int i = 0; i < nstreams; i++) instances[i].fence(); // Checking for (int i = 0; i < nstreams; i++) { scalar_t sum = 0.0; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, lhs_v[i].extent(0)), ReductionCheck(lhs_v[i]), sum); + Kokkos::parallel_reduce( + Kokkos::RangePolicy( + 0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); if (sum != lhs_v[i].extent(0)) { std::cout << "Upper Tri Solve FAILURE on stream " << i << std::endl; kh_v[i].get_sptrsv_handle()->print_algorithm(); @@ -1226,10 +1230,10 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { Kokkos::deep_copy(known_lhs, ONE); // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); + lhs_v[i] = ValuesType("lhs", nrows); // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); + rhs_v[i] = ValuesType("rhs", nrows); crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], 
row_map_v[i], entries_v[i]); @@ -1238,7 +1242,7 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { Kokkos::fence(); // Create handle - kh_v[i] = KernelHandle(); + kh_v[i] = KernelHandle(); bool is_lower_tri = true; if (test_algo == 0) kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, @@ -1253,19 +1257,23 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { kh_ptr_v[i] = &kh_v[i]; // Symbolic phase - sptrsv_symbolic( kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i] ); + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); Kokkos::fence(); } // Done handle creation and sptrsv_symbolic on all streams // Solve phase - sptrsv_solve_streams( instances, kh_ptr_v, row_map_v, entries_v, values_v, rhs_v, lhs_v ); + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); for (int i = 0; i < nstreams; i++) instances[i].fence(); // Checking for (int i = 0; i < nstreams; i++) { scalar_t sum = 0.0; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0, lhs_v[i].extent(0)), ReductionCheck(lhs_v[i]), sum); + Kokkos::parallel_reduce( + Kokkos::RangePolicy( + 0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); if (sum != lhs_v[i].extent(0)) { std::cout << "Lower Tri Solve FAILURE on stream " << i << std::endl; kh_v[i].get_sptrsv_handle()->print_algorithm(); @@ -1308,7 +1316,8 @@ void test_sptrsv_streams() { Test::run_test_sptrsv_streams(1, 4); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - if (std::is_same::value && std::is_same::value) { + if (std::is_same::value && + std::is_same::value) { std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; Test::run_test_sptrsv_streams(2, 2); From 5b1c1f4fa8446d4cd057434847bb95410fb090e5 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 8 May 2023 14:49:10 -0700 Subject: [PATCH 350/442] Remove unused variable --- sparse/src/KokkosSparse_sptrsv.hpp | 1 - 1 file changed, 1 deletion(-) diff --git 
a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index fe227ac9b7..859918c58d 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -567,7 +567,6 @@ void sptrsv_solve_streams(const std::vector &execspace_v, x_i_v[i] = x_v[i]; } - auto sptrsv_handle = handle_v[0]->get_sptrsv_handle(); if (handle_v[0]->get_sptrsv_handle()->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { // NOTE: assume all streams use the same SPTRSV_CUSPARSE algo. From af4688919183e4ad42fe04b10ec3857f531ed11a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 9 May 2023 08:11:03 -0600 Subject: [PATCH 351/442] Ger: adding documentation stubs in apidocs --- docs/developer/apidocs/blas2.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst index 0947eeadda..20dbc5ea9a 100644 --- a/docs/developer/apidocs/blas2.rst +++ b/docs/developer/apidocs/blas2.rst @@ -5,3 +5,8 @@ gemv ---- .. doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) .. doxygenfunction:: KokkosBlas::gemv(const execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) + +ger +---- +.. doxygenfunction:: KokkosBlas::ger(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) +.. 
doxygenfunction:: KokkosBlas::ger(const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) From 69d0a8b5b047610acf7d1ca5307bc9c5e2d0b732 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 10 May 2023 09:51:31 -0600 Subject: [PATCH 352/442] Add BsrMatrix SpMV in rocSparse TPL, rewrite BsrMatrix SpMV unit tests (#1769) * masks unused variable in Test_Sparse_rocsparse.hpp * add BsrMatrix::block_layout * BsrMatrix: fill row map even if num rows is 0 * add KokkosSparse::Impl::bsr_to_crs * Add KokkosKernels::Impl::always_false * add KokkosSparse::Impl::{expand, blocked}_crs_to_bsr * stricter kokkos_to_rocsparse_type in KokkosSparse_Utils_rocsparse.hpp * BsrMatrix: improve checks for bad ncols/nrows * Assert BsrMatrix MemoryTraits is a memory trait * Add rocSparse TPL for BsrMatrix SpMV, rewrite BsrMatrix SpMV unit tests. * remove unused code, only retrieve nnz from RandCsMatrix * RandCsMat: remove fences * use KokkosSparse::Impl::graph_max_degree for max row length * use Impl::getRandomBounds in Bsr SpMV unit tests * Simplify 0x0, non-zero imaginary parts in Bsr SpMV unit tests * TestUtils: non-mangled exec space name * don't provide ordinal type as size type --- common/src/KokkosKernels_AlwaysFalse.hpp | 39 + sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp | 147 +++ sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp | 124 ++ .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 22 +- sparse/src/KokkosSparse_BsrMatrix.hpp | 32 +- sparse/src/KokkosSparse_Utils_rocsparse.hpp | 16 +- sparse/src/KokkosSparse_spmv.hpp | 8 + ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 63 + ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 240 +++- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 2 + .../Test_Sparse_TestUtils_RandCsMat.hpp | 6 +- sparse/unit_test/Test_Sparse_ccs2crs.hpp | 9 +- sparse/unit_test/Test_Sparse_crs2ccs.hpp | 9 +- sparse/unit_test/Test_Sparse_crs2coo.hpp | 10 +- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 
1067 +++++++++-------- test_common/KokkosKernels_TestUtils.hpp | 50 +- 16 files changed, 1279 insertions(+), 565 deletions(-) create mode 100644 common/src/KokkosKernels_AlwaysFalse.hpp create mode 100644 sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp create mode 100644 sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp diff --git a/common/src/KokkosKernels_AlwaysFalse.hpp b/common/src/KokkosKernels_AlwaysFalse.hpp new file mode 100644 index 0000000000..36f4572d29 --- /dev/null +++ b/common/src/KokkosKernels_AlwaysFalse.hpp @@ -0,0 +1,39 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_ALWAYSFALSE_HPP +#define KOKKOSKERNELS_ALWAYSFALSE_HPP + +#include + +/*! \file KokkosKernels_AlwaysFalse.hpp + \brief A convenience type to be used in a static_assert that should always + fail +*/ + +namespace KokkosKernels { +namespace Impl { + +template +using always_false = std::false_type; + +template +inline constexpr bool always_false_v = always_false::value; + +} // namespace Impl +} // namespace KokkosKernels + +#endif // KOKKOSKERNELS_ALWAYSFALSE_HPP diff --git a/sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp b/sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp new file mode 100644 index 0000000000..7c232fc6ab --- /dev/null +++ b/sparse/impl/KokkosSparse_bsr_to_crs_impl.hpp @@ -0,0 +1,147 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_BSR_TO_CRS_IMPL_HPP +#define KOKKOSSPARSE_BSR_TO_CRS_IMPL_HPP + +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +namespace KokkosSparse { + +namespace Impl { + +/*! \brief Create an equivalent point matrix from a Bsr matrix + The Crs and Bsr matrix do not have to be on the same device +*/ +template +Crs bsr_to_crs(const Bsr &bsr) { + using crs_device_type = typename Crs::device_type; + using crs_values_type = typename Crs::values_type; + using crs_index_type = typename Crs::index_type; + using crs_ordinal_type = typename Crs::non_const_ordinal_type; + using crs_scalar_type = typename Crs::non_const_value_type; + using crs_size_type = typename Crs::non_const_size_type; + + using crs_row_map_type = + Kokkos::View; + using bsr_ordinal_type = typename Bsr::non_const_ordinal_type; + + using bsr_size_type = typename Bsr::non_const_size_type; + + // determine what some output matrix parameter will be + const size_t bs = bsr.blockDim(); + const crs_ordinal_type crsNumRows = bsr.numRows() * bs; + const crs_ordinal_type crsNumCols = bsr.numCols() * bs; + const crs_size_type crsNnz = bsr.nnz() * bs * bs; + + // clone Bsr row map to host memory space + auto bRows = Kokkos::create_mirror_view(bsr.graph.row_map); + auto bInds = Kokkos::create_mirror_view(bsr.graph.entries); + auto bVals = Kokkos::create_mirror_view(bsr.values); + Kokkos::deep_copy(bRows, bsr.graph.row_map); + Kokkos::deep_copy(bInds, bsr.graph.entries); + Kokkos::deep_copy(bVals, bsr.values); + + using Entry = + std::pair; // 
{column, value} + using Row = std::vector; // all entries in a row + std::map rows; // entries in each row + + // sort entries in a row by column + auto by_col = [](const Entry &a, const Entry &b) { + return a.first < b.first; + }; + + // Convert BSR data into CRS rows + for (bsr_ordinal_type bRow = 0; bRow < bsr_ordinal_type(bsr.numRows()); + ++bRow) { + for (bsr_size_type bColIdx = bRows(bRow); bColIdx < bRows(bRow + 1); + ++bColIdx) { + const crs_ordinal_type bCol = bInds(bColIdx); + + // add all points in this block + for (bsr_size_type lr = 0; lr < bsr_size_type(bs); ++lr) { + const crs_ordinal_type cRow = bRow * bs + lr; + for (bsr_size_type lc = 0; lc < bsr_size_type(bs); ++lc) { + const crs_size_type cvi = bColIdx * bs * bs + lr * bs + lc; + const crs_ordinal_type cCol = bCol * bs + lc; + const crs_scalar_type cVal = bVals(cvi); + auto entry = std::make_pair(cCol, cVal); + + auto it = rows.find(cRow); + if (it == rows.end()) { + Row newRow; + newRow.push_back(entry); + rows[cRow] = newRow; + } else { + it->second.push_back(entry); + } + } + } + } + } + + // device and host views of Crs data + crs_row_map_type devCrsRows("crs row map", crsNumRows + 1); + crs_index_type devCrsIdx("crs columns", crsNnz); + crs_values_type devCrsVals("crs values", crsNnz); + auto hostCrsRows = Kokkos::create_mirror_view(devCrsRows); + auto hostCrsIdx = Kokkos::create_mirror_view(devCrsIdx); + auto hostCrsVals = Kokkos::create_mirror_view(devCrsVals); + + // convert to Crs format + crs_ordinal_type iRowMap = 0; + crs_size_type nentries = 0; + for (auto &kv : rows) { // iterating through rows in order + const crs_ordinal_type &row = kv.first; // block's position + Row &entries = kv.second; // non-zeros in the block + + // update row map if we've moved to a new row + for (; iRowMap < row; ++iRowMap) { + hostCrsRows(iRowMap + 1) = nentries; // row ends at entries so far + } + + // make sure crs points in each row are sorted by column + std::sort(entries.begin(), entries.end(), 
by_col); + + // add columns and values to Crs data + for (size_t i = 0; i < entries.size(); ++i, ++nentries) { + hostCrsIdx(nentries) = entries[i].first; + hostCrsVals(nentries) = entries[i].second; + } + } + // complete row map if last blocks are empty + for (; iRowMap < crsNumRows; ++iRowMap) { + hostCrsRows(iRowMap + 1) = nentries; + } + + // move to device + Kokkos::deep_copy(devCrsRows, hostCrsRows); + Kokkos::deep_copy(devCrsIdx, hostCrsIdx); + Kokkos::deep_copy(devCrsVals, hostCrsVals); + + // construct the resulting Crs matrix + Crs crs("", crsNumRows, crsNumCols, crsNnz, devCrsVals, devCrsRows, + devCrsIdx); + return crs; +} // bsr_to_crs + +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSSPARSE_BSR_TO_CRS_IMPL_HPP \ No newline at end of file diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp new file mode 100644 index 0000000000..8e4c187b99 --- /dev/null +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -0,0 +1,124 @@ +namespace KokkosSparse { + +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +namespace Impl { + +/*! 
\brief Expand each entry of a crs matrix to a block in a bsr matrix + The scalar, ordinal, and device types of the two matrices do not need + to be compatible +*/ +template +Bsr expand_crs_to_bsr(const Crs &crs, size_t blockSize) { + using bsr_device_type = typename Bsr::device_type; + using bsr_execution_space = typename Bsr::execution_space; + + using crs_values_type = typename Crs::values_type; + using bsr_values_type = typename Bsr::values_type; + + using crs_index_type = typename Crs::index_type; + using bsr_index_type = typename Bsr::index_type; + + using crs_row_map_type = typename Crs::row_map_type; + using bsr_row_map_type = + Kokkos::View; + + // construct the Bsr row map + bsr_row_map_type bsrRowMap("bsrRowMap", crs.graph.row_map.size()); + { + // clone Crs row map in Bsr memory space + Kokkos::View + crows("crows", crs.graph.row_map.size()); + Kokkos::deep_copy(crows, crs.graph.row_map); + + // copy to actual row map + Kokkos::RangePolicy policy(0, + crs.graph.row_map.size()); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(size_t i) { bsrRowMap(i) = crows(i); }); + } + + // construct the BSR col indices + bsr_index_type bsrIndices("bsrIndices", crs.graph.entries.size()); + { + // clone Crs column indices in Bsr memory space + Kokkos::View + cinds("cinds", crs.graph.entries.size()); + Kokkos::deep_copy(cinds, crs.graph.entries); + + // copy to actual column indices + Kokkos::RangePolicy policy(0, + crs.graph.entries.size()); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(size_t i) { bsrIndices(i) = cinds(i); }); + } + + // construct BSR values + bsr_values_type bsrVals("bsrVals", crs.nnz() * blockSize * blockSize); + { + // clone Crs values in Bsr memory space + Kokkos::View + cvals("cvals", crs.values.size()); + Kokkos::deep_copy(cvals, crs.values); + + // copy to actual values: crs entry i fills all blockSize * blockSize values of block i + Kokkos::RangePolicy policy(0, crs.values.size()); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(size_t i) { + for (size_t ii = i * blockSize * blockSize; ii < (i + 1) * blockSize * blockSize; ++ii) { + bsrVals(ii) = 
cvals(i); + } + }); + } + + Bsr bsr("", crs.numRows(), crs.numCols(), crs.nnz(), bsrVals, bsrRowMap, + bsrIndices, blockSize); + return bsr; +} // expand_crs_to_bsr + +/*! \brief convert a crs already in block format to a Bsr matrix + */ +template +Bsr blocked_crs_to_bsr(const Crs &crs, size_t blockSize) { + using bsr_value_type = typename Bsr::value_type; + using bsr_ordinal_type = typename Bsr::ordinal_type; + + // copy matrix data to host + auto hRowMap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.row_map); + auto hColInds = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + crs.graph.entries); + auto hVals = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), crs.values); + Kokkos::fence(); + + // construct COO data on host + std::vector vals; + std::vector rows, cols; + + vals.reserve(crs.nnz()); + rows.reserve(crs.nnz()); + cols.reserve(crs.nnz()); + + for (bsr_ordinal_type row = 0; row < bsr_ordinal_type(hRowMap.size()) - 1; + ++row) { + for (size_t ci = hRowMap(row); ci < hRowMap(row + 1); ++ci) { + bsr_ordinal_type col = hColInds(ci); + bsr_value_type val = hVals(ci); + + rows.push_back(row); + cols.push_back(col); + vals.push_back(val); + } + } + + Bsr bsr("", crs.numRows(), crs.numCols(), crs.nnz(), vals.data(), rows.data(), + cols.data(), blockSize); + return bsr; +} // blocked_crs_to_bsr + +} // namespace Impl +} // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index ef5c2e0684..e403ee6b20 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -150,14 +150,13 @@ struct SPMV_BSRMATRIX::value, "BsrMatrix requires that OrdinalType is a signed integer type."); + static_assert(Kokkos::is_memory_traits_v || + std::is_void_v, + "BsrMatrix: MemoryTraits (4th template param) must be a Kokkos " + "MemoryTraits or void"); private: typedef @@ -389,6 +393,11 @@ class 
BsrMatrix { //! Nonconst version of the type of the entries in the sparse matrix. typedef typename values_type::non_const_value_type non_const_value_type; + // block values are actually a 1-D view, however they are implicitly + // arranged in LayoutRight, e.g. consecutive entries in the values view + // are consecutive entries within a row inside a block + using block_layout = Kokkos::LayoutRight; + /// \name Storage of the actual sparsity structure and values. /// /// BsrMatrix uses the compressed sparse row (CSR) storage format to @@ -489,12 +498,19 @@ class BsrMatrix { KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if ((ncols % blockDim_ != 0) || (nrows % blockDim_ != 0)) { - assert( - (ncols % blockDim_ == 0) && - "BsrMatrix: input CrsMatrix columns is not a multiple of block size"); - assert((nrows % blockDim_ == 0) && - "BsrMatrix: input CrsMatrix rows is not a multiple of block size"); + if (ncols % blockDim_) { + std::ostringstream os; + os << "BsrMatrix: " << ncols + << " input CrsMatrix columns is not a multiple of block size " + << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + if (nrows % blockDim_) { + std::ostringstream os; + os << "BsrMatrix: " << nrows + << " input CrsMatrix rows is not a multiple of block size " + << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); } if (annz % (blockDim_ * blockDim_)) { throw std::runtime_error( @@ -597,8 +613,8 @@ class BsrMatrix { ++bi; } // complete row map if last blocks are empty - for (; row < numRows; ++row) { - row_map_host(row + 1) = bi; + for (; row < numRows + 1; ++row) { + row_map_host(row) = bi; } // move graph data to the requested device diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index b146aff782..e263dfd0fa 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -17,6 +17,8 @@ #ifndef _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP 
#define _KOKKOSKERNELS_SPARSEUTILS_ROCSPARSE_HPP +#include + #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #include #include "rocsparse/rocsparse.h" @@ -150,21 +152,29 @@ inline rocsparse_datatype rocsparse_compute_type>() { return rocsparse_datatype_f64_c; } -template -struct kokkos_to_rocsparse_type { - using type = Scalar; +template +struct kokkos_to_rocsparse_type; + +// for floats, rocsparse uses c++ builtin types +template +struct kokkos_to_rocsparse_type>> { + using type = T; }; +// translate complex float template <> struct kokkos_to_rocsparse_type> { using type = rocsparse_float_complex; }; +// translate complex double template <> struct kokkos_to_rocsparse_type> { using type = rocsparse_double_complex; }; +// e.g. 5.4 -> 50400 #define KOKKOSSPARSE_IMPL_ROCM_VERSION \ ROCM_VERSION_MAJOR * 10000 + ROCM_VERSION_MINOR * 100 + ROCM_VERSION_PATCH diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index 8ec6635ffb..c18c0bfeb4 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -349,6 +349,14 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], } #endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + // rocSparse does not support the modes (C), (T), (H) + if constexpr (std::is_same_v) { + useFallback = useFallback || (mode[0] != NoTranspose[0]); + } +#endif + if (useFallback) { // Explicitly call the non-TPL SPMV_BSRMATRIX implementation std::string label = diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 97a9790385..3ce22c630a 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -248,6 +248,69 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + +#include "KokkosSparse_Utils_rocsparse.hpp" + +#define 
KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +// These things may also be valid before 5.4, but I haven't tested it. +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 + +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(float, rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(double, rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(float, rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(double, rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, + rocsparse_int, + rocsparse_int, + Kokkos::LayoutRight, + Kokkos::HIPSpace) + +#endif // KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 + +#undef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE + +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + } // namespace Impl } // namespace Experimental } // namespace KokkosSparse diff --git 
a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 0e0fe463a5..cc3e2a6b1e 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -17,6 +17,7 @@ #ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#include "KokkosKernels_AlwaysFalse.hpp" #include "KokkosKernels_Controls.hpp" #include "KokkosSparse_Utils_mkl.hpp" @@ -421,7 +422,7 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, } // namespace Experimental } // namespace KokkosSparse -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -525,7 +526,7 @@ void spmv_block_impl_cusparse( } KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) } // Reference @@ -542,7 +543,7 @@ void spmv_block_impl_cusparse( // -> t(t(C)) = t(A * t(B)) + t(t(C)) // -> C = t(t(B)) * t(A) + C // -> C = B * t(A) + C -// This is impossible in cuSparse without explicitly transposing C, +// This is impossible in cuSparse without explicitly transposing A, // so we just do not support LayoutRight in cuSparse TPL now // template < @@ -647,7 +648,7 @@ void spm_mv_block_impl_cusparse( } KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) } #define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ @@ -735,7 +736,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif // 9000 <= CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) #undef KOKKOSSPARSE_SPMV_CUSPARSE @@ -803,7 +804,7 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, 
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::CudaUVMSpace, false) -#endif // 9000 <= CUDA_VERSION +#endif // (9000 <= CUDA_VERSION) #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE @@ -813,4 +814,231 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +// -------------------- +// rocSparse +// -------------------- +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + +#include + +#include "KokkosSparse_Utils_rocsparse.hpp" + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +template +void spmv_block_impl_rocsparse( + const KokkosKernels::Experimental::Controls& controls, const char mode[], + typename YVector::non_const_value_type const& alpha, const AMatrix& A, + const XVector& x, typename YVector::non_const_value_type const& beta, + const YVector& y) { + /* + rocm 5.4.0 rocsparse_*bsrmv reference: + https://rocsparse.readthedocs.io/en/rocm-5.4.0/usermanual.html#rocsparse-bsrmv-ex + + only trans = rocsparse_operation_none is supported + only descr = rocsparse_matrix_type_general is supported + + */ + + using offset_type = typename AMatrix::non_const_size_type; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + using rocsparse_value_type = + typename KokkosSparse::Impl::kokkos_to_rocsparse_type::type; + + // assert ordinals and offsets are the expected types + static_assert(std::is_same_v, + "A offset_type must be rocsparse_int"); + static_assert(std::is_same_v, + "A ordinal_type must be rocsparse_int"); + + // assert all operands are the same type + using x_value_type = typename XVector::non_const_value_type; + using y_value_type = typename YVector::non_const_value_type; + static_assert(std::is_same_v, + "A and x must have same value type"); + static_assert(std::is_same_v, + "A and y must have same value type"); + + // assert X and Y are non-stride (pass raw pointers to TPL) + static_assert( + !std::is_same_v, 
+ "x must be contiguous"); + static_assert( + !std::is_same_v, + "y must be contiguous"); + + // assert BSR data is non-stride (pass raw pointers to TPL) + static_assert(!std::is_same_v, + "A values must be contiguous"); + static_assert(!std::is_same_v, + "A row_map must be contiguous"); + static_assert(!std::is_same_v, + "A entries must be contiguous"); + + rocsparse_handle handle = controls.getRocsparseHandle(); + + // set the mode + rocsparse_operation trans; + switch (toupper(mode[0])) { + case 'N': trans = rocsparse_operation_none; break; + default: { + std::stringstream ss; + ss << "Mode " << mode << " invalid for rocsparse_[*]bsrmv\n"; + throw std::invalid_argument(ss.str()); + } + } + + /* + Specify the matrix direction. + The rocsparse_direction indicates whether a dense matrix should be parsed by + rows or by columns, assuming column-major storage. Values: enumerator + rocsparse_direction_row Parse the matrix by rows. enumerator + rocsparse_direction_column Parse the matrix by columns. 
+ */ + // KokkosSparse Bsr matrix blocks are layoutright (row-major) + static_assert( + std::is_same_v, + "A blocks must be stored layout-right"); + rocsparse_direction dir = rocsparse_direction_row; + + const rocsparse_int mb = rocsparse_int(A.numRows()); // number of block rows + const rocsparse_int nb = rocsparse_int(A.numCols()); // number of block cols + const rocsparse_int nnzb = + rocsparse_int(A.nnz()); // number of non-zero blocks + const rocsparse_value_type* alpha_ = + reinterpret_cast(&alpha); + + const rocsparse_value_type* bsr_val = + reinterpret_cast(A.values.data()); + const rocsparse_int* bsr_row_ptr = A.graph.row_map.data(); + const rocsparse_int* bsr_col_ind = A.graph.entries.data(); + const rocsparse_int block_dim = rocsparse_int(A.blockDim()); + const rocsparse_value_type* x_ = + reinterpret_cast(x.data()); + const rocsparse_value_type* beta_ = + reinterpret_cast(&beta); + rocsparse_value_type* y_ = reinterpret_cast(y.data()); + + rocsparse_mat_descr descr; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr)); + rocsparse_mat_info info; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info)); + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis( + handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, info, x_, beta_, y_)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } + rocsparse_destroy_mat_descr(descr); + rocsparse_destroy_mat_info(info); + +} // spmv_block_impl_rocsparse + +#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = BsrMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + 
static void spmv_bsrmatrix(const Controls& controls, const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_block_impl_rocsparse(controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSSPARSE_SPMV_ROCSPARSE(float, rocsparse_int, rocsparse_int, + Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(float, rocsparse_int, rocsparse_int, + Kokkos::LayoutRight, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(double, rocsparse_int, rocsparse_int, + Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(double, rocsparse_int, rocsparse_int, + Kokkos::LayoutRight, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutRight, + Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); +KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, + rocsparse_int, Kokkos::LayoutRight, + Kokkos::HIPSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY); + +#undef KOKKOSSPARSE_SPMV_ROCSPARSE + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + #endif // KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp 
b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 7d14e304d7..11570c0d29 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -506,6 +506,8 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#undef KOKKOSSPARSE_SPMV_ROCSPARSE + } // namespace Impl } // namespace KokkosSparse #endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE diff --git a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp index aa6d938684..279f4f89f9 100644 --- a/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp +++ b/sparse/unit_test/Test_Sparse_TestUtils_RandCsMat.hpp @@ -19,11 +19,11 @@ namespace Test { template void doCsMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { - auto expected_min = ScalarType(1.0); - int64_t expected_nnz = 0; + auto expected_min = ScalarType(1.0); + size_t expected_nnz = 0; RandCsMatrix cm(m, n, min_val, max_val); - for (int64_t i = 0; i < cm.get_nnz(); ++i) + for (size_t i = 0; i < cm.get_nnz(); ++i) ASSERT_GE(cm(i), expected_min) << cm.info; auto map_d = cm.get_map(); diff --git a/sparse/unit_test/Test_Sparse_ccs2crs.hpp b/sparse/unit_test/Test_Sparse_ccs2crs.hpp index 902bf41b4f..56972b8a07 100644 --- a/sparse/unit_test/Test_Sparse_ccs2crs.hpp +++ b/sparse/unit_test/Test_Sparse_ccs2crs.hpp @@ -24,6 +24,9 @@ template ; RandCrsMatType crsMat(m, n, min_val, max_val, m == 0 || n == 0); - using CrsOT = typename RandCrsMatType::IdViewTypeD::value_type; - using CrsType = typename KokkosSparse::CrsMatrix; - auto map = crsMat.get_map(); - auto ids = crsMat.get_ids(); + using CrsOT = typename RandCrsMatType::IdViewTypeD::value_type; + using CrsType = + typename KokkosSparse::CrsMatrix; + auto map = crsMat.get_map(); + auto ids = crsMat.get_ids(); CrsType crsMatrix("doCrs2Coo", crsMat.get_dim1(), crsMat.get_dim2(), 
crsMat.get_nnz(), crsMat.get_vals(), map, ids); diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index ccbcb21301..36237785b2 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -14,563 +14,629 @@ // //@HEADER +/*! \file Test_Sparse_spmv_bsr.hpp + + Test the following 768 combos for at least a few matrices. + + Algorithms Alpha Beta Block Sizes Modes + (none) 0 0 1 N + native x 1 x 1 x 2 x T + experimental_bsr_tc -1 -1 5 C + 3.7 -1.5 9 H + + There is also a subset of tests on larger matrices +*/ + #include +#include +#include + #include #include -#include -#include "KokkosSparse_spmv.hpp" -#include "KokkosSparse_BsrMatrix.hpp" -#include "KokkosSparse_CrsMatrix.hpp" #include #include #include #include - #include "KokkosKernels_Controls.hpp" #include "KokkosKernels_default_types.hpp" -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_crs_to_bsr_impl.hpp" +#include "KokkosSparse_bsr_to_crs_impl.hpp" +#include "KokkosSparse_Utils.hpp" + +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; -namespace Test_Bsr { +namespace Test_Spmv_Bsr { -/// Random generator -template -inline Scalar random() { - auto const max = static_cast(RAND_MAX) + static_cast(1); - return static_cast(std::rand()) / max; +/*! \brief Maximum value used to fill A */ +template +constexpr T max_a() { + T discard, maxVal; + KokkosKernels::Impl::getRandomBounds(10.0, discard, maxVal); + return maxVal; } -template -inline void set_random_value(Scalar &v) { - v = random(); +/*! 
\brief Maximum value used to fill X */ +template +constexpr T max_x() { + T discard, maxVal; + KokkosKernels::Impl::getRandomBounds(10.0, discard, maxVal); + return maxVal; } -template -inline void set_random_value(Kokkos::complex &v) { - Scalar vre = random(); - Scalar vim = random(); - v = Kokkos::complex(vre, vim); +/*! \brief Maximum value used to fill Y */ +template +constexpr T max_y() { + T discard, maxVal; + KokkosKernels::Impl::getRandomBounds(10.0, discard, maxVal); + return maxVal; } -template -inline void set_random_value(std::complex &v) { - Scalar vre = random(); - Scalar vim = random(); - v = std::complex(vre, vim); +/*! \brief whether the mode transposes the matrix*/ +inline bool mode_is_transpose(const char *mode) { + return mode[0] == 'T' || mode[0] == 'H'; } -/// \brief Routine to make CRS-style entries of the block matrix -/// -/// \tparam scalar_t Template type for the numerical values -/// \param mat_b1 Sparse matrix whose graph will be used -/// \param blockSize Block size for each entries -/// \param mat_rowmap[out] CRS-style row map for the block matrix -/// \param mat_colidx[out] CRS-style column entries for the block matrix -/// \param mat_val[out] Numerical (random) values -template -void make_block_entries( - const KokkosSparse::CrsMatrix &mat_b1, - int blockSize, rowmap_type &mat_rowmap, colidx_type &mat_colidx, - values_type &mat_val) { - size_t nnz = static_cast(blockSize) * static_cast(blockSize) * - mat_b1.nnz(); - - for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); - - // - // Create graph for CrsMatrix - // - - for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const size_type jbeg = mat_b1.graph.row_map(ir); - const size_type jend = mat_b1.graph.row_map(ir + 1); - for (lno_t ib = 0; ib < blockSize; ++ib) { - const lno_t my_row = ir * blockSize + ib; - mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (size_type ijk = jbeg; ijk < jend; ++ijk) { - const auto col0 = 
mat_b1.graph.entries(ijk); - for (lno_t jb = 0; jb < blockSize; ++jb) { - mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = - col0 * blockSize + jb; - } - } - } - } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) +/*! \brief 0x0 matrix */ +template +Bsr bsr_corner_case_0_by_0(const int blockSize) { + return Bsr("empty", 0, 0, 0, nullptr, nullptr, nullptr, blockSize); } -/// \brief Driver routine for checking BsrMatrix times vector -template -void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, - const lno_t bMax, int &num_errors) { - // The mat_structure view is used to generate a matrix using - // finite difference (FD) or finite element (FE) discretization - // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); - mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction - mat_structure(0, 1) = 0; // Add BC to the left - mat_structure(0, 2) = 0; // Add BC to the right - mat_structure(1, 0) = 7; // Request 7 grid point in 'y' direction - mat_structure(1, 1) = 0; // Add BC to the bottom - mat_structure(1, 2) = 0; // Add BC to the top - mat_structure(2, 0) = 9; // Request 9 grid point in 'z' direction - mat_structure(2, 1) = 0; // Add BC to the bottom - mat_structure(2, 2) = 0; // Add BC to the top - - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename KokkosSparse::CrsMatrix - h_crsMat_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - - h_crsMat_t mat_b1 = - Test::generate_structured_matrix3D("FD", mat_structure); - - num_errors = 0; - for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { - // - // Fill blocks with random values - // - - lno_t nRow = blockSize * mat_b1.numRows(); - lno_t nCol = blockSize * mat_b1.numCols(); - size_type nnz = static_cast(blockSize) * - static_cast(blockSize) * mat_b1.nnz(); - - Kokkos::View d_rowmap("crsmatrix", nRow + 1); - auto 
h_rowmap = Kokkos::create_mirror_view(d_rowmap); - - Kokkos::View d_colidx("crsmatrix", nnz); - auto h_colidx = Kokkos::create_mirror_view(d_colidx); - - Kokkos::View d_matval("crsmatrix", nnz); - auto h_matval = Kokkos::create_mirror_view(d_matval); - - // Create the entries - make_block_entries(mat_b1, blockSize, h_rowmap, - h_colidx, h_matval); - - Kokkos::deep_copy(d_matval, h_matval); - Kokkos::deep_copy(d_colidx, h_colidx); - Kokkos::deep_copy(d_rowmap, h_rowmap); - - // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, - d_colidx); - - x_vector_type xref("new_right_hand_side", nRow); - auto h_xref = Kokkos::create_mirror_view(xref); - for (lno_t ir = 0; ir < nRow; ++ir) { - set_random_value(h_xref(ir)); - } - Kokkos::deep_copy(xref, h_xref); +/*! \brief 0x1 matrix */ +template +Bsr bsr_corner_case_0_by_1(const int blockSize) { + return Bsr("empty", 0, blockSize, 0, nullptr, nullptr, nullptr, blockSize); +} - y_vector_type y0("y_init", nRow); - auto h_y0 = Kokkos::create_mirror_view(y0); - for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir)); - Kokkos::deep_copy(y0, h_y0); +/*! 
\brief 1x0 matrix */ +template +Bsr bsr_corner_case_1_by_0(const int blockSize) { + return Bsr("empty", blockSize, 0, 0, nullptr, nullptr, nullptr, blockSize); +} + +template +Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using size_type = typename Bsr::non_const_size_type; + using Crs = + KokkosSparse::CrsMatrix; + using Graph = typename Crs::staticcrsgraph_type; + + // construct a random Crs Matrix + Test::RandCsMatrix + rcs(blockRows, blockCols, scalar_type(0), max_a()); + + const auto colids = + Kokkos::subview(rcs.get_ids(), Kokkos::pair{size_t(0), rcs.get_nnz()}); + const auto vals = + Kokkos::subview(rcs.get_vals(), Kokkos::pair{size_t(0), rcs.get_nnz()}); + Graph graph(colids, rcs.get_map()); + Crs crs("crs", blockCols, vals, graph); + + // expand to Bsr matrix + return KokkosSparse::Impl::expand_crs_to_bsr(crs, blockSize); +} - y_vector_type ycrs("crs_product_result", nRow); - auto h_ycrs = Kokkos::create_mirror_view(ycrs); - for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); - Kokkos::deep_copy(ycrs, h_ycrs); +/*! \brief reference SpMV is the KokkosSparse::spmv on the equivalent point + * matrix + */ +template +void reference_spmv(const char *mode, const Alpha &alpha, const Bsr &a, + const XVector &x, const Beta &beta, const YVector &y) { + using Crs = KokkosSparse::CrsMatrix< + typename Bsr::non_const_value_type, typename Bsr::non_const_ordinal_type, + typename Bsr::device_type, void, typename Bsr::non_const_size_type>; + const Crs crs = KokkosSparse::Impl::bsr_to_crs(a); + + KokkosSparse::spmv(mode, alpha, crs, x, beta, y); +} - // - // Make reference computation with a CrsMatrix variable - // +/*! 
\brief test a specific spmv + +*/ +template +void test_spmv(const char *alg, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const XVector &x, + const YVector &y) { + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + // generate expected result from reference implementation + YVector yExp("yExp", y.extent(0)); + Kokkos::deep_copy(yExp, y); + reference_spmv(mode, alpha, a, x, beta, yExp); + + // scratch space for actual value (don't modify input) + YVector yAct("yAct", y.extent(0)); + Kokkos::deep_copy(yAct, y); + + if (alg) { KokkosKernels::Experimental::Controls controls; - // Use the native implementation since the CUDA 11.2.2 spmv implementation - // is not matching the bsr spmv test tolerance when OFFSET is int. - // See https://github.com/kokkos/kokkos-kernels/issues/1586 -#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (11200 <= CUSPARSE_VERSION) - controls.setParameter("algorithm", "native"); -#endif - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); - - y_vector_type ybsr("bsr_product_result", nRow); - auto h_ybsr = Kokkos::create_mirror_view(ybsr); - for (lno_t ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); - Kokkos::deep_copy(ybsr, h_ybsr); - - // Create the BsrMatrix for the check test - KokkosSparse::Experimental::BsrMatrix - Absr(Acrs, blockSize); - - // - // Make computation with the BsrMatrix format - // - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybsr); - - // - // Compare the two products - // - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; - - const mag_type zero_mag = Kokkos::ArithTraits::zero(); - mag_type error = zero_mag, maxNorm = zero_mag; - - Kokkos::deep_copy(h_ycrs, ycrs); - Kokkos::deep_copy(h_ybsr, ybsr); - for (lno_t ir = 0; ir < nRow; ++ir) { - 
error = std::max(error, KATS::abs(h_ycrs(ir) - h_ybsr(ir))); - maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir))); - } + controls.setParameter("algorithm", alg); + KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + } else { + KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); + } + + // compare yExp and yAct + auto hyExp = Kokkos::create_mirror_view(yExp); + auto hyAct = Kokkos::create_mirror_view(yAct); + Kokkos::deep_copy(hyExp, yExp); + Kokkos::deep_copy(hyAct, yAct); + + // max nnz per row is used for the tolerance + // for a transposed computation, need to transpose the matrix before + // seeing which rows are longest + size_t maxNnzPerRow; + if (mode_is_transpose(mode)) { + auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); + maxNnzPerRow = + at.blockDim() * + KokkosSparse::Impl::graph_max_degree( + at.graph.row_map); + } else { + maxNnzPerRow = + a.blockDim() * + KokkosSparse::Impl::graph_max_degree( + a.graph.row_map); + } + + /* assume that any floating-point op may introduce eps() error + scaling y is one op + dot product of x is two ops per entry (mul and add) - mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); - if ((tmps > zero_mag) && (maxNorm == zero_mag)) { - std::cout << " BSR - SpMV times MV >> blockSize " << blockSize - << " maxNorm " << maxNorm << " error " << error << " alpha " - << alpha << " beta " << beta << "\n"; - num_errors += 1; + 10x means same order of magnitude + */ + const mag_type tolerance = + KATS::eps() * KATS::abs(beta) * KATS::abs(max_y()) + + 10 * KATS::eps() * maxNnzPerRow * KATS::abs(alpha) * + KATS::abs(max_a()) * KATS::abs(max_x()); + + std::vector errIdx; + + for (ordinal_type i = 0; i < ordinal_type(hyAct.extent(0)); ++i) { + if (KATS::abs(hyExp(i) - hyAct(i)) > tolerance) { + errIdx.push_back(i); } + } - // - // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row - // - const mag_type tol = ((static_cast(nnz) / nRow) + 1) * - Kokkos::ArithTraits::epsilon(); - if (error > tol * maxNorm) 
{ - std::cout << " BSR - SpMV times V >> blockSize " << blockSize << " ratio " - << error / maxNorm << " tol " << tol << " maxNorm " << maxNorm - << " alpha " << alpha << " beta " << beta << "\n"; - num_errors += 1; + if (!errIdx.empty()) { + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMV failure!" + << std::endl; + std::cerr << "alg: " << (alg ? alg : "") << std::endl; + std::cerr << "mode: " << mode << std::endl; + std::cerr << "A: " << a.numRows() << "x" << a.numCols() + << std::endl; + std::cerr << "A blockdim: " << a.blockDim() << std::endl; + std::cerr << "alpha: " << alpha << std::endl; + std::cerr << "beta: " << beta << std::endl; + std::cerr << "maxNnzPerRow: " << maxNnzPerRow << std::endl; + std::cerr << "First 100 errors:" << std::endl; + std::cerr << "y\texp\tact\terr\ttol" << std::endl; + std::cerr << "-\t---\t---\t---\t---" << std::endl; + for (size_t i = 0; i < 100 && i < errIdx.size(); ++i) { + size_t ei = errIdx[i]; + // clang-format off + std::cerr << ei + << "\t" << hyExp(ei) + << "\t" << hyAct(ei) + << "\t" << KATS::abs(hyExp(ei) - hyAct(ei)) + << "\t" << tolerance + << std::endl; + // clang-format on } + } + + EXPECT_TRUE(errIdx.empty()); +} + +template +struct VectorTypeFor { + using type = Kokkos::View; +}; + +template +std::tuple::type, + typename VectorTypeFor::type> +spmv_corner_case_0_by_0(const char * /*mode*/, const int blockSize) { + using vector_type = typename VectorTypeFor::type; + Bsr a = bsr_corner_case_0_by_0(blockSize); + vector_type x("x", 0); + vector_type y("y", 0); + return std::make_tuple(a, x, y); +} + +template +std::tuple::type, + typename VectorTypeFor::type> +spmv_corner_case_0_by_1(const char *mode, const int blockSize) { + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + Bsr a = bsr_corner_case_0_by_1(blockSize); + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * 
a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(a, x, y); +} + +template +std::tuple::type, + typename VectorTypeFor::type> +spmv_corner_case_1_by_0(const char *mode, const int blockSize) { + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + Bsr a = bsr_corner_case_1_by_0(blockSize); + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(a, x, y); +} + +/*! 
\brief + +*/ +template +std::tuple::type, + typename VectorTypeFor::type> +spmv_random(const char *mode, const int blockSize, const int blockRows, + const int blockCols) { + using scalar_type = typename Bsr::non_const_value_type; + + // expand to Bsr matrix + Bsr a = bsr_random(blockSize, blockRows, blockCols); + + // generate some random vectors + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); - } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) + return std::make_tuple(a, x, y); } -/// \brief Driver routine for checking BsrMatrix times multiple vector -template -void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, - const lno_t bMax, int &num_errors) { - // The mat_structure view is used to generate a matrix using - // finite difference (FD) or finite element (FE) discretization - // on a cartesian grid. 
- Kokkos::View mat_structure("Matrix Structure", - 3); - mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction - mat_structure(0, 1) = 0; // Add BC to the left - mat_structure(0, 2) = 0; // Add BC to the right - mat_structure(1, 0) = 5; // Request 11 grid point in 'y' direction - mat_structure(1, 1) = 0; // Add BC to the bottom - mat_structure(1, 2) = 0; // Add BC to the top - mat_structure(2, 0) = 9; // Request 13 grid point in 'y' direction - mat_structure(2, 1) = 0; // Add BC to the bottom - mat_structure(2, 2) = 0; // Add BC to the top - - typedef typename KokkosSparse::CrsMatrix - h_crsMat_t; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef Kokkos::View block_vector_t; - - h_crsMat_t mat_b1 = - Test::generate_structured_matrix3D("FD", mat_structure); - - num_errors = 0; - const int nrhs = 5; - - for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { - // - // Fill blocks with random values - // - - lno_t nRow = blockSize * mat_b1.numRows(); - lno_t nCol = blockSize * mat_b1.numCols(); - size_type nnz = static_cast(blockSize) * - static_cast(blockSize) * mat_b1.nnz(); - - Kokkos::View d_rowmap("crsmatrix", nRow + 1); - auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); - - Kokkos::View d_colidx("crsmatrix", nnz); - auto h_colidx = Kokkos::create_mirror_view(d_colidx); - - Kokkos::View d_matval("crsmatrix", nnz); - auto h_matval = Kokkos::create_mirror_view(d_matval); - - // Create the entries - make_block_entries(mat_b1, blockSize, h_rowmap, - h_colidx, h_matval); - - Kokkos::deep_copy(d_matval, h_matval); - Kokkos::deep_copy(d_colidx, h_colidx); - Kokkos::deep_copy(d_rowmap, h_rowmap); - - // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, - d_colidx); - - block_vector_t xref("new_right_hand_side", nRow, nrhs); - auto h_xref = Kokkos::create_mirror_view(xref); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) 
set_random_value(h_xref(ir, jc)); - Kokkos::deep_copy(xref, h_xref); - - block_vector_t y0("y_init", nRow, nrhs); - auto h_y0 = Kokkos::create_mirror_view(y0); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); - Kokkos::deep_copy(y0, h_y0); - - block_vector_t ycrs("crs_product_result", nRow, nrhs); - auto h_ycrs = Kokkos::create_mirror_view(ycrs); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); - Kokkos::deep_copy(ycrs, h_ycrs); - - // - // Compute the reference product with a CrsMatrix variable - // - KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); - - block_vector_t ybsr("bsr_product_result", nRow, nrhs); - auto h_ybsr = Kokkos::create_mirror_view(ybsr); - for (int jc = 0; jc < nrhs; ++jc) - for (lno_t ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); - Kokkos::deep_copy(ybsr, h_ybsr); - - // Create the BsrMatrix for the check test - KokkosSparse::Experimental::BsrMatrix - Absr(Acrs, blockSize); - - // - // Compute the product with the BsrMatrix format - // - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybsr); - - Kokkos::deep_copy(h_ycrs, ycrs); - Kokkos::deep_copy(h_ybsr, ybsr); - - // - // Compare the two products - // - using KATS = Kokkos::ArithTraits; - using mag_type = typename KATS::mag_type; - - const mag_type zero_mag = Kokkos::ArithTraits::zero(); - mag_type error = zero_mag, maxNorm = zero_mag; - for (int jc = 0; jc < nrhs; ++jc) { - for (int ir = 0; ir < nRow; ++ir) { - error = std::max(error, - KATS::abs(h_ycrs(ir, jc) - h_ybsr(ir, jc))); - maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir, jc))); +/*! 
\brief create random x and y multivectors for a given matrix and spmv mode + */ +template +auto random_vecs_for_spmv(const char *mode, const Bsr &a) { + using scalar_type = typename Bsr::non_const_value_type; + using vector_type = typename VectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx); + vector_type y("y", ny); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(x, y); +} + +/*! \brief test all combos of the provided matrix + */ +template +void test_spmv_combos(const char *mode, const Bsr &a) { + using scalar_type = typename Bsr::non_const_value_type; + + auto [x, y] = random_vecs_for_spmv(mode, a); + + for (auto alg : {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (scalar_type alpha : + {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { + for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), + scalar_type(-1.5)}) { + test_spmv(alg, mode, alpha, beta, a, x, y); } } + } +} - mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); - if ((tmps > zero_mag) && (maxNorm == zero_mag)) { - std::cout << " BSR - SpMV times MV >> blockSize " << blockSize - << " maxNorm " << maxNorm << " error " << error << " alpha " - << alpha << " beta " << beta << "\n"; - num_errors += 1; +/*! 
\brief test all combos of all matrices with different block sizes + */ +template +void test_spmv_corner_cases() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spmv_combos(mode, bsr_corner_case_0_by_0(bs)); + test_spmv_combos(mode, bsr_corner_case_0_by_1(bs)); + test_spmv_combos(mode, bsr_corner_case_1_by_0(bs)); } + } +} - const mag_type tol = ((static_cast(nnz) / nRow) + 1) * - Kokkos::ArithTraits::epsilon(); - if (error > tol * maxNorm) { - std::cout << " BSR - SpMV times MV >> blockSize " << blockSize - << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " - << maxNorm << " alpha " << alpha << " beta " << beta << "\n"; - num_errors += 1; +template +void test_spmv_random() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spmv_combos(mode, bsr_random(bs, 10, 10)); + test_spmv_combos(mode, bsr_random(bs, 10, 50)); + test_spmv_combos(mode, bsr_random(bs, 50, 10)); } + } - } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) + // test a tougher case on a big matrix + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + for (auto mode : {"N", "T"}) { + test_spmv_combos(mode, + bsr_random(blockSizePrime, smallPrime, largePrime)); + } } -} // namespace Test_Bsr - -template -void testSpMVBsrMatrix() { - // - // Check a few corner cases - // - - // 0 x 0 case - { - typedef - typename KokkosSparse::Experimental::BsrMatrix - bsrMat_t; - bsrMat_t Absr("empty", 0, 0, 0, nullptr, nullptr, nullptr, 1); - typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - x_vector_type x("corner-case-x", Absr.numCols()); - y_vector_type y("corner-case-y", Absr.numRows()); - Kokkos::deep_copy(y, static_cast(0)); - scalar_t alpha = static_cast(1); - scalar_t 
beta = static_cast(1); - const char fOp = 'N'; - int num_errors = 0; - try { - KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); - Kokkos::fence(); - } catch (std::exception &e) { - num_errors += 1; - std::cout << e.what(); - } - EXPECT_TRUE(num_errors == 0); +template +void test_spmv() { + test_spmv_corner_cases(); + test_spmv_random(); +} + +// ---------------------------------------------------------------------------- +// Multivector +// ---------------------------------------------------------------------------- + +template +void test_spm_mv(const char *alg, const char *mode, const Alpha &alpha, + const Beta &beta, const Bsr &a, const XVector &x, + const YVector &y) { + using execution_space = typename Bsr::execution_space; + using scalar_type = typename Bsr::non_const_value_type; + using ordinal_type = typename Bsr::non_const_ordinal_type; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + // generate expected result from reference implementation + YVector yExp("yExp", y.extent(0), y.extent(1)); + Kokkos::deep_copy(yExp, y); + reference_spmv(mode, alpha, a, x, beta, yExp); + + // scratch space for actual value (don't modify input) + YVector yAct("yAct", y.extent(0), y.extent(1)); + Kokkos::deep_copy(yAct, y); + + if (alg) { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", alg); + KokkosSparse::spmv(controls, mode, alpha, a, x, beta, yAct); + } else { + KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); + } + + // compare yExp and yAct + auto hyExp = Kokkos::create_mirror_view(yExp); + auto hyAct = Kokkos::create_mirror_view(yAct); + Kokkos::deep_copy(hyExp, yExp); + Kokkos::deep_copy(hyAct, yAct); + + // max nnz per row is used for the tolerance + // for a transposed computation, need to transpose the matrix before + // seeing which rows are longest + size_t maxNnzPerRow; + if (mode_is_transpose(mode)) { + auto at = KokkosSparse::Impl::transpose_bsr_matrix(a); + maxNnzPerRow = + 
at.blockDim() * + KokkosSparse::Impl::graph_max_degree( + at.graph.row_map); + } else { + maxNnzPerRow = + a.blockDim() * + KokkosSparse::Impl::graph_max_degree( + a.graph.row_map); } - // 0 x 1 case - { - typedef - typename KokkosSparse::Experimental::BsrMatrix - bsrMat_t; - bsrMat_t Absr("empty", 0, 1, 0, nullptr, nullptr, nullptr, 1); - typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - x_vector_type x("corner-case-x", Absr.numCols()); - y_vector_type y("corner-case-y", Absr.numRows()); - Kokkos::deep_copy(y, static_cast(0)); - scalar_t alpha = static_cast(1); - scalar_t beta = static_cast(1); - const char fOp = 'N'; - int num_errors = 0; - try { - KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); - Kokkos::fence(); - } catch (std::exception &e) { - num_errors += 1; - std::cout << e.what(); + /* assume that any floating-point op may introduce eps() error + scaling y is one op + dot product of x is two ops per entry (mul and add) + */ + const mag_type tolerance = + KATS::eps() * KATS::abs(beta) * KATS::abs(max_y()) + + 10 * KATS::eps() * maxNnzPerRow * KATS::abs(alpha) * + KATS::abs(max_a()) * KATS::abs(max_x()); + + std::vector> errIdx; + + for (ordinal_type i = 0; i < ordinal_type(hyAct.extent(0)); ++i) { + for (ordinal_type j = 0; j < ordinal_type(hyAct.extent(1)); ++j) { + if (KATS::abs(hyExp(i, j) - hyAct(i, j)) > tolerance) { + errIdx.push_back({i, j}); + } } - EXPECT_TRUE(num_errors == 0); } - // 1 x 0 case - { - typedef - typename KokkosSparse::Experimental::BsrMatrix - bsrMat_t; - bsrMat_t Absr("empty", 1, 0, 0, nullptr, nullptr, nullptr, 1); - typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; - x_vector_type x("corner-case-x", Absr.numCols()); - y_vector_type y("corner-case-y", Absr.numRows()); - Kokkos::deep_copy(y, static_cast(0)); - scalar_t alpha = 
static_cast(1); - scalar_t beta = static_cast(1); - const char fOp = 'N'; - int num_errors = 0; - try { - KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); - Kokkos::fence(); - } catch (std::exception &e) { - num_errors += 1; - std::cout << e.what(); + if (!errIdx.empty()) { + std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMMV failure!" + << std::endl; + std::cerr << "alg: " << (alg ? alg : "") << std::endl; + std::cerr << "mode: " << mode << std::endl; + std::cerr << "A: " << a.numRows() << "x" << a.numCols() + << std::endl; + std::cerr << "A blockdim: " << a.blockDim() << std::endl; + std::cerr << "alpha: " << alpha << std::endl; + std::cerr << "beta: " << beta << std::endl; + std::cerr << "maxNnzPerRow: " << maxNnzPerRow << std::endl; + std::cerr << "First 100 errors:" << std::endl; + std::cerr << "i\tj\texp\tact\terr\ttol" << std::endl; + std::cerr << "-\t-\t---\t---\t---\t---" << std::endl; + for (size_t e = 0; e < 100 && e < errIdx.size(); ++e) { + auto ij = errIdx[e]; + auto i = ij.first; + auto j = ij.second; + // clang-format off + std::cerr << i << "\t" << j + << "\t" << hyExp(i,j) + << "\t" << hyAct(i,j) + << "\t" << KATS::abs(hyExp(i,j) - hyAct(i,j)) + << "\t" << tolerance + << std::endl; + // clang-format on } - EXPECT_TRUE(num_errors == 0); } - // - // Test for the operation y <- alpha * Op(A) * x + beta * y - // - - // Define the function Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = - // A^H - std::vector modes = {'N', 'C', 'T', 'H'}; - - // Define a set of pairs (alpha, beta) - std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, - 0.0, 1.0, 3.1, -2.5}; - - // - // Set the largest block size for the block matrix - // The code will create matrices with block sizes 1, .., bMax - // - constexpr lno_t bMax = 13; - - // - //--- Test single vector case - // - for (const auto mode : modes) { - int num_errors = 0; - for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { - auto alpha_s = static_cast(testAlphaBeta[ii]); - auto beta_s = 
static_cast(testAlphaBeta[ii + 1]); - num_errors = 0; - Test_Bsr::check_bsrm_times_v( - &mode, alpha_s, beta_s, bMax, num_errors); - if (num_errors > 0) { - std::cout << "KokkosSparse::Test::spmv_bsr: " << num_errors - << " errors of %i with params: " << bMax << " " << mode << " " - << Kokkos::ArithTraits::abs(alpha_s) << " " - << Kokkos::ArithTraits::abs(beta_s) << std::endl; + EXPECT_TRUE(errIdx.empty()); +} + +template +struct MultiVectorTypeFor { + using type = Kokkos::View; +}; + +/*! \brief create random x and y multivectors for a given matrix and spmv mode + */ +template +auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, + const size_t numVecs) { + using scalar_type = typename Bsr::non_const_value_type; + using vector_type = typename MultiVectorTypeFor::type; + using execution_space = typename Bsr::execution_space; + + size_t nx = a.numCols() * a.blockDim(); + size_t ny = a.numRows() * a.blockDim(); + if (mode_is_transpose(mode)) { + std::swap(nx, ny); + } + vector_type x("x", nx, numVecs); + vector_type y("y", ny, numVecs); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(x, random, max_x()); + Kokkos::fill_random(y, random, max_y()); + + return std::make_tuple(x, y); +} + +template +void test_spm_mv_combos(const char *mode, const Bsr &a) { + using scalar_type = typename Bsr::non_const_value_type; + + for (size_t numVecs : {1, 2, 7}) { // num multivecs + auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); + for (auto alg : + {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), + scalar_type(3.7)}) { + for (scalar_type beta : {scalar_type(0), scalar_type(1), + scalar_type(-1), scalar_type(-1.5)}) { + test_spm_mv(alg, mode, alpha, beta, a, x, y); + } } - EXPECT_TRUE(num_errors == 0); } } } -template -void testBsrMatrix_SpM_MV() { - // - // Test for the operation Y <- alpha * Op(A) * X + beta * Y - // - - // Define the function 
Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = - // A^H - std::vector modes = {'N', 'C', 'T', 'H'}; - - // Define a set of pairs (alpha, beta) - std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, - 0.0, 1.0, 3.1, -2.5}; - - // - // Set the largest block size for the block matrix - // The code will create matrices with block sizes 1, .., bMax - // - const lno_t bMax = 13; - - //--- Test multiple vector case - for (auto mode : modes) { - int num_errors = 0; - for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { - auto alpha_s = static_cast(testAlphaBeta[ii]); - auto beta_s = static_cast(testAlphaBeta[ii + 1]); - num_errors = 0; - Test_Bsr::check_bsrm_times_mv( - &mode, alpha_s, beta_s, bMax, num_errors); - if (num_errors > 0) { - std::cout << "KokkosSparse::Test::spm_mv_bsr: " << num_errors - << " errors of " << bMax << " with params: " << mode << " " - << Kokkos::ArithTraits::abs(alpha_s) << " " - << Kokkos::ArithTraits::abs(beta_s) << std::endl; - } - EXPECT_TRUE(num_errors == 0); +/*! 
\brief test all combos of all matrices with different block sizes + */ +template +void test_spm_mv_corner_cases() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spm_mv_combos(mode, bsr_corner_case_0_by_0(bs)); + test_spm_mv_combos(mode, bsr_corner_case_0_by_1(bs)); + test_spm_mv_combos(mode, bsr_corner_case_1_by_0(bs)); } } } +template +void test_spm_mv_random() { + using Bsr = KokkosSparse::Experimental::BsrMatrix; + // thoroughly test smaller matrices + for (auto mode : {"N", "T", "C", "H"}) { + for (int bs : {1, 2, 5, 9}) { + test_spm_mv_combos(mode, bsr_random(bs, 10, 10)); + test_spm_mv_combos(mode, bsr_random(bs, 10, 50)); + test_spm_mv_combos(mode, bsr_random(bs, 50, 10)); + } + } + + // test a tougher case on a big matrix + constexpr int blockSizePrime = 7; + constexpr int smallPrime = 11; + constexpr int largePrime = 499; + for (auto mode : {"N", "T"}) { + test_spm_mv_combos( + mode, bsr_random(blockSizePrime, smallPrime, largePrime)); + } +} + +template +void test_spm_mv() { + test_spm_mv_corner_cases(); + test_spm_mv_random(); +} + +} // namespace Test_Spmv_Bsr + ////////////////////////// -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - testSpMVBsrMatrix(); \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##bsr_spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + Test_Spmv_Bsr::test_spmv(); \ } #include @@ -579,11 +645,12 @@ void testBsrMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ - testBsrMatrix_SpM_MV(); \ +#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, 
LAYOUT, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bsr_spmmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + Test_Spmv_Bsr::test_spm_mv(); \ } #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 6f58e25e63..95a3459699 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -638,18 +638,27 @@ class RandCooMat { /// dim2 refers to either columns for a Crs matrix or rows for a Ccs matrix. /// \tparam ScalarType /// \tparam LayoutType -/// \tparam ExeSpaceType -template +/// \tparam Device +template ::size_type> class RandCsMatrix { public: - using ValViewTypeD = Kokkos::View; - using IdViewTypeD = Kokkos::View; - using MapViewTypeD = Kokkos::View; + using value_type = ScalarType; + using array_layout = LayoutType; + using device_type = Device; + using ordinal_type = Ordinal; + using size_type = Size; + using ValViewTypeD = Kokkos::View; + using IdViewTypeD = Kokkos::View; + using MapViewTypeD = Kokkos::View; private: - int64_t __dim2; - int64_t __dim1; - int64_t __nnz = 0; + using execution_space = typename Device::execution_space; + Ordinal __dim2; + Ordinal __dim1; + Size __nnz = 0; MapViewTypeD __map_d; IdViewTypeD __ids_d; ValViewTypeD __vals_d; @@ -668,19 +677,19 @@ class RandCsMatrix { /// 4. 
__map(i) - col_map(i - 1) is in [0, m] void __populate_random_cs_mat(uint64_t ticks) { std::srand(ticks); - for (int64_t col_idx = 0; col_idx < __dim1; col_idx++) { - int64_t r = std::rand() % (__dim2 + 1); + for (Ordinal col_idx = 0; col_idx < __dim1; col_idx++) { + Ordinal r = std::rand() % (__dim2 + 1); if (r == 0 || __fully_sparse) { // 100% sparse vector __map(col_idx) = __nnz; } else { // sparse vector with r elements // Populate r row ids - std::vector v(r); + std::vector v(r); - for (int64_t i = 0; i < r; i++) v.at(i) = i; + for (Ordinal i = 0; i < r; i++) v.at(i) = i; std::shuffle(v.begin(), v.end(), std::mt19937(std::random_device()())); - for (int64_t i = 0; i < r; i++) __ids(i + __nnz) = v.at(i); + for (Ordinal i = 0; i < r; i++) __ids(i + __nnz) = v.at(i); // Point to new column and accumulate number of non zeros __map(col_idx) = __nnz; @@ -699,7 +708,6 @@ class RandCsMatrix { Kokkos::deep_copy( tight_ids, Kokkos::subview(__ids, Kokkos::make_pair(0, static_cast(__nnz)))); - ExeSpaceType().fence(); __ids_d = tight_ids; } @@ -708,7 +716,6 @@ class RandCsMatrix { T dst(std::string("RandCsMatrix.") + typeid(T).name() + " copy", src.extent(0)); Kokkos::deep_copy(dst, src); - ExeSpaceType().fence(); return dst; } @@ -719,7 +726,7 @@ class RandCsMatrix { /// \param dim2 The second dimension: columns for Crs or rows for Ccs /// \param min_val The minimum scalar value in the matrix. /// \param max_val The maximum scalar value in the matrix. 
- RandCsMatrix(int64_t dim1, int64_t dim2, ScalarType min_val, + RandCsMatrix(Ordinal dim1, Ordinal dim2, ScalarType min_val, ScalarType max_val, bool fully_sparse = false) { __dim1 = dim1; __dim2 = dim2; @@ -736,7 +743,7 @@ class RandCsMatrix { info = std::string( std::string("RandCsMatrix<") + typeid(ScalarType).name() + ", " + - typeid(LayoutType).name() + ", " + typeid(ExeSpaceType).name() + ">(" + + typeid(LayoutType).name() + ", " + execution_space().name() + ">(" + std::to_string(dim2) + ", " + std::to_string(dim1) + "...): rand seed: " + std::to_string(ticks) + ", fully sparse: " + (__fully_sparse ? "true" : "false") + "\n"); @@ -751,17 +758,16 @@ class RandCsMatrix { // Copy to device Kokkos::deep_copy(__vals_d, __vals); - ExeSpaceType().fence(); } // O(c), where c is a constant. - ScalarType operator()(int64_t idx) { return __vals(idx); } - int64_t get_nnz() { return __nnz; } + ScalarType operator()(Size idx) { return __vals(idx); } + size_t get_nnz() { return size_t(__nnz); } // dimension2: This is either columns for a Crs matrix or rows for a Ccs // matrix. - int64_t get_dim2() { return __dim2; } + Ordinal get_dim2() { return __dim2; } // dimension1: This is either rows for Crs matrix or columns for a Ccs matrix. 
- int64_t get_dim1() { return __dim1; } + Ordinal get_dim1() { return __dim1; } ValViewTypeD get_vals() { return __getter_copy_helper(__vals_d); } IdViewTypeD get_ids() { return __getter_copy_helper(__ids_d); } MapViewTypeD get_map() { return __getter_copy_helper(__map_d); } From 2bb633d4628f4f362fa5759530fb5ce60fefdc01 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 May 2023 12:30:00 -0600 Subject: [PATCH 353/442] .github/workflows: Summarize github-DOCS errors and warnings docs: Remove unsupported Doxyfile.in tags --- .github/workflows/docs.yml | 16 +++++++++++++++- docs/Doxyfile.in | 8 ++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 70a97119c9..2488790254 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -63,7 +63,21 @@ jobs: - name: build_kokkos_kernels_doxygen working-directory: kokkos-kernels/build - run: make Doxygen + run: | + echo "Redirecting full output to doxygen.out..." + make Doxygen > doxygen.out 2>&1 || true + error_ret=$(grep 'Error' doxygen.out | head -c 1) || true + if [ ! -z $error_ret ]; then + echo "---- BEGIN: Summary of errors ---- " + cat doxygen.out | grep -i 'error:' || true + echo "---- END: Summary of errors ---- " + echo + echo + echo "---- BEGIN: Summary of warnings ---- " + cat doxygen.out | grep -i 'warning:' || true + echo "---- END: Summary of warnings ---- " + exit 1 + fi - name: build_kokkos_kernels_sphinx working-directory: kokkos-kernels/build diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index deb47d9d2b..ed581f3010 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -606,7 +606,7 @@ HIDE_COMPOUND_REFERENCE= NO # will show which file needs to be included to use the class. # The default value is: YES. -SHOW_HEADERFILE = YES +# SHOW_HEADERFILE = YES # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. 
@@ -824,7 +824,7 @@ WARN_IF_DOC_ERROR = YES # parameters have no documentation without warning. # The default value is: YES. -WARN_IF_INCOMPLETE_DOC = YES +# WARN_IF_INCOMPLETE_DOC = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return @@ -1571,7 +1571,7 @@ GENERATE_TREEVIEW = NO # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. -FULL_SIDEBAR = NO +# FULL_SIDEBAR = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. @@ -1654,7 +1654,7 @@ USE_MATHJAX = NO # The default value is: MathJax_2. # This tag requires that the tag USE_MATHJAX is set to YES. -MATHJAX_VERSION = MathJax_2 +# MATHJAX_VERSION = MathJax_2 # When MathJax is enabled you can set the default output format to be used for # the MathJax output. For more details about the output format see MathJax From 990d7db764044d8e178e330293b14b60bd23c6b3 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 12 May 2023 13:16:37 -0600 Subject: [PATCH 354/442] Fix errors and warnings in sems-rhel nighly --- common/unit_test/Test_Common_LowerBound.hpp | 10 ++++++---- common/unit_test/Test_Common_UpperBound.hpp | 8 +++++--- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 8 ++++---- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index f2b54eed32..3ff27da23c 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -190,12 +190,14 @@ void test_lower_bound() { test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(5)); test_lower_bound({0, 1, T(2.5), 3, 4, 5}, T(6)); - auto randn = [](T n) { + auto randn = [](T n) -> T { + T ret; if constexpr (std::is_floating_point_v) { - return T(rand()) / T(RAND_MAX) * n; + ret = T(rand()) / 
T(RAND_MAX) * n; } else { - return T(rand()) % n; + ret = T(rand()) % n; } + return ret; }; T maxEntry = 20; @@ -253,4 +255,4 @@ EXECUTE_TEST(float, TestExecSpace) EXECUTE_TEST(double, TestExecSpace) #endif -#undef EXECUTE_TEST \ No newline at end of file +#undef EXECUTE_TEST diff --git a/common/unit_test/Test_Common_UpperBound.hpp b/common/unit_test/Test_Common_UpperBound.hpp index 9a440b376a..a6d3b24d84 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -181,12 +181,14 @@ void test_upper_bound() { test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(5)); test_upper_bound({0, 1, T(2.5), 3, 4, 5}, T(6)); - auto randn = [](T n) { + auto randn = [](T n) -> T { + T ret; if constexpr (std::is_floating_point_v) { - return T(rand()) / T(RAND_MAX) * n; + ret = T(rand()) / T(RAND_MAX) * n; } else { - return T(rand()) % n; + ret = T(rand()) % n; } + return ret; }; constexpr T maxEntry = 20; diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index 36237785b2..b2883c1e91 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -115,10 +115,10 @@ Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { ordinal_type, size_type> rcs(blockRows, blockCols, scalar_type(0), max_a()); - const auto colids = - Kokkos::subview(rcs.get_ids(), Kokkos::pair{size_t(0), rcs.get_nnz()}); - const auto vals = - Kokkos::subview(rcs.get_vals(), Kokkos::pair{size_t(0), rcs.get_nnz()}); + const auto colids = Kokkos::subview( + rcs.get_ids(), Kokkos::make_pair(size_t(0), rcs.get_nnz())); + const auto vals = Kokkos::subview( + rcs.get_vals(), Kokkos::make_pair(size_t(0), rcs.get_nnz())); Graph graph(colids, rcs.get_map()); Crs crs("crs", blockCols, vals, graph); From 1424f8aef09266a3b27b1168339a30e8c4b844f6 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 12 May 2023 20:20:47 +0000 Subject: [PATCH 355/442] Kokkos 4 
compatibility: modifying the preprocessor logic Should check that Kokkos version is less than 40099 and use the old Impl::ALL_t in that case only for versions higher than 40099 should the new public interface be used. Note that this assumes the public ALL_t will be included in 40100... --- batched/KokkosBatched_Util.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index f6b08764c9..27fb0bf338 100644 --- a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -624,7 +624,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, const Trans::NoTranspose) { return subview_wrapper(v, i1, i2, i3, layout_tag); } -#if KOKKOS_VERSION <= 40000 +#if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, @@ -669,7 +669,7 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { return subview_wrapper(v, i1, i2, i3, layout_tag); } -#if KOKKOS_VERSION <= 40000 +#if KOKKOS_VERSION < 40099 template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, From f30291cd16d2e13f1c1d6aff1e616f866deb80dc Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 12 May 2023 15:43:23 -0600 Subject: [PATCH 356/442] spmv cusparse version check modified for cuda/11.1 resolves compilation errors when enabling cusparse with cuda/11.1 --- sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 11570c0d29..6cbd1fff29 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -94,7 +94,7 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, 
size_t bufferSize = 0; void* dBuffer = NULL; -#if CUSPARSE_VERSION >= 11201 +#if CUSPARSE_VERSION >= 11301 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; #else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; @@ -102,13 +102,13 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, if (controls.isParameter("algorithm")) { const std::string algName = controls.getParameter("algorithm"); if (algName == "default") -#if CUSPARSE_VERSION >= 11201 +#if CUSPARSE_VERSION >= 11301 alg = CUSPARSE_SPMV_ALG_DEFAULT; #else alg = CUSPARSE_MV_ALG_DEFAULT; #endif else if (algName == "merge") -#if CUSPARSE_VERSION >= 11201 +#if CUSPARSE_VERSION >= 11301 alg = CUSPARSE_SPMV_CSR_ALG2; #else alg = CUSPARSE_CSRMV_ALG2; From 28e8130860ed0fb080c9d91d7aaa869d20411440 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Sat, 13 May 2023 16:07:56 -0600 Subject: [PATCH 357/442] Sparse: fixing a few issues related to coo2csr and par_ilut benchmark coo2csr needed to be split between src and impl, eventually we could decide to add a spec file and do the whole ETI for it, to be discussed. Additionally coo2csr uses features that are in Kokkos Core develop which prevents Kokkos Kernels from building against 4.0.00 or 4.0.01. Adding preprocessor checks to avoid this issue. The par_ilut benchmark unfortunately does not make any sense, it force its way around using google benchmark for every things from arguments setting, to timing and to printing outputs. This leads to unsued variable issues which cannot be fixed... This should really be reverted but I am just commenting it out for now. 
--- perf_test/sparse/CMakeLists.txt | 32 +-- sparse/impl/KokkosSparse_coo2crs_impl.hpp | 280 ++++++++++++++++++++++ sparse/src/KokkosSparse_coo2crs.hpp | 265 +------------------- sparse/unit_test/Test_Sparse.hpp | 2 + 4 files changed, 310 insertions(+), 269 deletions(-) create mode 100644 sparse/impl/KokkosSparse_coo2crs_impl.hpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index f63560e0f4..00d7bdaa4d 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -116,19 +116,19 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_mdf.cpp ) -if (KokkosKernels_ENABLE_BENCHMARK) - KOKKOSKERNELS_ADD_BENCHMARK( - sparse_par_ilut - SOURCES KokkosSparse_par_ilut.cpp - ) - - # Provide -DGinkgo_DIR to cmake to enable the ginkgo test in sparse_par_ilut. Ginkgo_DIR should - # point to the dir in the ginkgo install area that contains the GinkgoConfig.cmake file. - # For me, this was $gingko_install_dir/lib64/cmake/Ginkgo - if (Ginkgo_DIR) - find_package(Ginkgo REQUIRED) - - target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO") - target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo) - endif() -endif() +# if (KokkosKernels_ENABLE_BENCHMARK) +# KOKKOSKERNELS_ADD_BENCHMARK( +# sparse_par_ilut +# SOURCES KokkosSparse_par_ilut.cpp +# ) + +# # Provide -DGinkgo_DIR to cmake to enable the ginkgo test in sparse_par_ilut. Ginkgo_DIR should +# # point to the dir in the ginkgo install area that contains the GinkgoConfig.cmake file. 
+# # For me, this was $gingko_install_dir/lib64/cmake/Ginkgo +# if (Ginkgo_DIR) +# find_package(Ginkgo REQUIRED) + +# target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO") +# target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo) +# endif() +# endif() diff --git a/sparse/impl/KokkosSparse_coo2crs_impl.hpp b/sparse/impl/KokkosSparse_coo2crs_impl.hpp new file mode 100644 index 0000000000..52363dea6a --- /dev/null +++ b/sparse/impl/KokkosSparse_coo2crs_impl.hpp @@ -0,0 +1,280 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSSPARSE_COO2CRS_IMPL_HPP +#define KOKKOSSPARSE_COO2CRS_IMPL_HPP +// The unorderedmap changes necessary for this to work +// have not made it into Kokkos 4.0.00 pr 4.0.01 will +// need to see if it happens in 4.1.00 to have a final +// version check here. 
+#if KOKKOS_VERSION >= 40099 + +#include +#include "Kokkos_UnorderedMap.hpp" +#include "KokkosKernels_Utils.hpp" + +namespace KokkosSparse { +namespace Impl { +template +class Coo2Crs { + private: + using RowViewScalarType = typename RowViewType::value_type; + using ColViewScalarType = typename ColViewType::value_type; + using DataViewScalarType = typename DataViewType::value_type; + using CrsST = DataViewScalarType; + using CrsOT = RowViewScalarType; + using CrsET = typename DataViewType::execution_space; + using CrsMT = void; + using CrsSzT = ColViewScalarType; + using CrsType = CrsMatrix; + using CrsValsViewType = typename CrsType::values_type; + using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; + using CrsColIdViewType = typename CrsType::index_type; + + using UmapValueViewType = Kokkos::View; + using UmapOpTypes = + Kokkos::UnorderedMapInsertOpTypes; + using UmapOpType = typename UmapOpTypes::AtomicAdd; + + // Make public for Kokkos::View + public: + using UmapHasherType = typename Kokkos::pod_hash; + using UmapEqualToType = typename Kokkos::pod_equal_to; + using UmapType = Kokkos::UnorderedMap; + using UmapMemorySpace = typename UmapType::device_type::memory_space; + + // Public for kokkos policies + struct coo2crsRp1 {}; + struct rowmapRp1 {}; + struct copyTp1 {}; + struct copyRp1 {}; + + using copyTp1Pt = Kokkos::TeamPolicy; + using copyTp1MemberType = typename copyTp1Pt::member_type; + + private: + using CrsRowMapView = Kokkos::View; + using CrsRowMapAtomicView = + Kokkos::View>; + using CrsValuesView = Kokkos::View; + using CrsColIdsView = Kokkos::View; + + // Needed since Kokkos::Bitset cannot be accessed on the host + using BmapViewType = + Kokkos::View>; + using Bitset = Kokkos::Bitset; + + CrsRowMapView m_crs_row_map; + CrsRowMapAtomicView m_crs_row_map_tmp; + CrsValuesView m_crs_vals; + CrsColIdsView m_crs_col_ids; + UmapType *m_umaps; + BmapViewType m_capacity_bmap; + Bitset m_tuple_bmap; + UmapOpType m_insert_op; + CrsOT 
m_nrows; + CrsOT m_ncols; + RowViewType m_row; + ColViewType m_col; + DataViewType m_data; + CrsSzT m_nnz; + + int m_n_tuples; + + public: + KOKKOS_INLINE_FUNCTION + void operator()(const coo2crsRp1 &, const int &idx) const { + auto i = m_row(idx); + auto j = m_col(idx); + auto is_inserted = m_tuple_bmap.test(idx); + + if (i >= m_nrows || j >= m_ncols) { + Kokkos::abort("tuple is out of bounds"); + } else if (!is_inserted && i >= 0 && j >= 0) { + if (m_umaps[i].insert(j, m_data(idx), m_insert_op).failed()) { + m_capacity_bmap(i) = true; // hmap at index i reached capacity + } else { + m_tuple_bmap.set(idx); // checklist of inserted tuples + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const copyRp1 &, const int &i) const { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UN + for (int j = 0; j < m_ncols; j++) { + if (m_umaps[i].exists(j)) { + auto umap_idx = m_umaps[i].find(j); + auto offset = m_crs_row_map_tmp(i)++; + m_crs_vals(offset) = m_umaps[i].value_at(umap_idx); + m_crs_col_ids(offset) = m_umaps[i].key_at(umap_idx); + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const copyTp1 &, const copyTp1MemberType &member) const { + auto row_idx = member.league_rank(); + auto cpy_beg = m_crs_row_map(row_idx); + auto cpy_end = m_crs_row_map(row_idx + 1); + auto cpy_len = cpy_end - cpy_beg; + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, cpy_len), + [&](const CrsOT &i) { + auto offset = i + cpy_beg; + m_crs_vals(offset) = m_umaps[i].value_at(i); + m_crs_col_ids(offset) = m_umaps[i].key_at(i); + }); + } + + Coo2Crs(DimType m, DimType n, RowViewType row, ColViewType col, + DataViewType data) { + m_n_tuples = data.extent(0); + m_nrows = m; + m_ncols = n; + m_row = row; + m_col = col; + m_data = data; + + typename UmapType::size_type arg_capacity_hint = + m_nrows > 0 ? 
(m_n_tuples / m_nrows / 4) : 16; + typename UmapType::hasher_type arg_hasher; + typename UmapType::equal_to_type arg_equal_to; + arg_capacity_hint = arg_capacity_hint < 16 ? 16 : arg_capacity_hint; + + // Record of whether capacity was reached in any unordered map + m_capacity_bmap = BmapViewType("m_capacity_bmap", m_nrows); + typename BmapViewType::HostMirror m_capacity_bmap_mirror = + Kokkos::create_mirror_view(m_capacity_bmap); + + // Track which tuples have been processed + m_tuple_bmap = Bitset(m_n_tuples); + + m_crs_row_map = CrsRowMapView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map"), + m_nrows + 1); + + // Memory management notes for `umap_ptrs` and `m_umaps`: + // `umap_ptrs` is a two dimensional array. The first dimension contains + // pointers to mixed-memory (host and device memory). The second + // dimension is the array of UnorderedMap objects. Some of the object + // methods are callable from only the device (device-callable), others + // are callable from only the host. Some of the host-callable methods, + // such as rehash are intended to be observable on the device. + // See Kokkos::UnorderedMap for details. + // + // `m_umaps` is a single dimension array of device memory. This array + // contains a shallow copy of all the UnorderedMap members that are + // allocated manually below. + // + // Any time a host-callable method with device observable results is + // invoked, we must shallow-copy the given `umap_ptrs` member back to + // the device. + // + // However, since we are using shallow copies of objects of type + // UnorderedMap, we do not need to copy the device memory back to + // the host before using a host-callable method. 
+ + // Setup a nrows length array of Unordered Maps + m_umaps = + reinterpret_cast(Kokkos::kokkos_malloc( + "m_umaps", m_nrows * sizeof(UmapType))); + + using shallow_copy_to_device = + Kokkos::Impl::DeepCopy; + + UmapType **umap_ptrs = new UmapType *[m_nrows]; + // TODO: use host-level parallel_for with tag rowmapRp1 + for (int i = 0; i < m_nrows; i++) { + umap_ptrs[i] = new UmapType(arg_capacity_hint, arg_hasher, arg_equal_to); + shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); + } + + using coo2crsRp1Pt = Kokkos::RangePolicy; + bool rehashed = true; + while (rehashed) { + Kokkos::parallel_for("coo2crsRp1", coo2crsRp1Pt(0, m_n_tuples), *this); + + CrsET().fence(); // Wait for bitmap writes to land + Kokkos::deep_copy(m_capacity_bmap_mirror, m_capacity_bmap); + CrsET().fence(); + + rehashed = false; + // TODO: covert to host-level parallel for. + for (int i = 0; i < m_nrows; i++) { + if (m_capacity_bmap_mirror(i)) { + umap_ptrs[i]->rehash(umap_ptrs[i]->capacity() * 2); + rehashed = true; + m_capacity_bmap_mirror(i) = false; + shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); + } + } + Kokkos::deep_copy(m_capacity_bmap, m_capacity_bmap_mirror); + CrsET().fence(); + } + + typename CrsRowMapView::HostMirror m_crs_row_map_h = + Kokkos::create_mirror_view(m_crs_row_map); + + // TODO: convert to host-level parallel_for / prefix sum + m_crs_row_map_h(0) = 0; + for (int i = 1; i < m_nrows + 1; i++) { + auto adj_i = i - 1; + auto sz = umap_ptrs[adj_i]->size(); + m_crs_row_map_h(i) = m_crs_row_map_h(adj_i) + sz; + } + + m_crs_row_map_tmp = CrsRowMapAtomicView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map_tmp"), + m_nrows + 1); + Kokkos::deep_copy(m_crs_row_map, m_crs_row_map_h); + Kokkos::deep_copy(m_crs_row_map_tmp, m_crs_row_map_h); + CrsET().fence(); + + m_nnz = m_crs_row_map_h(m_nrows); + + m_crs_vals = CrsValuesView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_vals"), m_nnz); + m_crs_col_ids = 
CrsColIdsView( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_col_ids"), + m_nnz); + + using copyRp1Pt = Kokkos::RangePolicy; + Kokkos::parallel_for("copyRp1", copyRp1Pt(0, m_nrows), *this); + CrsET().fence(); + + // Cleanup + for (int i = 0; i < m_nrows; i++) { + delete umap_ptrs[i]; + } + delete[] umap_ptrs; + Kokkos::kokkos_free(m_umaps); + } + + CrsType get_crsMat() { + return CrsType("coo2crs", m_nrows, m_ncols, m_nnz, m_crs_vals, + m_crs_row_map, m_crs_col_ids); + } +}; +} // namespace Impl +} + +#endif // KOKKOS_VERSION >= 40099 + +#endif // KOKKOSSPARSE_COO2CRS_IMPL_HPP diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index 90752c4c69..1fe122f30e 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -14,263 +14,21 @@ // //@HEADER -#include "KokkosSparse_CooMatrix.hpp" -#include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_Utils.hpp" -#include "Kokkos_UnorderedMap.hpp" -#include - #ifndef _KOKKOSSPARSE_COO2CRS_HPP #define _KOKKOSSPARSE_COO2CRS_HPP -namespace KokkosSparse { -namespace Impl { -template -class Coo2Crs { - private: - using RowViewScalarType = typename RowViewType::value_type; - using ColViewScalarType = typename ColViewType::value_type; - using DataViewScalarType = typename DataViewType::value_type; - using CrsST = DataViewScalarType; - using CrsOT = RowViewScalarType; - using CrsET = typename DataViewType::execution_space; - using CrsMT = void; - using CrsSzT = ColViewScalarType; - using CrsType = CrsMatrix; - using CrsValsViewType = typename CrsType::values_type; - using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; - using CrsColIdViewType = typename CrsType::index_type; - - using UmapValueViewType = Kokkos::View; - using UmapOpTypes = - Kokkos::UnorderedMapInsertOpTypes; - using UmapOpType = typename UmapOpTypes::AtomicAdd; - - // Make public for Kokkos::View - public: - using UmapHasherType = typename Kokkos::pod_hash; - 
using UmapEqualToType = typename Kokkos::pod_equal_to; - using UmapType = Kokkos::UnorderedMap; - using UmapMemorySpace = typename UmapType::device_type::memory_space; - - // Public for kokkos policies - struct coo2crsRp1 {}; - struct rowmapRp1 {}; - struct copyTp1 {}; - struct copyRp1 {}; - - using copyTp1Pt = Kokkos::TeamPolicy; - using copyTp1MemberType = typename copyTp1Pt::member_type; - - private: - using CrsRowMapView = Kokkos::View; - using CrsRowMapAtomicView = - Kokkos::View>; - using CrsValuesView = Kokkos::View; - using CrsColIdsView = Kokkos::View; - - // Needed since Kokkos::Bitset cannot be accessed on the host - using BmapViewType = - Kokkos::View>; - using Bitset = Kokkos::Bitset; - - CrsRowMapView m_crs_row_map; - CrsRowMapAtomicView m_crs_row_map_tmp; - CrsValuesView m_crs_vals; - CrsColIdsView m_crs_col_ids; - UmapType *m_umaps; - BmapViewType m_capacity_bmap; - Bitset m_tuple_bmap; - UmapOpType m_insert_op; - CrsOT m_nrows; - CrsOT m_ncols; - RowViewType m_row; - ColViewType m_col; - DataViewType m_data; - CrsSzT m_nnz; - - int m_n_tuples; - - public: - KOKKOS_INLINE_FUNCTION - void operator()(const coo2crsRp1 &, const int &idx) const { - auto i = m_row(idx); - auto j = m_col(idx); - auto is_inserted = m_tuple_bmap.test(idx); - - if (i >= m_nrows || j >= m_ncols) { - Kokkos::abort("tuple is out of bounds"); - } else if (!is_inserted && i >= 0 && j >= 0) { - if (m_umaps[i].insert(j, m_data(idx), m_insert_op).failed()) { - m_capacity_bmap(i) = true; // hmap at index i reached capacity - } else { - m_tuple_bmap.set(idx); // checklist of inserted tuples - } - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const copyRp1 &, const int &i) const { -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif // KOKKOS_ENABLE_PRAGMA_UN - for (int j = 0; j < m_ncols; j++) { - if (m_umaps[i].exists(j)) { - auto umap_idx = m_umaps[i].find(j); - auto offset = m_crs_row_map_tmp(i)++; - m_crs_vals(offset) = m_umaps[i].value_at(umap_idx); - 
m_crs_col_ids(offset) = m_umaps[i].key_at(umap_idx); - } - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const copyTp1 &, const copyTp1MemberType &member) const { - auto row_idx = member.league_rank(); - auto cpy_beg = m_crs_row_map(row_idx); - auto cpy_end = m_crs_row_map(row_idx + 1); - auto cpy_len = cpy_end - cpy_beg; - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, cpy_len), - [&](const CrsOT &i) { - auto offset = i + cpy_beg; - m_crs_vals(offset) = m_umaps[i].value_at(i); - m_crs_col_ids(offset) = m_umaps[i].key_at(i); - }); - } - - Coo2Crs(DimType m, DimType n, RowViewType row, ColViewType col, - DataViewType data) { - m_n_tuples = data.extent(0); - m_nrows = m; - m_ncols = n; - m_row = row; - m_col = col; - m_data = data; +// The unorderedmap changes necessary for this to work +// have not made it into Kokkos 4.0.00 pr 4.0.01 will +// need to see if it happens in 4.1.00 to have a final +// version check here. +#if KOKKOS_VERSION >= 40099 - typename UmapType::size_type arg_capacity_hint = - m_nrows > 0 ? (m_n_tuples / m_nrows / 4) : 16; - typename UmapType::hasher_type arg_hasher; - typename UmapType::equal_to_type arg_equal_to; - arg_capacity_hint = arg_capacity_hint < 16 ? 16 : arg_capacity_hint; - - // Record of whether capacity was reached in any unordered map - m_capacity_bmap = BmapViewType("m_capacity_bmap", m_nrows); - typename BmapViewType::HostMirror m_capacity_bmap_mirror = - Kokkos::create_mirror_view(m_capacity_bmap); - - // Track which tuples have been processed - m_tuple_bmap = Bitset(m_n_tuples); - - m_crs_row_map = CrsRowMapView( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map"), - m_nrows + 1); - - // Memory management notes for `umap_ptrs` and `m_umaps`: - // `umap_ptrs` is a two dimensional array. The first dimension contains - // pointers to mixed-memory (host and device memory). The second - // dimension is the array of UnorderedMap objects. 
Some of the object - // methods are callable from only the device (device-callable), others - // are callable from only the host. Some of the host-callable methods, - // such as rehash are intended to be observable on the device. - // See Kokkos::UnorderedMap for details. - // - // `m_umaps` is a single dimension array of device memory. This array - // contains a shallow copy of all the UnorderedMap members that are - // allocated manually below. - // - // Any time a host-callable method with device observable results is - // invoked, we must shallow-copy the given `umap_ptrs` member back to - // the device. - // - // However, since we are using shallow copies of objects of type - // UnorderedMap, we do not need to copy the device memory back to - // the host before using a host-callable method. - - // Setup a nrows length array of Unordered Maps - m_umaps = - reinterpret_cast(Kokkos::kokkos_malloc( - "m_umaps", m_nrows * sizeof(UmapType))); - - using shallow_copy_to_device = - Kokkos::Impl::DeepCopy; - - UmapType **umap_ptrs = new UmapType *[m_nrows]; - // TODO: use host-level parallel_for with tag rowmapRp1 - for (int i = 0; i < m_nrows; i++) { - umap_ptrs[i] = new UmapType(arg_capacity_hint, arg_hasher, arg_equal_to); - shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); - } - - using coo2crsRp1Pt = Kokkos::RangePolicy; - bool rehashed = true; - while (rehashed) { - Kokkos::parallel_for("coo2crsRp1", coo2crsRp1Pt(0, m_n_tuples), *this); - - CrsET().fence(); // Wait for bitmap writes to land - Kokkos::deep_copy(m_capacity_bmap_mirror, m_capacity_bmap); - CrsET().fence(); - - rehashed = false; - // TODO: covert to host-level parallel for. 
- for (int i = 0; i < m_nrows; i++) { - if (m_capacity_bmap_mirror(i)) { - umap_ptrs[i]->rehash(umap_ptrs[i]->capacity() * 2); - rehashed = true; - m_capacity_bmap_mirror(i) = false; - shallow_copy_to_device(m_umaps + i, umap_ptrs[i], sizeof(UmapType)); - } - } - Kokkos::deep_copy(m_capacity_bmap, m_capacity_bmap_mirror); - CrsET().fence(); - } - - typename CrsRowMapView::HostMirror m_crs_row_map_h = - Kokkos::create_mirror_view(m_crs_row_map); - - // TODO: convert to host-level parallel_for / prefix sum - m_crs_row_map_h(0) = 0; - for (int i = 1; i < m_nrows + 1; i++) { - auto adj_i = i - 1; - auto sz = umap_ptrs[adj_i]->size(); - m_crs_row_map_h(i) = m_crs_row_map_h(adj_i) + sz; - } - - m_crs_row_map_tmp = CrsRowMapAtomicView( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_row_map_tmp"), - m_nrows + 1); - Kokkos::deep_copy(m_crs_row_map, m_crs_row_map_h); - Kokkos::deep_copy(m_crs_row_map_tmp, m_crs_row_map_h); - CrsET().fence(); - - m_nnz = m_crs_row_map_h(m_nrows); - - m_crs_vals = CrsValuesView( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_vals"), m_nnz); - m_crs_col_ids = CrsColIdsView( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "m_crs_col_ids"), - m_nnz); - - using copyRp1Pt = Kokkos::RangePolicy; - Kokkos::parallel_for("copyRp1", copyRp1Pt(0, m_nrows), *this); - CrsET().fence(); - - // Cleanup - for (int i = 0; i < m_nrows; i++) { - delete umap_ptrs[i]; - } - delete[] umap_ptrs; - Kokkos::kokkos_free(m_umaps); - } +#include "KokkosSparse_CooMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_Utils.hpp" - CrsType get_crsMat() { - return CrsType("coo2crs", m_nrows, m_ncols, m_nnz, m_crs_vals, - m_crs_row_map, m_crs_col_ids); - } -}; -} // namespace Impl +#include "KokkosSparse_coo2crs_impl.hpp" +namespace KokkosSparse { // clang-format off /// /// \brief Blocking function that converts a CooMatrix into a CrsMatrix. Values are summed. 
@@ -340,4 +98,5 @@ auto coo2crs(KokkosSparse::CooMatrix= 40099 +#endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index d0233a9c67..18e3b45b98 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -16,7 +16,9 @@ #ifndef TEST_SPARSE_HPP #define TEST_SPARSE_HPP +#if KOKKOS_VERSION >= 40099 #include "Test_Sparse_coo2crs.hpp" +#endif // KOKKOS_VERSION >= 40099 #include "Test_Sparse_crs2coo.hpp" #include "Test_Sparse_block_gauss_seidel.hpp" #include "Test_Sparse_Controls.hpp" From b3ef19c74c30818e8b2cf6dd01047a7db11e692e Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Sat, 13 May 2023 16:16:21 -0600 Subject: [PATCH 358/442] Applying clang-format --- sparse/impl/KokkosSparse_coo2crs_impl.hpp | 6 +++--- sparse/unit_test/Test_Sparse.hpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sparse/impl/KokkosSparse_coo2crs_impl.hpp b/sparse/impl/KokkosSparse_coo2crs_impl.hpp index 52363dea6a..d00a6f34a9 100644 --- a/sparse/impl/KokkosSparse_coo2crs_impl.hpp +++ b/sparse/impl/KokkosSparse_coo2crs_impl.hpp @@ -273,8 +273,8 @@ class Coo2Crs { } }; } // namespace Impl -} +} // namespace KokkosSparse -#endif // KOKKOS_VERSION >= 40099 +#endif // KOKKOS_VERSION >= 40099 -#endif // KOKKOSSPARSE_COO2CRS_IMPL_HPP +#endif // KOKKOSSPARSE_COO2CRS_IMPL_HPP diff --git a/sparse/unit_test/Test_Sparse.hpp b/sparse/unit_test/Test_Sparse.hpp index 18e3b45b98..e0d0085be1 100644 --- a/sparse/unit_test/Test_Sparse.hpp +++ b/sparse/unit_test/Test_Sparse.hpp @@ -18,7 +18,7 @@ #if KOKKOS_VERSION >= 40099 #include "Test_Sparse_coo2crs.hpp" -#endif // KOKKOS_VERSION >= 40099 +#endif // KOKKOS_VERSION >= 40099 #include "Test_Sparse_crs2coo.hpp" #include "Test_Sparse_block_gauss_seidel.hpp" #include "Test_Sparse_Controls.hpp" From bf498cd4ae2047db58999e6eed79621c457c1b83 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 16 May 2023 09:39:06 -0700 Subject: [PATCH 359/442] Remove 
unnecessary code --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 11 ----------- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 19 ++----------------- 2 files changed, 2 insertions(+), 28 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 511fb35709..47e831b301 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -570,11 +570,6 @@ void iluk_numeric_streams(const std::vector &execspace_v, tstf); } // end if (stream_have_level_v[i]) } // end for streams - - // 2. Wait for all streams finished - for (int i = 0; i < nstreams; i++) { - if (stream_have_level_v[i]) execspace_v[i].fence(); - } // end for streams } // end for lvl } // end SEQLVLSCHD_RP else if (thandle_v[0]->get_algorithm() == @@ -656,12 +651,6 @@ void iluk_numeric_streams(const std::vector &execspace_v, } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) } // end if (stream_have_level_v[i]) } // end for streams - - // 2. 
Wait for all streams finishing - for (int i = 0; i < nstreams; i++) { - if (stream_have_level_v[i]) - if (chunkid < lvl_nchunks_h_v[i](lvl)) execspace_v[i].fence(); - } // end for streams } // end for chunkid } // end for lvl } // end SEQLVLSCHD_TP1 diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 4bed672f61..5414b06f1d 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -103,15 +103,13 @@ struct TriLvlSchedTP1SolverFunctor { long node_count; // like "block" offset into ngbl, my_league is the "local" // offset - long dense_nrows; TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, - const bool is_lowertri_, long node_count_, - long dense_nrows_ = 0) + const bool &is_lowertri_, const long &node_count_) : row_map(row_map_), entries(entries_), values(values_), @@ -119,8 +117,7 @@ struct TriLvlSchedTP1SolverFunctor { rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), is_lowertri(is_lowertri_), - node_count(node_count_), - dense_nrows(dense_nrows_) {} + node_count(node_count_) {} KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { @@ -4108,12 +4105,6 @@ void lower_tri_solve_streams(const std::vector &execspace_v, } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams - - // 2. Wait for all streams finished - // note: not needed here unlike in the spiluk case - // for (int i = 0; i < nstreams; i++) { - // execspace_v[i].fence(); - //} // end for streams } // end for lvl } // end lower_tri_solve_streams @@ -4200,12 +4191,6 @@ void upper_tri_solve_streams(const std::vector &execspace_v, } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams - - // 2. 
Wait for all streams finished - // note: not needed here unlike in the spiluk case - // for (int i = 0; i < nstreams; i++) { - // execspace_v[i].fence(); - //} // end for streams } // end for lvl } // end upper_tri_solve_streams From ea3321c2fbdf9bc7668f1809a22d7e6b88fb0ac0 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Tue, 16 May 2023 10:44:56 -0600 Subject: [PATCH 360/442] Apply clang format --- sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 10 +++++----- sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 47e831b301..c2863885b2 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -570,8 +570,8 @@ void iluk_numeric_streams(const std::vector &execspace_v, tstf); } // end if (stream_have_level_v[i]) } // end for streams - } // end for lvl - } // end SEQLVLSCHD_RP + } // end for lvl + } // end SEQLVLSCHD_RP else if (thandle_v[0]->get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; @@ -651,9 +651,9 @@ void iluk_numeric_streams(const std::vector &execspace_v, } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) } // end if (stream_have_level_v[i]) } // end for streams - } // end for chunkid - } // end for lvl - } // end SEQLVLSCHD_TP1 + } // end for chunkid + } // end for lvl + } // end SEQLVLSCHD_TP1 } // end iluk_numeric_streams diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 5414b06f1d..e2a625e2a7 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -4105,7 +4105,7 @@ void lower_tri_solve_streams(const std::vector &execspace_v, } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams - } // end 
for lvl + } // end for lvl } // end lower_tri_solve_streams template &execspace_v, } // end if (lvl_nodes != 0) } // end if (lvl < nlevels_v[i]) } // end for streams - } // end for lvl + } // end for lvl } // end upper_tri_solve_streams } // namespace Experimental From 09a4820b37ae98fcdd01f4d7e9e120d6f994944e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 16 May 2023 11:14:53 -0600 Subject: [PATCH 361/442] cm_test_all_sandia: updates for weaver --- scripts/cm_test_all_sandia | 40 +++++++------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 3a9a79b11d..7f992175bb 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -587,30 +587,16 @@ elif [ "$MACHINE" = "inouye" ]; then SPACK_HOST_ARCH="+a64fx" elif [ "$MACHINE" = "weaver" ]; then - MODULE_ENVIRONMENT="source /etc/profile.d/modules.sh" + # Use the legacy env for now until all modules are part of the new system + MODULE_ENVIRONMENT="source /projects/ppc64le-pwr9-rhel8/legacy-env.sh" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True -# used with rhel7W queue - BASE_MODULE_LIST="cmake/3.23.1,/" - CUDA_MODULE_LIST="cmake/3.23.1,/,ibm/xl/16.1.1,gcc/7.2.0" - CUDA10_MODULE_LIST="cmake/3.23.1,/,ibm/xl/16.1.1,gcc/7.4.0" - - GCC72_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.2.20/gcc/7.2.0" - GCC74_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.2.20/gcc/7.2.0,gcc/7.4.0" GCC93_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.3.20/gcc/9.3.0,gcc/9.3.0" - CUDA_MODULE_TPL_LIST="cmake/3.23.1,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.23.1,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" - - # Issues finding CUBLAS with cuda/10.1.243 module at configure - # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" - # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS -# "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" 
CLANG13_MODULE_TPL_LIST="cmake/3.23.1,/,openblas/0.3.20/gcc/9.3.0,cuda/10.1.243" -# used with rhel8 queue - RHEL8_BASE_MODULE_LIST="cmake/3.23.1,/" - # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 load by default + BASE_MODULE_LIST="cmake/3.23.1,/" + # Cuda/11 modules available rhel8 queue (rhel8 OS); gcc/8.3.1 load by default RHEL8_CUDA11_MODULE_LIST="cmake/3.23.1,/,openblas/0.3.20/gcc/9.3.0" # Don't do Threads on weaver @@ -618,31 +604,19 @@ elif [ "$MACHINE" = "weaver" ]; then if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "cuda/10.1.243 $CUDA10_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + COMPILERS=("cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $GCC72_MODULE_TPL_LIST "Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/7.4.0 $GCC74_MODULE_TPL_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "cuda/9.2.88 $CUDA_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + COMPILERS=("cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "gcc/9.3.0 $GCC93_MODULE_TPL_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" "clang/13.0.0 $CLANG13_MODULE_TPL_LIST "Cuda" clang++ $CUDA_WARNING_FLAGS" ) else # Format: (compiler module-list 
build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/7.4.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.1 $RHEL8_BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/8.3.1 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "cuda/10.0.130 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/10.2.089 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/13.0.0 $CLANG13_MODULE_TPL_LIST $CUDA_IBM_BUILD_LIST clang++ $CUDA_WARNING_FLAGS" ) From 4a8667228c44ecb84daeea3dfcef9b91f872214a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 16 May 2023 11:48:59 -0600 Subject: [PATCH 362/442] scripts/cm_test_all_sandia: Update cuda11 modules --- scripts/cm_test_all_sandia | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 7f992175bb..5dcd70d885 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -597,19 +597,19 @@ elif [ "$MACHINE" = "weaver" ]; then BASE_MODULE_LIST="cmake/3.23.1,/" # Cuda/11 modules available rhel8 queue (rhel8 OS); gcc/8.3.1 load by default - RHEL8_CUDA11_MODULE_LIST="cmake/3.23.1,/,openblas/0.3.20/gcc/9.3.0" + RHEL8_CUDA11_MODULE_LIST="cmake/3.23.1,cuda/11.2.2/gcc/8.3.1,openblas/0.3.18/gcc/8.3.1" # Don't do Threads on weaver GCC_IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" if [ "$SPOT_CHECK" = "True" ]; then # 
Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + COMPILERS=("cuda/11.2.2/gcc/8.3.1 $RHEL8_CUDA11_MODULE_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + COMPILERS=("cuda/11.2.2/gcc/8.3.1 $RHEL8_CUDA11_MODULE_LIST "Cuda_Serial" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "gcc/9.3.0 $GCC93_MODULE_TPL_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" "clang/13.0.0 $CLANG13_MODULE_TPL_LIST "Cuda" clang++ $CUDA_WARNING_FLAGS" ) @@ -617,7 +617,7 @@ elif [ "$MACHINE" = "weaver" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gcc/8.3.1 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "cuda/11.2.2 $RHEL8_CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.2.2/gcc/8.3.1 $RHEL8_CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/13.0.0 $CLANG13_MODULE_TPL_LIST $CUDA_IBM_BUILD_LIST clang++ $CUDA_WARNING_FLAGS" ) fi From ad541587d9632ab544e47973bede58ddaff8b288 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 May 2023 16:01:12 -0600 Subject: [PATCH 363/442] sparse/eti: Remove unused decl.hpp.in files blas/eti: Remove unused decl.hpp.in files cmake: Remove decl.hpp.in processing --- .../KokkosBlas1_abs_eti_spec_decl.hpp.in | 26 ----------------- .../KokkosBlas1_abs_mv_eti_spec_decl.hpp.in | 26 ----------------- .../KokkosBlas1_axpby_eti_spec_decl.hpp.in | 24 --------------- 
.../KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_dot_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_dot_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_iamax_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_mult_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_mult_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_nrm1_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_nrm2_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_nrm2w_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_nrminf_eti_spec_decl.hpp.in | 24 --------------- ...KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in | 24 --------------- ...okkosBlas1_reciprocal_eti_spec_decl.hpp.in | 24 --------------- ...osBlas1_reciprocal_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_rot_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_rotg_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_rotm_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_rotmg_eti_spec_decl.hpp.in | 25 ---------------- .../KokkosBlas1_scal_eti_spec_decl.hpp.in | 25 ---------------- .../KokkosBlas1_scal_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_sum_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_sum_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas1_swap_eti_spec_decl.hpp.in | 26 ----------------- .../KokkosBlas1_update_eti_spec_decl.hpp.in | 24 --------------- ...KokkosBlas1_update_mv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas2_gemv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas3_gemm_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas3_trmm_eti_spec_decl.hpp.in | 26 
----------------- .../KokkosBlas3_trsm_eti_spec_decl.hpp.in | 26 ----------------- .../KokkosBlas_gesv_eti_spec_decl.hpp.in | 24 --------------- .../KokkosBlas_trtri_eti_spec_decl.hpp.in | 26 ----------------- cmake/kokkoskernels_eti.cmake | 8 ----- .../KokkosGraph_color_d1_eti_spec_decl.hpp.in | 24 --------------- ...parse_bspgemm_numeric_eti_spec_decl.hpp.in | 24 --------------- ...se_gauss_seidel_apply_eti_spec_decl.hpp.in | 24 --------------- ..._gauss_seidel_numeric_eti_spec_decl.hpp.in | 24 --------------- ...gauss_seidel_symbolic_eti_spec_decl.hpp.in | 24 --------------- ...rse_par_ilut_symbolic_eti_spec_decl.hpp.in | 26 ----------------- ...sSparse_spadd_numeric_eti_spec_decl.hpp.in | 24 --------------- ...Sparse_spadd_symbolic_eti_spec_decl.hpp.in | 24 --------------- ...sSparse_spgemm_jacobi_eti_spec_decl.hpp.in | 24 --------------- ...Sparse_spgemm_noreuse_eti_spec_decl.hpp.in | 24 --------------- ...Sparse_spgemm_numeric_eti_spec_decl.hpp.in | 24 --------------- ...parse_spgemm_symbolic_eti_spec_decl.hpp.in | 24 --------------- ...parse_spiluk_symbolic_eti_spec_decl.hpp.in | 26 ----------------- ...Sparse_spmv_bsrmatrix_eti_spec_decl.hpp.in | 28 ------------------ .../KokkosSparse_spmv_eti_spec_decl.hpp.in | 24 --------------- ...rse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in | 29 ------------------- .../KokkosSparse_spmv_mv_eti_spec_decl.hpp.in | 24 --------------- ...Sparse_spmv_mv_struct_eti_spec_decl.hpp.in | 24 --------------- ...kosSparse_spmv_struct_eti_spec_decl.hpp.in | 24 --------------- ...osSparse_sptrsv_solve_eti_spec_decl.hpp.in | 24 --------------- ...parse_sptrsv_symbolic_eti_spec_decl.hpp.in | 24 --------------- .../KokkosSparse_trsv_eti_spec_decl.hpp.in | 24 --------------- 60 files changed, 1451 deletions(-) delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in delete mode 100644 
blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in delete mode 100644 
blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_decl.hpp.in delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_decl.hpp.in delete mode 100644 graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in delete mode 
100644 sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in deleted file 
mode 100644 index 2780dee8ff..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS1_ABS_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in deleted file mode 100644 index c7af4806be..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS1_ABS_MV_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in deleted file mode 100644 index 3f8cfa92d5..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_AXPBY_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 44b6708c99..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#ifndef KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ - -namespace KokkosBlas { -namespace Impl { -@BLAS1_AXPBY_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in deleted file mode 100644 index 42982920fd..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_DOT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in deleted file mode 100644 index da7f48f325..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_DOT_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in deleted file mode 100644 index 17b61a8857..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_IAMAX_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 35d654012e..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_IAMAX_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in deleted file mode 100644 index 406feeaf5d..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_MULT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in deleted file mode 100644 index a59f2af39d..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_MULT_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in deleted file mode 100644 index 1c9a088122..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM1_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in deleted file mode 100644 index d2a322a0ad..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM1_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in deleted file mode 100644 index dfe891afc9..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 2e0f745682..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in deleted file mode 100644 index bd7d1b11b8..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2W_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 0a0aadc87a..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRM2W_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in deleted file mode 100644 index 3f1e874724..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRMINF_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 17559306bf..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_NRMINF_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in deleted file mode 100644 index 7ac4b74ea4..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_RECIPROCAL_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in deleted file mode 100644 index f40958465f..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_RECIPROCAL_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in deleted file mode 100644 index 5e6b197460..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_ROT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in deleted file mode 100644 index e410696d54..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_ROTG_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in deleted file mode 100644 index bd88a1e4c6..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_ROTM_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in deleted file mode 100644 index b69e9b6b4b..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in +++ /dev/null @@ -1,25 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ - -namespace KokkosBlas { -namespace Impl { -@BLAS1_ROTMG_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in deleted file mode 100644 index a2da20787d..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in +++ /dev/null @@ -1,25 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ - -namespace KokkosBlas { -namespace Impl { -@BLAS1_SCAL_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 953f8e6954..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_SCAL_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in deleted file mode 100644 index bdac3456e8..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_SUM_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in deleted file mode 100644 index 5182f61985..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_SUM_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in deleted file mode 100644 index e795c8fb9c..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -*/ -#ifndef KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ - -namespace KokkosBlas { -namespace Impl { -@BLAS1_SWAP_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in deleted file mode 100644 index cff04c9fbe..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_UPDATE_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in deleted file mode 100644 index deec84712b..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS1_UPDATE_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in deleted file mode 100644 index 9d69383b3d..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS2_GEMV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in deleted file mode 100644 index 22ea9a1ed1..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. 
Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS3_GEMM_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in deleted file mode 100644 index e802ccf4fc..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS3_TRMM_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in deleted file mode 100644 index 11ca605f4f..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS3_TRSM_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_decl.hpp.in deleted file mode 100644 index 2ae20b772c..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_gesv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS_GESV_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS_GESV_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { -@BLAS_GESV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_decl.hpp.in deleted file mode 100644 index 1bd8c9da19..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas_trtri_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS_TRTRI_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL_HPP_ -namespace KokkosBlas { -namespace Impl { - -@BLAS_TRTRI_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosBlas -#endif // KOKKOSBLAS_TRTRI_ETI_SPEC_DECL_HPP_ diff --git a/cmake/kokkoskernels_eti.cmake b/cmake/kokkoskernels_eti.cmake index 1823bf96b6..524cad11f9 100644 --- a/cmake/kokkoskernels_eti.cmake +++ b/cmake/kokkoskernels_eti.cmake @@ -130,7 +130,6 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) ${ARGN}) STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) - SET(ETI_DECL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_DECL") SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") @@ -152,9 +151,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(APPEND MACRO_STRING ")") STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) #Make a single header file for all instances - LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") - SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") #Make a different source file for each instance SET(INST_SOURCE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") SET(INST_TEMPLATE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") @@ -169,17 +166,12 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) SET(AVAIL_HEADER "${ETI_COMPONENTS}/eti/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_avail.hpp") SET(AVAIL_TEMPLATE "${AVAIL_HEADER}.in") - SET(DECL_HEADER "${ETI_COMPONENTS}/eti/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_decl.hpp") - SET(DECL_TEMPLATE "${DECL_HEADER}.in") STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_INST_BLOCK "${${UPPER_NAME}_ETI_INST_LIST}") 
STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_AVAIL_BLOCK "${${UPPER_NAME}_ETI_AVAIL_LIST}") - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${AVAIL_TEMPLATE} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) - LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) ENDMACRO(KOKKOSKERNELS_GENERATE_ETI) diff --git a/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in deleted file mode 100644 index 23e1699557..0000000000 --- a/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ -namespace KokkosGraph { -namespace Impl { -@GRAPH_COLOR_D1_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 2fdcd740e2..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_BSPGEMM_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in deleted file mode 100644 index 9be44095f0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_GAUSS_SEIDEL_APPLY_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 1e3befcc89..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_GAUSS_SEIDEL_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 493740dfb2..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index c30fe10f82..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_PAR_ILUT_SYMBOLIC_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 43b1da79d1..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPADD_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 131960272e..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPADD_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in deleted file mode 100644 index 313f1a88d0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_JACOBI_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in deleted file mode 100644 index 2ca1ecf07b..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_NOREUSE_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index af422e6fe5..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 2f3870e948..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPGEMM_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index bfffae9dc0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_SPILUK_SYMBOLIC_ETI_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in deleted file mode 100644 index 5a7977921d..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in +++ /dev/null @@ -1,28 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Experimental { -namespace Impl { -// clang-format off -@SPARSE_SPMV_BSRMATRIX_ETI_DECL_BLOCK@ -// clang-format on -} // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in deleted file mode 100644 index 14813536f0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in deleted file mode 100644 index 4eb5388da1..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in +++ /dev/null @@ -1,29 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ - -namespace KokkosSparse { -namespace Experimental { -namespace Impl { -// clang-format off -@SPARSE_SPMV_MV_BSRMATRIX_ETI_DECL_BLOCK@ -// clang-format on -} // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse -#endif \ No newline at end of file diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in deleted file mode 100644 index af58d3e7fc..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_MV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in deleted file mode 100644 index 11ba625f3c..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_MV_STRUCT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in deleted file mode 100644 index a03fcf586e..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPMV_STRUCT_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in deleted file mode 100644 index aa3d2b2cef..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPTRSV_SOLVE_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in deleted file mode 100644 index 4c48c895a1..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_SPTRSV_SYMBOLIC_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in deleted file mode 100644 index 5b24a276d0..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { -@SPARSE_TRSV_ETI_DECL_BLOCK@ - } //IMPL -} //Kokkos -#endif From b1e22208f36522104ffdcae5bb3272eb6b44ed47 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 16 May 2023 12:43:49 -0600 Subject: [PATCH 364/442] Remove includes of decl.hpp files --- blas/impl/KokkosBlas1_abs_spec.hpp | 2 -- blas/impl/KokkosBlas1_axpby_spec.hpp | 2 -- blas/impl/KokkosBlas1_dot_spec.hpp | 2 -- blas/impl/KokkosBlas1_iamax_spec.hpp | 2 -- blas/impl/KokkosBlas1_mult_spec.hpp | 2 -- blas/impl/KokkosBlas1_nrm1_spec.hpp | 2 -- blas/impl/KokkosBlas1_nrm2_spec.hpp | 2 -- blas/impl/KokkosBlas1_nrm2w_spec.hpp | 2 -- blas/impl/KokkosBlas1_nrminf_spec.hpp | 2 -- blas/impl/KokkosBlas1_reciprocal_spec.hpp | 2 -- blas/impl/KokkosBlas1_rot_spec.hpp | 1 - blas/impl/KokkosBlas1_rotg_spec.hpp | 1 - blas/impl/KokkosBlas1_rotm_spec.hpp | 1 - blas/impl/KokkosBlas1_rotmg_spec.hpp | 1 - blas/impl/KokkosBlas1_scal_spec.hpp | 2 -- blas/impl/KokkosBlas1_sum_spec.hpp | 2 -- blas/impl/KokkosBlas1_swap_spec.hpp | 1 - blas/impl/KokkosBlas1_update_spec.hpp | 2 -- blas/impl/KokkosBlas2_gemv_spec.hpp | 1 - blas/impl/KokkosBlas3_gemm_spec.hpp | 1 - blas/impl/KokkosBlas3_trmm_spec.hpp | 1 - blas/impl/KokkosBlas3_trsm_spec.hpp | 1 - blas/impl/KokkosBlas_gesv_spec.hpp | 1 - blas/impl/KokkosBlas_trtri_spec.hpp | 1 - graph/impl/KokkosGraph_color_d1_spec.hpp | 2 -- sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp | 1 - sparse/impl/KokkosSparse_gauss_seidel_spec.hpp | 3 --- sparse/impl/KokkosSparse_gmres_spec.hpp | 1 - 
sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp | 1 - sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp | 1 - sparse/impl/KokkosSparse_spadd_numeric_spec.hpp | 1 - sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp | 1 - sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp | 1 - sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp | 1 - sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp | 1 - sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp | 1 - sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp | 1 - sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp | 1 - sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 2 -- sparse/impl/KokkosSparse_spmv_spec.hpp | 2 -- sparse/impl/KokkosSparse_spmv_struct_spec.hpp | 2 -- sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp | 1 - sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp | 1 - sparse/impl/KokkosSparse_trsv_spec.hpp | 1 - 44 files changed, 63 deletions(-) diff --git a/blas/impl/KokkosBlas1_abs_spec.hpp b/blas/impl/KokkosBlas1_abs_spec.hpp index 76555aec5a..a4695bd505 100644 --- a/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_abs_spec.hpp @@ -254,7 +254,5 @@ struct Abs; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_ABS_HPP_ diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index 187ea04c2e..da2924c9f3 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -513,7 +513,5 @@ struct Axpby; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_AXPBY_HPP_ diff --git a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 063bda41b6..02efee6bc5 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -619,7 +619,5 @@ struct Dot; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/blas/impl/KokkosBlas1_iamax_spec.hpp b/blas/impl/KokkosBlas1_iamax_spec.hpp index 461625df67..341b949050 100644 --- a/blas/impl/KokkosBlas1_iamax_spec.hpp +++ 
b/blas/impl/KokkosBlas1_iamax_spec.hpp @@ -363,7 +363,5 @@ struct Iamax -#include -#include #endif // KOKKOSBLAS1_IAMAX_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_mult_spec.hpp b/blas/impl/KokkosBlas1_mult_spec.hpp index 4a38c347f5..c81e00a6b0 100644 --- a/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_mult_spec.hpp @@ -311,7 +311,5 @@ struct Mult; #include -#include -#include #endif // KOKKOSBLAS1_MULT_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrm1_spec.hpp b/blas/impl/KokkosBlas1_nrm1_spec.hpp index f35a341787..24f093c736 100644 --- a/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -284,7 +284,5 @@ struct Nrm1; #include -#include -#include #endif // KOKKOSBLAS1_NRM1_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrm2_spec.hpp b/blas/impl/KokkosBlas1_nrm2_spec.hpp index 0a258e00f4..6c21e551a8 100644 --- a/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -289,7 +289,5 @@ struct Nrm2; #include -#include -#include #endif // KOKKOSBLAS1_NRM2_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/blas/impl/KokkosBlas1_nrm2w_spec.hpp index c26d8bf004..f4bbe286ef 100644 --- a/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -285,7 +285,5 @@ struct Nrm2w; #include -#include -#include #endif // KOKKOSBLAS1_NRM2W_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrminf_spec.hpp b/blas/impl/KokkosBlas1_nrminf_spec.hpp index 4b39408986..3659d61f19 100644 --- a/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -275,7 +275,5 @@ struct NrmInf; #include -#include -#include #endif // KOKKOSBLAS1_NRMINF_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/blas/impl/KokkosBlas1_reciprocal_spec.hpp index f758acae2f..08fc8bc341 100644 --- a/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -262,7 +262,5 @@ struct Reciprocal; #include -#include -#include #endif // 
KOKKOS_BLAS1_MV_IMPL_RECIPROCAL_HPP_ diff --git a/blas/impl/KokkosBlas1_rot_spec.hpp b/blas/impl/KokkosBlas1_rot_spec.hpp index 6547884d46..214e0399e5 100644 --- a/blas/impl/KokkosBlas1_rot_spec.hpp +++ b/blas/impl/KokkosBlas1_rot_spec.hpp @@ -138,6 +138,5 @@ struct Rot; #include -#include #endif // KOKKOSBLAS1_ROT_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotg_spec.hpp b/blas/impl/KokkosBlas1_rotg_spec.hpp index 9b911a28f6..bdf313e3d0 100644 --- a/blas/impl/KokkosBlas1_rotg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotg_spec.hpp @@ -135,6 +135,5 @@ struct Rotg; #include -#include #endif // KOKKOSBLAS1_ROTG_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotm_spec.hpp b/blas/impl/KokkosBlas1_rotm_spec.hpp index 9cc9ae3e61..854f2abacc 100644 --- a/blas/impl/KokkosBlas1_rotm_spec.hpp +++ b/blas/impl/KokkosBlas1_rotm_spec.hpp @@ -133,6 +133,5 @@ struct Rotm; #include -#include #endif // KOKKOSBLAS1_ROTM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotmg_spec.hpp b/blas/impl/KokkosBlas1_rotmg_spec.hpp index b3aeaa1da3..b90a158654 100644 --- a/blas/impl/KokkosBlas1_rotmg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotmg_spec.hpp @@ -141,6 +141,5 @@ struct Rotmg; #include -#include #endif // KOKKOSBLAS1_ROTMG_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index a7e6ef1f11..38972b2223 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -390,7 +390,5 @@ struct Scal; #include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_SCAL_HPP_ diff --git a/blas/impl/KokkosBlas1_sum_spec.hpp b/blas/impl/KokkosBlas1_sum_spec.hpp index 83fa6fc1d1..458e7ffdb7 100644 --- a/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/blas/impl/KokkosBlas1_sum_spec.hpp @@ -272,7 +272,5 @@ struct Sum; #include -#include -#include #endif // KOKKOSBLAS1_SUM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_swap_spec.hpp b/blas/impl/KokkosBlas1_swap_spec.hpp index ed0a14e257..db09a62f8f 100644 --- a/blas/impl/KokkosBlas1_swap_spec.hpp +++ 
b/blas/impl/KokkosBlas1_swap_spec.hpp @@ -134,6 +134,5 @@ struct Swap; #include -#include #endif // KOKKOSBLAS1_SWAP_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index d1e8692c8a..9a54888012 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ b/blas/impl/KokkosBlas1_update_spec.hpp @@ -391,7 +391,5 @@ struct Update; #include -#include -#include #endif // KOKKOSBLAS1_UPDATE_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index a4582b9d72..42e2465494 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -153,6 +153,5 @@ struct GEMV { false, true>; #include -#include #endif // KOKKOSBLAS1_GEMV_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_gemm_spec.hpp b/blas/impl/KokkosBlas3_gemm_spec.hpp index 5329ec1a9d..c340a41fc1 100644 --- a/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -345,6 +345,5 @@ struct GEMM { EXEC_SPACE, MEM_SPACE) #include -#include #endif // KOKKOSBLAS3_GEMM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_trmm_spec.hpp b/blas/impl/KokkosBlas3_trmm_spec.hpp index fe3096957a..85a8b1c6db 100644 --- a/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -162,6 +162,5 @@ struct TRMM -#include #endif // KOKKOSBLAS3_TRMM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_trsm_spec.hpp b/blas/impl/KokkosBlas3_trsm_spec.hpp index 08e1edd0de..93d01ed53b 100644 --- a/blas/impl/KokkosBlas3_trsm_spec.hpp +++ b/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -165,6 +165,5 @@ struct TRSM -#include #endif // KOKKOSBLAS3_TRSM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas_gesv_spec.hpp b/blas/impl/KokkosBlas_gesv_spec.hpp index 8b554f7130..f1dff467c8 100644 --- a/blas/impl/KokkosBlas_gesv_spec.hpp +++ b/blas/impl/KokkosBlas_gesv_spec.hpp @@ -128,6 +128,5 @@ struct GESV { false, true>; #include -#include #endif // KOKKOSBLAS_IMPL_GESV_SPEC_HPP_ diff --git 
a/blas/impl/KokkosBlas_trtri_spec.hpp b/blas/impl/KokkosBlas_trtri_spec.hpp index 7b3e6b1fc3..2a4d2db576 100644 --- a/blas/impl/KokkosBlas_trtri_spec.hpp +++ b/blas/impl/KokkosBlas_trtri_spec.hpp @@ -123,6 +123,5 @@ struct TRTRI { false, true>; #include -#include #endif // KOKKOSBLAS_TRTRI_SPEC_HPP_ diff --git a/graph/impl/KokkosGraph_color_d1_spec.hpp b/graph/impl/KokkosGraph_color_d1_spec.hpp index af5e2f0751..5d66240763 100644 --- a/graph/impl/KokkosGraph_color_d1_spec.hpp +++ b/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -120,6 +120,4 @@ struct COLOR_D1>, \ false, true>; -#include - #endif diff --git a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 79ca6c778d..2d408f9440 100644 --- a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -377,6 +377,5 @@ struct BSPGEMM_NUMERIC< false, true>; //#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 2f6bb4d9b4..f04ae34fc9 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -509,8 +509,5 @@ struct GAUSS_SEIDEL_APPLY; #include -#include -#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_gmres_spec.hpp b/sparse/impl/KokkosSparse_gmres_spec.hpp index f0498e5efe..bfe1c4539a 100644 --- a/sparse/impl/KokkosSparse_gmres_spec.hpp +++ b/sparse/impl/KokkosSparse_gmres_spec.hpp @@ -158,6 +158,5 @@ struct GMRES; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp index fd3bc2b8bb..142f6dc912 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp @@ -239,6 +239,5 @@ struct PAR_ILUT_NUMERIC; #include -#include #endif diff --git 
a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp index b822d12ab0..512752d3d9 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp @@ -172,6 +172,5 @@ struct PAR_ILUT_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index 04fc372100..e81649f552 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -211,6 +211,5 @@ struct SPADD_NUMERIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index 13f70abfd0..aaab68568a 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -156,6 +156,5 @@ struct SPADD_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp index 5be268a4ef..d36457a893 100644 --- a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp @@ -291,6 +291,5 @@ struct SPGEMM_JACOBI; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp index 352e3384ac..5ade88cb83 100644 --- a/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp @@ -158,6 +158,5 @@ struct SPGEMM_NOREUSE; #include -#include #endif // KOKKOSSPARSE_IMPL_SPGEMM_NOREUSE_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 21faea977c..b325f98796 100644 --- a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -251,6 +251,5 @@ struct SPGEMM_NUMERIC< false, true>; 
#include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp index 671017133a..3a74fb231e 100644 --- a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp @@ -204,6 +204,5 @@ struct SPGEMM_SYMBOLIC; #include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index 6081878e9c..12f8c43caf 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -289,6 +289,5 @@ struct SPILUK_NUMERIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp index 32f306904b..9d8f410918 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp @@ -204,6 +204,5 @@ struct SPILUK_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index e403ee6b20..678aaaa0c5 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -403,7 +403,5 @@ struct SPMV_MV_BSRMATRIX; #include -#include -#include #endif // KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index 329e7b93e3..95cd022159 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -353,9 +353,7 @@ struct SPMV_MV; #include -#include #include -#include #endif // KOKKOSSPARSE_IMPL_SPMV_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp index 7ade8e2536..9b22278db2 100644 --- 
a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp @@ -357,7 +357,5 @@ struct SPMV_MV_STRUCT; #include -#include -#include #endif // KOKKOSSPARSE_IMPL_SPMV_STRUCT_SPEC_HPP_ diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index fce10e3acd..1748016822 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -225,6 +225,5 @@ struct SPTRSV_SOLVE; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index f13c25dec6..73389d10d0 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -144,6 +144,5 @@ struct SPTRSV_SYMBOLIC; #include -#include #endif diff --git a/sparse/impl/KokkosSparse_trsv_spec.hpp b/sparse/impl/KokkosSparse_trsv_spec.hpp index ff4a6d90cd..2e838337d2 100644 --- a/sparse/impl/KokkosSparse_trsv_spec.hpp +++ b/sparse/impl/KokkosSparse_trsv_spec.hpp @@ -189,6 +189,5 @@ struct TRSV; #include -#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ From abe8558b1ff01dd3d32b357b2f3a5e14e1ec74c3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 18 May 2023 07:18:42 -0600 Subject: [PATCH 365/442] Remove remaining decl.hpp files --- .../KokkosBlas2_ger_eti_spec_decl.hpp.in | 25 ----------------- blas/impl/KokkosBlas2_ger_spec.hpp | 1 - .../KokkosSparse_gmres_eti_spec_decl.hpp.in | 28 ------------------- ...arse_par_ilut_numeric_eti_spec_decl.hpp.in | 26 ----------------- ...Sparse_spiluk_numeric_eti_spec_decl.hpp.in | 26 ----------------- 5 files changed, 106 deletions(-) delete mode 100644 blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in delete mode 100644 
sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in delete mode 100644 sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in deleted file mode 100644 index 3ca1a64a8e..0000000000 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in +++ /dev/null @@ -1,25 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ -#define KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ - -namespace KokkosBlas { -namespace Impl { -@BLAS2_GER_ETI_DECL_BLOCK@ -} // namespace Impl -} // namespace KokkosBlas -#endif diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index 8539893658..9802194b98 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -142,6 +142,5 @@ struct GER { false, true>; #include -#include #endif // KOKKOSBLAS2_GER_SPEC_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in deleted file mode 100644 index 980540f7ba..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ 
-#define KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -*/ - -namespace KokkosSparse { -namespace Impl { - -@SPARSE_GMRES_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index 943b721880..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_PAR_ILUT_NUMERIC_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in deleted file mode 100644 index fe5cc1bfa7..0000000000 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in +++ /dev/null @@ -1,26 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ -namespace KokkosSparse { -namespace Impl { - -@SPARSE_SPILUK_NUMERIC_DECL_BLOCK@ - -} // Impl -} // KokkosSparse -#endif // KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ From ebd1406fb3e7e0d93e1579e1afcbafc85033d1a6 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 May 2023 14:35:03 -0400 Subject: [PATCH 366/442] Remove dead code guarded by `#ifdef KOKKOSKERNELS_INST_MEMSPACE_CUDAHOSTPINNEDSPACE` --- perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index ff30fdf565..0f705e1209 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -253,15 +253,9 @@ int main(int argc, char** argv) { #if defined(KOKKOS_ENABLE_CUDA) if (params.use_cuda) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_CUDAHOSTPINNEDSPACE - KokkosKernels::Experiment::run_spgemm_jacobi< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::CudaHostPinnedSpace>(params); -#else KokkosKernels::Experiment::run_spgemm_jacobi< size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, Kokkos::Cuda::memory_space>(params); -#endif } #endif From 3273a031b675767c1b106e950de82bdcb98ff73d Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 May 2023 14:58:46 -0400 Subject: [PATCH 367/442] Do not adjust KokkosKernels_INST_MEMSPACE_CUDA[UVM]SPACE default value depending on whether Kokkos_ENABLE_CUDA_UVM is ON or OFF. Kokkos_ENABLE_CUDA_UVM was deprecated in Kokkos 4.0. 
--- cmake/kokkoskernels_eti_devices.cmake | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index ea03953d29..e6a72123a4 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -46,24 +46,15 @@ IF(KOKKOS_ENABLE_CUDA) "Whether to pre instantiate kernels for the execution space Kokkos::Cuda. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) - # By default, instantiate only for Cuda's default memory space (either CudaSpace, or CudaUVMSpace). - IF(KOKKOS_ENABLE_CUDA_UVM) - SET(CUDA_CUDAUVMSPACE_DEFAULT ON) - SET(CUDA_CUDASPACE_DEFAULT OFF) - ELSE() - SET(CUDA_CUDAUVMSPACE_DEFAULT OFF) - SET(CUDA_CUDASPACE_DEFAULT ON) - ENDIF() - KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDAUVMSPACE - ${CUDA_CUDAUVMSPACE_DEFAULT} + OFF BOOL - "Whether to pre instantiate kernels for the memory space Kokkos::CudaUVMSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the memory space Kokkos::CudaUVMSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: OFF." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDASPACE - ${CUDA_CUDASPACE_DEFAULT} + ON BOOL "Whether to pre instantiate kernels for the memory space Kokkos::CudaSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." 
) From e329be8dc7ac840ea52b8a9cc15d8395d8a07077 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 May 2023 15:03:24 -0400 Subject: [PATCH 368/442] Do not bother querying the value of Kokkos_ENABLE_CUDA_UVM --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2baa77084f..646c89c813 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,10 +40,8 @@ INCLUDE(GNUInstallDirs) IF (KOKKOSKERNELS_HAS_TRILINOS) SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) SET(KOKKOSKERNELS_HEADER_INSTALL_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_ENABLE_CUDA_UVM ${Kokkos_ENABLE_CUDA_UVM}) ELSEIF(KOKKOSKERNELS_HAS_PARENT) SET(KOKKOSKERNELS_HEADER_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}/kokkos-kernels") - SET(KOKKOS_ENABLE_CUDA_UVM ${Kokkos_ENABLE_CUDA_UVM}) ELSE() SET(KOKKOSKERNELS_HEADER_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}") ENDIF() @@ -129,7 +127,6 @@ ELSE() # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) MESSAGE(STATUS "Found Kokkos at ${Kokkos_DIR}") - KOKKOS_CHECK(OPTIONS CUDA_UVM RETURN_VALUE KOKKOS_ENABLE_CUDA_UVM) ENDIF() INCLUDE(cmake/kokkos_backends.cmake) From 6f26e1527cb60e0d2dcc6a7a2985ce826a7f1350 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 18 May 2023 16:09:51 -0400 Subject: [PATCH 369/442] Drop outdated workarounds for backward compatibility with now unsupported Kokkos versions --- cmake/kokkos_backends.cmake | 6 ------ common/src/KokkosKernels_default_types.hpp | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index 9346475f91..a90ad69bf0 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -16,9 +16,3 @@ CHECK_KOKKOS_BACKEND(OPENMPTARGET) CHECK_KOKKOS_BACKEND(CUDA) CHECK_KOKKOS_BACKEND(HIP) CHECK_KOKKOS_BACKEND(SYCL) - -# for backward compatibility. 
can be dropped when requiring Kokkos 3.6 -IF (Kokkos_ENABLE_PTHREAD) - SET(KOKKOS_ENABLE_THREADS ON) - SET(KOKKOSKERNELS_INST_EXECSPACE_THREADS_DEFAULT ON) -ENDIF() diff --git a/common/src/KokkosKernels_default_types.hpp b/common/src/KokkosKernels_default_types.hpp index 9210264b61..672bdf3fbb 100644 --- a/common/src/KokkosKernels_default_types.hpp +++ b/common/src/KokkosKernels_default_types.hpp @@ -65,7 +65,7 @@ using default_device = Kokkos::Experimental::HIP; using default_device = Kokkos::Experimental::OpenMPTarget; #elif defined(KOKKOS_ENABLE_OPENMP) using default_device = Kokkos::OpenMP; -#elif defined(KOKKOS_ENABLE_PTHREAD) || defined(KOKKOS_ENABLE_THREADS) +#elif defined(KOKKOS_ENABLE_THREADS) using default_device = Kokkos::Threads; #else using default_device = Kokkos::Serial; From f75527cd6938ff43e01a8e11c223bd69b21b48ad Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 8 May 2023 16:44:46 -0600 Subject: [PATCH 370/442] SpMV: adding benchmark for spmv This is an initial version, we probably want to improve the followin: - actually check the alg parameter and set the control accordingly - implement TPL logic depending on how the library is built --- perf_test/sparse/CMakeLists.txt | 8 +- .../sparse/KokkosSparse_spmv_benchmark.cpp | 167 ++++++++++++++++++ 2 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_spmv_benchmark.cpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 00d7bdaa4d..c9bd79c92f 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -116,7 +116,7 @@ KOKKOSKERNELS_ADD_EXECUTABLE( SOURCES KokkosSparse_mdf.cpp ) -# if (KokkosKernels_ENABLE_BENCHMARK) +if (KokkosKernels_ENABLE_BENCHMARK) # KOKKOSKERNELS_ADD_BENCHMARK( # sparse_par_ilut # SOURCES KokkosSparse_par_ilut.cpp @@ -131,4 +131,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE( # target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO") # 
target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo) # endif() -# endif() + + KOKKOSKERNELS_ADD_BENCHMARK( + sparse_spmv_benchmark SOURCES KokkosSparse_spmv_benchmark.cpp + ) +endif() diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp new file mode 100644 index 0000000000..69e21d4e31 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -0,0 +1,167 @@ +//@HEADERA +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +// Headers needed to create initial data +// and to check results at the end +#include +#include +#include "KokkosKernels_default_types.hpp" +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +// Headers for benchmark library +#include +#include "Benchmark_Context.hpp" + +// Headers for spmv +#include +#include + +namespace { + +struct spmv_parameters { + + int N, offset; + std::string filename; + std::string alg; + std::string tpl; + + spmv_parameters(const int N_) : N(N_), offset(0), filename(""), alg(""), tpl("") {} + +}; + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --repeat :: how many times to repeat overall test" + << std::endl; + std::cerr << " -s [N] :: generate a semi-random banded (band size 0.01xN)\n" + "NxN matrix with average of 10 entries per row." 
<< std::endl; + std::cerr << "\t[Optional] --alg :: the algorithm to run (classic, merge)" + << std::endl; + std::cerr + << "\t[Optional] --TPL :: when available and compatible with alg, a TPL can be used (cusparse, rocsparse, MKL)" + << std::endl; + std::cerr << " -f [file] : Read in Matrix Market formatted text file 'file'." << std::endl; + std::cerr << " --offset [O] : Subtract O from every index.\n" + << " Useful in case the matrix market file is not 0 based." << std::endl; +} // print_options + +int parse_inputs(int argc, char** argv, spmv_parameters& params) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--alg", + params.alg)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", + params.tpl)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "-f", + params.filename)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--offset", + params.offset)) { + ++i; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +} // namespace + +template +void run_spmv(benchmark::State& state, int argc, char** argv) { + using matrix_type = KokkosSparse::CrsMatrix; + using mv_type = Kokkos::View; + + spmv_parameters inputs(state.range(0)); + parse_inputs(argc, argv, inputs); + + srand(17312837); + matrix_type A; + if (inputs.filename == "") { + int nnz = 10 * inputs.N; + // note: the help text says the bandwidth is fixed at 0.01 * numRows + // CAVEAT: small problem sizes are problematic, b/c of 0.01*numRows + A = KokkosSparse::Impl::kk_generate_sparse_matrix( + inputs.N, inputs.N, nnz, 0, 0.01 * inputs.N); + } else { + A = KokkosSparse::Impl::read_kokkos_crst_matrix(inputs.filename.c_str()); + } + + mv_type x("X", A.numRows()); + mv_type y("Y", A.numCols()); + + Kokkos::Random_XorShift64_Pool 
rand_pool(13718); + Kokkos::fill_random(x, rand_pool, 10); + Kokkos::fill_random(y, rand_pool, 10); + + for (auto _ : state) { + (void)_; + KokkosSparse::spmv(KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); + Kokkos::fence(); + } +} + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + + std::string bench_name = "KokkosSparse_spmv"; + + if (0 < common_params.repeat) { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_spmv, argc, argv) + ->UseRealTime() + ->ArgNames({"n"}) + ->Args({100000}) + ->Iterations(common_params.repeat); + } else { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_spmv, argc, argv) + ->UseRealTime() + ->ArgNames({"n"}) + ->Args({100000}); + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + + return 0; +} From 09dc9ff27e8536dff2e12af3573345f1d0cb3f35 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 9 May 2023 09:31:03 -0600 Subject: [PATCH 371/442] SpMV: applying clang-format to benchmark source file --- .../sparse/KokkosSparse_spmv_benchmark.cpp | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 69e21d4e31..5154f92d93 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -35,14 +35,13 @@ namespace { struct spmv_parameters { - int N, offset; std::string filename; std::string alg; std::string tpl; - spmv_parameters(const int N_) : N(N_), offset(0), filename(""), alg(""), tpl("") {} - + spmv_parameters(const int N_) + : N(N_), offset(0), filename(""), alg(""), tpl("") {} }; void 
print_options() { @@ -53,33 +52,37 @@ void print_options() { std::cerr << "\t[Optional] --repeat :: how many times to repeat overall test" << std::endl; - std::cerr << " -s [N] :: generate a semi-random banded (band size 0.01xN)\n" - "NxN matrix with average of 10 entries per row." << std::endl; - std::cerr << "\t[Optional] --alg :: the algorithm to run (classic, merge)" + std::cerr << " -s [N] :: generate a semi-random banded (band size " + "0.01xN)\n" + "NxN matrix with average of 10 entries per row." + << std::endl; + std::cerr + << "\t[Optional] --alg :: the algorithm to run (classic, merge)" + << std::endl; + std::cerr << "\t[Optional] --TPL :: when available and compatible with " + "alg, a TPL can be used (cusparse, rocsparse, MKL)" << std::endl; std::cerr - << "\t[Optional] --TPL :: when available and compatible with alg, a TPL can be used (cusparse, rocsparse, MKL)" + << " -f [file] : Read in Matrix Market formatted text file 'file'." << std::endl; - std::cerr << " -f [file] : Read in Matrix Market formatted text file 'file'." << std::endl; std::cerr << " --offset [O] : Subtract O from every index.\n" - << " Useful in case the matrix market file is not 0 based." << std::endl; + << " Useful in case the matrix market file is " + "not 0 based." 
+ << std::endl; } // print_options int parse_inputs(int argc, char** argv, spmv_parameters& params) { for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { ++i; - } else if (perf_test::check_arg_str(i, argc, argv, "--alg", - params.alg)) { + } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { ++i; - } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", - params.tpl)) { + } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", params.tpl)) { ++i; - } else if (perf_test::check_arg_str(i, argc, argv, "-f", - params.filename)) { + } else if (perf_test::check_arg_str(i, argc, argv, "-f", params.filename)) { ++i; } else if (perf_test::check_arg_int(i, argc, argv, "--offset", - params.offset)) { + params.offset)) { ++i; } else { std::cerr << "Unrecognized command line argument #" << i << ": " @@ -95,9 +98,8 @@ int parse_inputs(int argc, char** argv, spmv_parameters& params) { template void run_spmv(benchmark::State& state, int argc, char** argv) { - using matrix_type = KokkosSparse::CrsMatrix; + using matrix_type = + KokkosSparse::CrsMatrix; using mv_type = Kokkos::View; spmv_parameters inputs(state.range(0)); @@ -112,7 +114,8 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { A = KokkosSparse::Impl::kk_generate_sparse_matrix( inputs.N, inputs.N, nnz, 0, 0.01 * inputs.N); } else { - A = KokkosSparse::Impl::read_kokkos_crst_matrix(inputs.filename.c_str()); + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + inputs.filename.c_str()); } mv_type x("X", A.numRows()); @@ -143,16 +146,14 @@ int main(int argc, char** argv) { if (0 < common_params.repeat) { benchmark::RegisterBenchmark( - bench_name.c_str(), - run_spmv, argc, argv) + bench_name.c_str(), run_spmv, argc, argv) ->UseRealTime() ->ArgNames({"n"}) ->Args({100000}) ->Iterations(common_params.repeat); } else { benchmark::RegisterBenchmark( - bench_name.c_str(), - run_spmv, argc, argv) + bench_name.c_str(), run_spmv, argc, argv) 
->UseRealTime() ->ArgNames({"n"}) ->Args({100000}); From e3b6eb19eb6baa001d910cc931f1ca61150618a0 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 9 May 2023 10:05:27 -0600 Subject: [PATCH 372/442] SpMV: adding logic in benchmark to chose algorithm to test. --- .../sparse/KokkosSparse_spmv_benchmark.cpp | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 5154f92d93..4f9686322a 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -52,9 +52,9 @@ void print_options() { std::cerr << "\t[Optional] --repeat :: how many times to repeat overall test" << std::endl; - std::cerr << " -s [N] :: generate a semi-random banded (band size " - "0.01xN)\n" - "NxN matrix with average of 10 entries per row." + std::cerr << " -s [N] :: generate a semi-random banded (band size 0.01xN)\n" + "NxN matrix with average of 10 entries per row." 
<< std::endl; + std::cerr << "\t[Optional] --alg :: the algorithm to run (native, merge)" << std::endl; std::cerr << "\t[Optional] --alg :: the algorithm to run (classic, merge)" @@ -75,7 +75,11 @@ int parse_inputs(int argc, char** argv, spmv_parameters& params) { for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { ++i; - } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { + } else if (perf_test::check_arg_str(i, argc, argv, "--alg", + params.alg)) { + if((params.alg != "") && (params.alg != "native") && (params.alg != "merge")) { + throw std::runtime_error("--alg can only be an empty string, `native` or `merge`!"); + } ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", params.tpl)) { ++i; @@ -102,15 +106,20 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { KokkosSparse::CrsMatrix; using mv_type = Kokkos::View; + // Set input parameters spmv_parameters inputs(state.range(0)); parse_inputs(argc, argv, inputs); + KokkosKernels::Experimental::Controls controls; + if(inputs.alg == "native") { + controls.setParameter("algorithm", "native"); + } + + // Create test matrix srand(17312837); matrix_type A; if (inputs.filename == "") { int nnz = 10 * inputs.N; - // note: the help text says the bandwidth is fixed at 0.01 * numRows - // CAVEAT: small problem sizes are problematic, b/c of 0.01*numRows A = KokkosSparse::Impl::kk_generate_sparse_matrix( inputs.N, inputs.N, nnz, 0, 0.01 * inputs.N); } else { @@ -118,6 +127,7 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { inputs.filename.c_str()); } + // Create input vectors mv_type x("X", A.numRows()); mv_type y("Y", A.numCols()); @@ -125,6 +135,7 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { Kokkos::fill_random(x, rand_pool, 10); Kokkos::fill_random(y, rand_pool, 10); + // Run the actual experiments for (auto _ : state) { (void)_; KokkosSparse::spmv(KokkosSparse::NoTranspose, 1.0, A, x, 0.0, 
y); From 29c24f2bdf109e1ac78ecf656b7c8fa41354f4f9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 9 May 2023 10:08:23 -0600 Subject: [PATCH 373/442] SpMV: applying clang-format to benchmark --- .../sparse/KokkosSparse_spmv_benchmark.cpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 4f9686322a..48914ea8ed 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -52,10 +52,13 @@ void print_options() { std::cerr << "\t[Optional] --repeat :: how many times to repeat overall test" << std::endl; - std::cerr << " -s [N] :: generate a semi-random banded (band size 0.01xN)\n" - "NxN matrix with average of 10 entries per row." << std::endl; - std::cerr << "\t[Optional] --alg :: the algorithm to run (native, merge)" + std::cerr << " -s [N] :: generate a semi-random banded (band size " + "0.01xN)\n" + "NxN matrix with average of 10 entries per row." 
<< std::endl; + std::cerr + << "\t[Optional] --alg :: the algorithm to run (native, merge)" + << std::endl; std::cerr << "\t[Optional] --alg :: the algorithm to run (classic, merge)" << std::endl; @@ -75,10 +78,11 @@ int parse_inputs(int argc, char** argv, spmv_parameters& params) { for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { ++i; - } else if (perf_test::check_arg_str(i, argc, argv, "--alg", - params.alg)) { - if((params.alg != "") && (params.alg != "native") && (params.alg != "merge")) { - throw std::runtime_error("--alg can only be an empty string, `native` or `merge`!"); + } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { + if ((params.alg != "") && (params.alg != "native") && + (params.alg != "merge")) { + throw std::runtime_error( + "--alg can only be an empty string, `native` or `merge`!"); } ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", params.tpl)) { @@ -111,7 +115,7 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { parse_inputs(argc, argv, inputs); KokkosKernels::Experimental::Controls controls; - if(inputs.alg == "native") { + if (inputs.alg == "native") { controls.setParameter("algorithm", "native"); } @@ -120,7 +124,7 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { matrix_type A; if (inputs.filename == "") { int nnz = 10 * inputs.N; - A = KokkosSparse::Impl::kk_generate_sparse_matrix( + A = KokkosSparse::Impl::kk_generate_sparse_matrix( inputs.N, inputs.N, nnz, 0, 0.01 * inputs.N); } else { A = KokkosSparse::Impl::read_kokkos_crst_matrix( From 1a69ed2ae26500b12ab8bfa4c8077d672e31358d Mon Sep 17 00:00:00 2001 From: Luc Date: Sun, 14 May 2023 20:31:45 -0700 Subject: [PATCH 374/442] SpMV benchmark: adding logic for spmv algorithm --- perf_test/sparse/KokkosSparse_spmv_benchmark.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp 
b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 48914ea8ed..27070c2e60 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -57,7 +57,7 @@ void print_options() { "NxN matrix with average of 10 entries per row." << std::endl; std::cerr - << "\t[Optional] --alg :: the algorithm to run (native, merge)" + << "\t[Optional] --alg :: the algorithm to run (default, native, merge)" << std::endl; std::cerr << "\t[Optional] --alg :: the algorithm to run (classic, merge)" @@ -79,10 +79,10 @@ int parse_inputs(int argc, char** argv, spmv_parameters& params) { if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { - if ((params.alg != "") && (params.alg != "native") && - (params.alg != "merge")) { + if ((params.alg != "") && (params.alg != "default") && + (params.alg != "native") && (params.alg != "merge")) { throw std::runtime_error( - "--alg can only be an empty string, `native` or `merge`!"); + "--alg can only be an empty string, `default`, `native` or `merge`!"); } ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", params.tpl)) { @@ -115,8 +115,9 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { parse_inputs(argc, argv, inputs); KokkosKernels::Experimental::Controls controls; - if (inputs.alg == "native") { - controls.setParameter("algorithm", "native"); + if ((inputs.alg == "default") || (inputs.alg == "native") + || (inputs.alg == "merge")) { + controls.setParameter("algorithm", inputs.alg); } // Create test matrix @@ -141,8 +142,7 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { // Run the actual experiments for (auto _ : state) { - (void)_; - KokkosSparse::spmv(KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); + KokkosSparse::spmv(controls, KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); Kokkos::fence(); } } From 2b3a070c1f9736da79a4e1242d84aa8c8c0175b7 Mon Sep 17 
00:00:00 2001 From: Luc Berger-Vergiat Date: Sun, 14 May 2023 21:34:36 -0600 Subject: [PATCH 375/442] applying clang-format --- perf_test/sparse/KokkosSparse_spmv_benchmark.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 27070c2e60..3c2374d655 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -56,9 +56,9 @@ void print_options() { "0.01xN)\n" "NxN matrix with average of 10 entries per row." << std::endl; - std::cerr - << "\t[Optional] --alg :: the algorithm to run (default, native, merge)" - << std::endl; + std::cerr << "\t[Optional] --alg :: the algorithm to run (default, " + "native, merge)" + << std::endl; std::cerr << "\t[Optional] --alg :: the algorithm to run (classic, merge)" << std::endl; @@ -80,9 +80,10 @@ int parse_inputs(int argc, char** argv, spmv_parameters& params) { ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { if ((params.alg != "") && (params.alg != "default") && - (params.alg != "native") && (params.alg != "merge")) { + (params.alg != "native") && (params.alg != "merge")) { throw std::runtime_error( - "--alg can only be an empty string, `default`, `native` or `merge`!"); + "--alg can only be an empty string, `default`, `native` or " + "`merge`!"); } ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--TPL", params.tpl)) { @@ -115,8 +116,8 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { parse_inputs(argc, argv, inputs); KokkosKernels::Experimental::Controls controls; - if ((inputs.alg == "default") || (inputs.alg == "native") - || (inputs.alg == "merge")) { + if ((inputs.alg == "default") || (inputs.alg == "native") || + (inputs.alg == "merge")) { controls.setParameter("algorithm", inputs.alg); } From 5ea1c3c32d345ca1f4afa58f6aa1226168abda57 Mon Sep 17 00:00:00 2001 From: Luc 
Berger Date: Mon, 22 May 2023 11:12:02 -0600 Subject: [PATCH 376/442] Update perf_test/sparse/KokkosSparse_spmv_benchmark.cpp Co-authored-by: brian-kelley --- perf_test/sparse/KokkosSparse_spmv_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 3c2374d655..523e33b72d 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -1,4 +1,4 @@ -//@HEADERA +//@HEADER // ************************************************************************ // // Kokkos v. 4.0 From 82d93a25cbad9f17da89c99d5fd7f9ce5a7dd651 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 22 May 2023 15:50:26 -0600 Subject: [PATCH 377/442] Support rocSparse in rocm 5.2.0 (#1833) --- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 5 ++-- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 3ce22c630a..6846e27748 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -265,8 +265,7 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, enum : bool { value = true }; \ }; -// These things may also be valid before 5.4, but I haven't tested it. 
-#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50200 KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(float, rocsparse_int, rocsparse_int, @@ -305,7 +304,7 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, Kokkos::HIPSpace) -#endif // KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 +#endif // KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50200 #undef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index cc3e2a6b1e..36a64228b8 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -929,6 +929,30 @@ void spmv_block_impl_rocsparse( KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr)); rocsparse_mat_info info; KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info)); + + // *_ex* functions introduced in 5.4.0 +#if KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, beta_, y_)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, beta_, y_)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, beta_, y_)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( + handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, x_, beta_, y_)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for 
rocsparse_*bsrmv"); + } +#else if constexpr (std::is_same_v) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis( handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, @@ -965,6 +989,7 @@ void spmv_block_impl_rocsparse( static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } +#endif rocsparse_destroy_mat_descr(descr); rocsparse_destroy_mat_info(info); From 237597a00da141d68bc32ca331c201a32a0b3939 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 23 May 2023 11:57:09 -0600 Subject: [PATCH 378/442] cm_test_all_sandia: update to add caraway queues for MI210, MI250 - fat*: caraway MI250 queue - lean*: caraway MI210 queue cm_generate_makefile.bash Update AMD GPU arch options - Add VEGA90A for MI200 series GPUs - Remove VEGA900 (support dropped with 4.0 release) --- cm_generate_makefile.bash | 4 ++-- scripts/cm_test_all_sandia | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 21d3176cec..913b4e67a5 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -263,9 +263,9 @@ display_help_text() { echo " ZEN = AMD Zen-Core CPU" echo " ZEN2 = AMD Zen2-Core CPU" echo " [AMD: GPU]" - echo " VEGA900 = AMD GPU MI25 GFX900" echo " VEGA906 = AMD GPU MI50/MI60 GFX906" - echo " VEGA908 = AMD GPU" + echo " VEGA908 = AMD GPU MI100 GFX908" + echo " VEGA90A = AMD GPU MI200 series GFX90A" echo " [ARM]" echo " ARMV80 = ARMv8.0 Compatible CPU" echo " ARMV81 = ARMv8.1 Compatible CPU" diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 5dcd70d885..b8a52067e0 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -176,6 +176,14 @@ if [[ "$HOSTNAME" == caraway* ]]; then # Warning: very generic name MACHINE=caraway fi +if [[ "$HOSTNAME" == fat* ]]; then # Caraway MI250 queues + MACHINE=caraway +fi + +if [[ "$HOSTNAME" == lean* ]]; then # Caraway MI210 queues + MACHINE=caraway 
+fi + if [[ "$HOSTNAME" == kokkos-dev\.sandia\.gov* ]]; then MACHINE=kokkos-dev fi From 7b6073bb932db42f5b8da75bc84fc5892cc02532 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 May 2023 13:49:40 -0600 Subject: [PATCH 379/442] batched/eti: ETI host-level interfaces --- CMakeLists.txt | 1 - batched/CMakeLists.txt | 50 ++ .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 680 ++++++++++++++++++ batched/dense/src/KokkosBatched_Gemm_Decl.hpp | 537 +------------- .../unit_test/Test_Batched_BatchedGemm.hpp | 81 ++- .../Test_Batched_BatchedGemm_Complex.hpp | 64 +- .../Test_Batched_BatchedGemm_Real.hpp | 131 ++-- ...atched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in | 23 + ...atched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in | 23 + ...Batched_Gemm_nt_t_bll_eti_spec_inst.cpp.in | 23 + ...Batched_Gemm_nt_t_blr_eti_spec_inst.cpp.in | 23 + ...Batched_Gemm_t_nt_bll_eti_spec_inst.cpp.in | 23 + ...Batched_Gemm_t_nt_blr_eti_spec_inst.cpp.in | 23 + ...sBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in | 23 + ...sBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in | 23 + ...tched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in | 0 ...atched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in | 0 ...tched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in | 0 ...atched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in | 0 ...atched_Gemm_nt_t_bll_eti_spec_avail.hpp.in | 0 ...Batched_Gemm_nt_t_bll_eti_spec_decl.hpp.in | 0 ...atched_Gemm_nt_t_blr_eti_spec_avail.hpp.in | 0 ...Batched_Gemm_nt_t_blr_eti_spec_decl.hpp.in | 0 ...atched_Gemm_t_nt_bll_eti_spec_avail.hpp.in | 0 ...Batched_Gemm_t_nt_bll_eti_spec_decl.hpp.in | 0 ...atched_Gemm_t_nt_blr_eti_spec_avail.hpp.in | 0 ...Batched_Gemm_t_nt_blr_eti_spec_decl.hpp.in | 0 ...Batched_Gemm_t_t_bll_eti_spec_avail.hpp.in | 0 ...sBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in | 0 ...Batched_Gemm_t_t_blr_eti_spec_avail.hpp.in | 0 ...sBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in | 0 test_common/Test_Cuda.hpp | 2 + test_common/Test_HIP.hpp | 2 + test_common/Test_OpenMP.hpp | 2 + test_common/Test_OpenMPTarget.hpp | 2 + test_common/Test_SYCL.hpp 
| 2 + test_common/Test_Serial.hpp | 2 + test_common/Test_Threads.hpp | 2 + 38 files changed, 1062 insertions(+), 680 deletions(-) create mode 100644 batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in create mode 100644 
batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in create mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 2baa77084f..906efa6200 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,6 @@ ELSE() # This doesn't change pre-existing behavior before the ETI changes #LIST(APPEND HEADERS ${ETI_HEADERS}) #----------------------------------------------------------------------------- - KOKKOSKERNELS_ADD_LIBRARY( kokkoskernels HEADERS ${HEADERS} diff --git a/batched/CMakeLists.txt b/batched/CMakeLists.txt index 2816620e87..8d4319e63d 100644 --- a/batched/CMakeLists.txt +++ b/batched/CMakeLists.txt @@ -19,3 +19,53 @@ ENDIF() # Adding unit-tests KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/batched) KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/batched) + +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_bll Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_bll Gemm + COMPONENTS batched + 
HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_bll Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_bll Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) +KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_blr Gemm + COMPONENTS batched + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) \ No newline at end of file diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp new file mode 100644 index 0000000000..b532fd1d0f --- /dev/null +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -0,0 +1,680 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ +#include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION +#include "KokkosBatched_Util.hpp" // Trans, BatchLayout +#include +#include +#include + +namespace KokkosBatched { +namespace Impl { +/********************* BEGIN forward declarations *********************/ +// clang-format off +/// \brief Non-blocking general matrix multiply on a batch of +/// uniform matrices. +/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgMode Specifies algorithm mode to use for serial work: +/// Algo::Gemm::Unblocked for no register blocking +/// Algo::Gemm::Blocked for register blocking +/// Algo::Gemm::CompactMKL for mkl compact tpl interface +/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// AViewType, BViewType, and CViewType: +/// BatchSzDim::Left Batch dimension is leftmost +/// BatchSzDim::Right Batch dimension is rightmost +/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For +/// this serial interface, each rank specifies how +/// much work to assign a single thread. +/// ResultsPerThread::Rank0 Each thread computes a scalar of C +/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C +/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. 
+/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank +/// Kokkos::View or a 4-rank Kokkos::View for SIMD +/// operations. +/// +/// See struct BatchedGemmHandle for details. +/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// +/// Usage Example: +/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); +// clang-format on +template +class BatchedSerialGemm; + +// clang-format off +/// \brief Non-blocking general matrix multiply on a batch of +/// uniform matrices with an algorithm based on: +/// B. P. D. J. Kunkel, Julian, “Performance, design, and autotuning of batched gemm for GPUs,” +/// in Lecture Notes in Computer Science, ser. ISC High Performance Computing ’16, vol. 9697, 06 2016. 
+/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// AViewType, BViewType, and CViewType: +/// BatchSzDim::Left Batch dimension is leftmost +/// BatchSzDim::Right Batch dimension is rightmost +/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For +/// this serial interface, each rank specifies how +/// much work to assign a single thread. +/// ResultsPerThread::Rank0 Each thread computes a scalar of C +/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C +/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C +/// \tparam HandleType Specifies the handle type of the kernel handle +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank +/// Kokkos::View or a 4-rank Kokkos::View for SIMD +/// operations. +/// \tparam ArgBoundsCheck Specifies whether to perform global memory access +/// bounds checks within the functor. Bounds checks +/// are required when matrix sizes are not evenly divisible +/// by tile sizes. +/// BoundsCheck::Yes The functor will perform bound checks (recommended) +/// BoundsCheck::No The functor will NOT perform bound checks +/// \tparam ArgAlphaFmaTag Specifies whether to apply alpha during fmas. 
+/// AlphaFmaTag::Yes alpha will be applied during fma (C = C * alpha + AB). +/// AlphaFmaTag::No alpha will be applied during mul (A * B * alpha). +/// \tparam TILE_M Specifies the number of rows in each tile. +/// \tparam TILE_N Specifies the number of cols in each tile. +/// \tparam TILE_K Specifies the number of cols or rows in a tile of A or tile of B, respectively. +/// +/// See struct BatchedGemmHandle for details. +/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// +/// Usage Example: +/// BatchedDblBufGemm(alpha, A, B, beta, C).invoke(); +// clang-format on +template +class BatchedDblBufGemm; + +//////////////////////////////// tile_m ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() { + return 32; +} +//////////////////////////////// tile_n ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() { + return 32; +} +//////////////////////////////// tile_k ////////////////////////////////// +template +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() { + return 8; +} + +// On MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails +// without this. See https://github.com/kokkos/kokkos-kernels/issues/1547. 
+// This reduces the register allocations (REG_M and REG_N) in the double +// buffering algorithm by a factor of 2. +#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) +template <> +constexpr KOKKOS_INLINE_FUNCTION int +kk_gemm_dlb_buf_tile_k() { + return 16; +} +#endif +////////////////////////// alpha_in_fma_thresh //////////////////////////// +constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { +#ifdef __CUDACC_RDC__ + return 24; +#else + return 64; +#endif // __CUDACC_RDC__ +} + +// clang-format off +/// \brief Blocking general matrix multiply on a batch of uniform matrices. +/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam HandleType Specifies the handle type of the kernel handle +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as a 3-rank Kokkos::View +/// \tparam BViewType Input matrix, as a 3-rank Kokkos::View +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as a 3-rank +/// Kokkos::View +/// +/// See struct BatchedGemmHandle for details +/// \param handle [in] A handle which specifies how to invoke the batched +/// gemm. handle->get_tpl_params() returns &ninter. +/// ninter: The number of matrices to interleave. 
+/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// + +/// Usage Example: +/// BatchedArmplGemm +/// (handle, alpha, A, B, beta, C).invoke(); +// clang-format on +template +class BatchedArmplGemm; +/********************* END forward declarations *********************/ + +template +struct BatchedGemmWrapperInner { + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, const ScalarType beta, + const CViewType &C) { + int ret = 0; + size_t c_m, c_n; + using ViewValueType = typename CViewType::value_type; + // Check for valid input views + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "CViewType must be a Kokkos::View."); + static_assert( + std::is_same::value || + std::is_same::value, + "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); + static_assert( + std::is_same::value || + std::is_same::value, + "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); + if constexpr (is_vector::value) { + // Check ranks of view with underlying SIMD value types + // For SIMD views, we can have either 3-rank or 4-ranks inputs. 
+ switch (handle->get_kernel_algo_type()) { + case BaseKokkosBatchedAlgos::KK_SERIAL: + case BaseHeuristicAlgos::SQUARE: + case BaseTplAlgos::ARMPL: + static_assert(static_cast(AViewType::rank) == 3, + "AViewType must have rank 3."); + static_assert(static_cast(BViewType::rank) == 3, + "BViewType must have rank 3."); + static_assert(static_cast(CViewType::rank) == 3, + "CViewType must have rank 3."); + break; + default: + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) + << " with SIMD views." << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + break; + } + } else { + // Check ranks of views with underlying scalar value types + static_assert(static_cast(AViewType::rank) == 3, + "AViewType must have rank 3."); + static_assert(static_cast(BViewType::rank) == 3, + "BViewType must have rank 3."); + static_assert(static_cast(CViewType::rank) == 3, + "CViewType must have rank 3."); + } + + // Check for valid data access patterns + // Skip checking a_layout == b_layout == c_layout + // Skip checking for LayoutStride + using c_layout = typename CViewType::array_layout; + static_assert(!(std::is_same::value && + !std::is_same::value), + "LayoutLeft views require BatchLayout::Right"); + static_assert(!(std::is_same::value && + !std::is_same::value), + "LayoutRight views require BatchLayout::Left"); + + if constexpr (std::is_same::value) { + // c_b = C.extent(0); + c_m = C.extent(1); + c_n = C.extent(2); + } else { + // c_b = C.extent(2); + c_m = C.extent(0); + c_n = C.extent(1); + } + + // Begin checking conditions for optimal BatchedGemm invocation. 
+ using view_scalar_type = typename CViewType::value_type; + using layout_type = typename CViewType::array_layout; + using exec_space = typename CViewType::execution_space; + constexpr bool is_vector = + KokkosBatched::is_vector::value; + constexpr bool on_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< + typename exec_space::memory_space>(); + constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< + typename exec_space::memory_space>(); + + if (handle->enableDebug) { + std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() + << std::endl + << "execution_space:" << typeid(exec_space).name() << std::endl + << std::endl + << "is_vector:" << is_vector << std::endl + << "on_gpu:" << on_gpu << std::endl + << "on_x86_64:" << on_x86_64 << std::endl + << "on_a64fx:" << on_a64fx << std::endl; + } + + switch (handle->get_kernel_algo_type()) { + ////////////// HEURISTIC ALGOS ////////////// + case BaseHeuristicAlgos::SQUARE: + if (c_m != c_n) { + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" + << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" + << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + // Select optimal resultsPerThread param for BatchedSerialGemm + using bsgResultsPerThread = + typename std::conditional::type; + + // Select optimal mode param for SerialGemm. 
+ using bsgModeType = typename std::conditional< + is_vector, + typename std::conditional::type, + typename std::conditional< + on_gpu, Algo::Gemm::Unblocked, + typename std::conditional::type>::type>:: + type; + + if (handle->enableDebug) { + std::cout << "bsgResultsPerThread: " + << typeid(bsgResultsPerThread).name() << std::endl + << "bsgModeType: " << typeid(bsgModeType).name() + << std::endl; + } + + if constexpr (on_gpu && + ((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { + handle->teamSz = handle->vecLen = 8; + constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); + constexpr size_t alpha_in_fma_thresh = + Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); + + if (c_m % 32 == 0) { // No bounds checking + if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } else { // apply alpha in mul + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } + } else { // bounds checking + if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } else { // apply alpha in mul + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + } + } + } else { + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); + } + break; + + // case BaseHeuristicAlgos::TALL: + // + // case BaseHeuristicAlgos::WIDE: + ////////////// TPL ALGOS ////////////// +#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 + case BaseTplAlgos::ARMPL: + ret = Impl::BatchedArmplGemm( + handle, alpha, A, B, beta, C) + .invoke(); + break; +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL + // case BaseTplAlgos::MKL: + // + // case GemmTplAlgos::CUBLAS: + // + // case GemmTplAlgos::MAGMA: + + ////////////// KokkosBatched ALGOS ////////////// + case 
BaseKokkosBatchedAlgos::KK_SERIAL: + ret = + Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); + break; + + // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: + + case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: + ret = + Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); + break; + + // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: + // case GemmKokkosBatchedAlgos::KK_TEAM: + // case GemmKokkosBatchedAlgos::KK_TEAMVECTOR: + // case GemmKokkosBatchedAlgos::KK_TEAMSIMD: + + case GemmKokkosBatchedAlgos::KK_DBLBUF: + // Note: The tile sizes of 1x1x1 here will not perform well but must be + // selected in order to function on all devices since the serial + // execution space has a max team size of 1. KokkosKernels API users + // will need to follow an approach similar to KK_SQUARE above for best + // performance. + + // TODO: Add auto-selection of tile size based on inputs and device type + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) + .invoke(); + break; + + default: + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) << "." + << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + break; + } + return ret; + } +}; + +// Primary template +template +struct BatchedGemmWrapper { + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, const ScalarType beta, + const CViewType &C); +}; + +// ETI specialization +// KOKKOSKERNELS_IMPL_COMPILE_LIBRARY should only be set +// when the *.cpp.in files include this. 
+#if KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +template +struct BatchedGemmWrapper { + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, const ScalarType beta, + const CViewType &C) { +#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION + printf( + "KokkosBatched::BatchedGemm<> ETI specialization for < %s, %s, %s, " + "%s, %s, %s, %s, %s >\n", + typeid(ArgTransA).name(), typeid(ArgTransB).name(), + typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), + typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(BViewType).name(), typeid(CViewType).name()); +#endif + return Impl::BatchedGemmWrapperInner< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, + AViewType, BViewType, CViewType>::run(handle, alpha, A, B, beta, C); + } +}; +#endif + +// If KOKKOSKERNELS_ETI_ONLY is defined, restrict invocations to ETI types only +// via above ETI specialization. +#if !defined(KOKKOSKERNELS_ETI_ONLY) +// Non-ETI specialization +template +struct BatchedGemmWrapper { + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, const ScalarType beta, + const CViewType &C) { +#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION + printf( + "KokkosBatched::BatchedGemm<> non-ETI specialization for < %s, %s, " + "%s, %s, %s, %s, %s, %s >\n", + typeid(ArgTransA).name(), typeid(ArgTransB).name(), + typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), + typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(BViewType).name(), typeid(CViewType).name()); +#endif + return Impl::BatchedGemmWrapperInner< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, + AViewType, BViewType, CViewType>::run(handle, alpha, A, B, beta, C); + } +}; +#endif + +// TODO: Include MEM_SPACE in Kokkos::View below? 
+#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct BatchedGemmWrapper< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View>, \ + Kokkos::View>, \ + Kokkos::View>, \ + true>; + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + 
KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +} // namespace Impl +} // namespace KokkosBatched +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +#include "KokkosBatched_Gemm_Team_Impl.hpp" +#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" +#include "KokkosBatched_Gemm_DblBuf_Impl.hpp" +#include "KokkosBatched_Gemm_Armpl_Impl.hpp" +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ \ No newline at end of file diff --git a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp index d182197192..af05aca2ae 100644 --- 
a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp @@ -16,14 +16,9 @@ #ifndef __KOKKOSBATCHED_GEMM_DECL_HPP__ #define __KOKKOSBATCHED_GEMM_DECL_HPP__ -#include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" -// Includes for non-functor-level routines -#include -#include -#include - +/********************* BEGIN non-functor-level routines *********************/ namespace KokkosBatched { /********************* BEGIN functor-level routines *********************/ /// @@ -93,231 +88,11 @@ struct Gemm { }; /********************* END functor-level routines *********************/ -/********************* BEGIN non-functor-level routines *********************/ - namespace Impl { -/********************* BEGIN forward declarations *********************/ -// clang-format off -/// \brief Non-blocking solve of general matrix multiply on a batch of -/// uniform matrices. -/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgMode Specifies algorithm mode to use for serial work: -/// Algo::Gemm::Unblocked for no register blocking -/// Algo::Gemm::Blocked for register blocking -/// Algo::Gemm::CompactMKL for mkl compact tpl interface -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchSzDim::Left Batch dimension is leftmost -/// BatchSzDim::Right Batch dimension is rightmost -/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For -/// this serial interface, each rank specifies how -/// much work to assign a single thread. 
-/// ResultsPerThread::Rank0 Each thread computes a scalar of C -/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C -/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// -/// See struct BatchedGemmHandle for details. -/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedSerialGemm; - -// clang-format off -/// \brief Non-blocking solve of general matrix multiply on a batch of -/// uniform matrices with an algorithm based on: -/// B. P. D. J. Kunkel, Julian, “Performance, design, and autotuning of batched gemm for GPUs,” -/// in Lecture Notes in Computer Science, ser. ISC High Performance Computing ’16, vol. 9697, 06 2016. 
-/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchSzDim::Left Batch dimension is leftmost -/// BatchSzDim::Right Batch dimension is rightmost -/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For -/// this serial interface, each rank specifies how -/// much work to assign a single thread. -/// ResultsPerThread::Rank0 Each thread computes a scalar of C -/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C -/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C -/// \tparam HandleType Specifies the handle type of the kernel handle -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// \tparam ArgBoundsCheck Specifies whether to perform global memory access -/// bounds checks within the functor. Bounds checks -/// are required when matrix sizes are not evenly divisible -/// by tile sizes. -/// BoundsCheck::Yes The functor will perform bound checks (recommended) -/// BoundsCheck::No The functor will NOT perform bound checks -/// \tparam ArgAlphaFmaTag Specifies whether to apply alpha during fmas. 
-/// AlphaFmaTag::Yes alpha will be applied during fma (C = C * alpha + AB). -/// AlphaFmaTag::No alpha will be applied during mul (A * B * alpha). -/// \tparam TILE_M Specifies the number of rows in each tile. -/// \tparam TILE_N Specifies the number of cols in each tile. -/// \tparam TILE_K Specifies the number of cols or rows in a tile of A or tile of B, respectively. -/// -/// See struct BatchedGemmHandle for details. -/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedDblBufGemm; - -//////////////////////////////// tile_m ////////////////////////////////// -template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() { - return 32; -} -//////////////////////////////// tile_n ////////////////////////////////// -template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() { - return 32; -} -//////////////////////////////// tile_k ////////////////////////////////// -template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() { - return 8; -} - -// On MI100, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_right fails -// without this. See https://github.com/kokkos/kokkos-kernels/issues/1547. 
-// This reduces the register allocations (REG_M and REG_N) in the double -// buffering algorithm by a factor of 2. -#if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) -template <> -constexpr KOKKOS_INLINE_FUNCTION int -kk_gemm_dlb_buf_tile_k() { - return 16; -} -#endif -////////////////////////// alpha_in_fma_thresh //////////////////////////// -constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { -#ifdef __CUDACC_RDC__ - return 24; -#else - return 64; -#endif // __CUDAACC_RDC__ -} - -// clang-format off -/// \brief Blocking solve of general matrix multiply on a batch of uniform matrices. -/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam HandleType Specifies the handle type of the kernel handle -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as a 3-rank Kokkos::View -/// \tparam BViewType Input matrix, as a 3-rank Kokkos::View -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as a 3-rank -/// Kokkos::View -/// -/// See struct BatchedGemmHandle for details -/// \param handle [in] A handle which specifies how to invoke the batched -/// gemm. handle->get_tpl_params() returns &ninter. -/// ninter: The number of matrices to interleave. 
-/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// - -/// Usage Example: -/// BatchedArmplGemm -/// (handle, alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedArmplGemm; -/********************* END forward declarations *********************/ +template +struct BatchedGemmWrapper; } // namespace Impl // clang-format off @@ -376,289 +151,35 @@ class BatchedArmplGemm; template -int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C) { - int ret = 0; - size_t c_m, c_n; - using ViewValueType = typename CViewType::value_type; - // Check for valid input views - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); - if (is_vector::value) { - // Check ranks of view with underlying SIMD value types 
- // For SIMD views, we can have either 3-rank or 4-ranks inputs. - switch (handle->get_kernel_algo_type()) { - case BaseKokkosBatchedAlgos::KK_SERIAL: - case BaseHeuristicAlgos::SQUARE: - case BaseTplAlgos::ARMPL: - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); - break; - - // TODO: check this once KK_TEAM is supported - // case GemmKokkosBatchedAlgos::KK_TEAM: - // static_assert(static_cast(AViewType::rank) == 4, - // "AViewType must have rank 4."); - // static_assert(static_cast(BViewType::rank) == 4, - // "BViewType must have rank 4."); - // static_assert(static_cast(CViewType::rank) == 4, - // "CViewType must have rank 4."); - // break; - - default: - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) - << " with SIMD views." 
<< std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - break; - } - } else { - // Check ranks of views with underlying scalar value types - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); - } - - // Check for valid data access patterns - // Skip checking a_layout == b_layout == c_layout - // Skip checking for LayoutStride - using c_layout = typename CViewType::array_layout; - if (std::is_same::value && - !std::is_same::value) { - throw std::runtime_error( - "Error: LayoutLeft views require BatchLayout::Right"); - } - if (std::is_same::value && - !std::is_same::value) { - throw std::runtime_error( - "Error: LayoutRight views require BatchLayout::Left"); - } - - if (std::is_same::value) { - // c_b = C.extent(0); - c_m = C.extent(1); - c_n = C.extent(2); - } else { - // c_b = C.extent(2); - c_m = C.extent(0); - c_n = C.extent(1); - } - - // Begin checking conditions for optimal BatchedGemm invocation. 
- using view_scalar_type = typename CViewType::value_type; - using layout_type = typename CViewType::array_layout; - using exec_space = typename CViewType::execution_space; - constexpr bool is_vector = KokkosBatched::is_vector::value; - constexpr bool on_gpu = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< - typename exec_space::memory_space>(); - constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< - typename exec_space::memory_space>(); - - if (handle->enableDebug) { - std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() - << std::endl - << "execution_space:" << typeid(exec_space).name() << std::endl - << std::endl - << "is_vector:" << is_vector << std::endl - << "on_gpu:" << on_gpu << std::endl - << "on_x86_64:" << on_x86_64 << std::endl - << "on_a64fx:" << on_a64fx << std::endl; - } - - switch (handle->get_kernel_algo_type()) { - ////////////// HEURISTIC ALGOS ////////////// - case BaseHeuristicAlgos::SQUARE: - if (c_m != c_n) { - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" - << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" - << std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - // Select optimal resultsPerThread param for BatchedSerialGemm - using bsgResultsPerThread = - typename std::conditional::type; - - // Select optimal mode param for SerialGemm. 
- using bsgModeType = typename std::conditional< - is_vector, - typename std::conditional::type, - typename std::conditional< - on_gpu, Algo::Gemm::Unblocked, - typename std::conditional::type>::type>:: - type; - - if (handle->enableDebug) { - std::cout << "bsgResultsPerThread: " - << typeid(bsgResultsPerThread).name() << std::endl - << "bsgModeType: " << typeid(bsgModeType).name() << std::endl; - } - - // if (on_gpu && c_m >= 20 && - // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { - // // TODO: invoke TeamShmem - // } else - if (on_gpu && ((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { - handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); - constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); - constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); - constexpr size_t alpha_in_fma_thresh = - Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); - - if (c_m % 32 == 0) { // No bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = - Impl::BatchedDblBufGemm< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::No, AlphaTag::Yes, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } else { // apply alpha in mul - ret = - Impl::BatchedDblBufGemm< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::No, AlphaTag::No, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } - } else { // bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = - Impl::BatchedDblBufGemm< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::Yes, AlphaTag::Yes, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } else { // apply alpha in mul - ret = - Impl::BatchedDblBufGemm< - ArgTransA, 
ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType, - BoundsCheck::Yes, AlphaTag::No, tile_m, tile_n, tile_k>( - handle, alpha, A, B, beta, C) - .invoke(); - } - } - } else { - ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) - .invoke(); - } - break; - - // case BaseHeuristicAlgos::TALL: - // - // case BaseHeuristicAlgos::WIDE: - ////////////// TPL ALGOS ////////////// -#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 - case BaseTplAlgos::ARMPL: - ret = Impl::BatchedArmplGemm(handle, alpha, A, B, - beta, C) - .invoke(); - break; -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL - // case BaseTplAlgos::MKL: - // - // case GemmTplAlgos::CUBLAS: - // - // case GemmTplAlgos::MAGMA: - - ////////////// KokkosBatched ALGOS ////////////// - case BaseKokkosBatchedAlgos::KK_SERIAL: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); - break; - - // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: - - case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: - ret = - Impl::BatchedSerialGemm( - alpha, A, B, beta, C) - .invoke(); - break; - - // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: - // case GemmKokkosBatchedAlgos::KK_TEAM: - // case GemmKokkosBatchedAlgos::KK_TEAMVECTOR: - // case GemmKokkosBatchedAlgos::KK_TEAMSIMD: - - case GemmKokkosBatchedAlgos::KK_DBLBUF: - // Note: The tile sizes of 1x1x1 here will not perform well but must be - // selected in order to function on all devices since the serial execution - // space has a max team size of 1. KokkosKernels API users will need to - // follow an approach similar to KK_SQUARE above for best performance. - - // TODO: Add auto-selection of tile size based on inputs and device type - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) - .invoke(); - break; - - default: - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << "." 
<< std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - break; - } - return ret; +inline int BatchedGemm(BatchedGemmHandleType *const handle, + const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, + const CViewType &C) { + // If either this is being processed by a *.cpp.in file or KK ETI_ONLY + // is defined, use the ETI specialization. Defer till link time + // for which specialization will be used from + // KokkosBatched_HostLevel_Gemm_Impl.hpp. +#if defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + return Impl::BatchedGemmWrapper::run(handle, + alpha, A, B, + beta, C); +#else + // Use the non-ETI specialization. + return Impl::BatchedGemmWrapper::run(handle, + alpha, A, B, + beta, C); +#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY } -/********************* END non-functor-level routines *********************/ } // namespace KokkosBatched +/********************* END non-functor-level routines *********************/ +#include "KokkosBatched_HostLevel_Gemm_Impl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_Gemm_TeamVector_Impl.hpp" -#include "KokkosBatched_Gemm_DblBuf_Impl.hpp" -#include "KokkosBatched_Gemm_Armpl_Impl.hpp" #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index ac38da8270..4295d880fa 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -36,7 +36,6 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, using transA = typename ParamTagType::transA; using transB = typename ParamTagType::transB; using batchLayout = typename ParamTagType::batchLayout; - using view_layout = typename ViewType::array_layout; using ats = Kokkos::ArithTraits; int ret = 0; @@ -127,11 +126,6 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle*
batchedGemmHandle, batchedGemmHandle, alpha, a_actual, b_actual, beta, c_actual); // Compute c_actual } catch (const std::runtime_error& error) { - bool is_invalid_layout = - (std::is_same::value && - std::is_same::value) || - (std::is_same::value && - std::is_same::value); std::string error_msg = error.what(); if (algo_type == BaseHeuristicAlgos::SQUARE && matCdim1 != matCdim2) { ; @@ -140,17 +134,14 @@ void impl_test_batched_gemm_with_handle(BatchedGemmHandle* batchedGemmHandle, auto ninter = batchedGemmHandle->get_tpl_params()[0]; // No runtime errors expected since layout is valid, double is a supported // type, and ninter != 0 - if (!is_invalid_layout && - std::is_same::value && + if (std::is_same::value && ninter != 0) { FAIL() << (error_msg + fmsg + fmsg_rhs); } #else ; // We expect a runtime error if the ARMPL TPL is not enabled #endif - } else if (!is_invalid_layout) { - // No runtime errors expected since we only support certain BatchLayouts - // for LayoutLeft and LayoutRight. 
+ } else { FAIL() << (error_msg + fmsg + fmsg_rhs); } return; @@ -290,11 +281,13 @@ void impl_test_batched_gemm(const int N, const int matAdim1, const int matAdim2, ViewType a_actual("a_actual", N, matAdim1, matAdim2); ViewType b_actual("b_actual", N, matBdim1, matBdim2); ViewType c_actual("c_actual", N, matCdim1, matCdim2); - using ta = typename ParamTagType::transA; - using tb = typename ParamTagType::transB; - using bl = typename ParamTagType::batchLayout; - BatchedGemm(&batchedGemmHandle, 0.34, a_actual, - b_actual, 0.43, c_actual); + using ta = typename ParamTagType::transA; + using tb = typename ParamTagType::transB; + using bl = typename ParamTagType::batchLayout; + ScalarType alpha = 0.34; + ScalarType beta = 0.43; + BatchedGemm(&batchedGemmHandle, alpha, a_actual, + b_actual, beta, c_actual); std::string fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); FAIL() << fmsg; } catch (const std::runtime_error& error) { @@ -382,31 +375,43 @@ template int test_batched_gemm() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - typedef Kokkos::View llVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); + if constexpr (std::is_same_v) { + using param_tag_type = ::Test::SharedParamTag; + typedef Kokkos::View llVt; + test_batched_gemm_with_layout(0); + test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); + } #endif // KOKKOSKERNELS_INST_LAYOUTLEFT #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) - typedef Kokkos::View lrVt; - test_batched_gemm_with_layout(0); - test_batched_gemm_with_layout(1); - test_batched_gemm_with_layout(4); - test_batched_gemm_with_layout(8); - test_batched_gemm_with_layout(16); + if constexpr (std::is_same_v) { + using param_tag_type = ::Test::SharedParamTag; + typedef Kokkos::View lrVt; + test_batched_gemm_with_layout(0); + 
test_batched_gemm_with_layout(1); + test_batched_gemm_with_layout(4); + test_batched_gemm_with_layout(8); + test_batched_gemm_with_layout(16); + } #endif // KOKKOSKERNELS_INST_LAYOUTRIGHT return 0; } diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp index 73ff26a8a7..01622258ab 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp @@ -20,9 +20,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { @@ -30,9 +28,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { @@ -40,9 +36,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { @@ -50,9 +44,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ @@ -62,9 +54,7 @@ TEST_F(TestCategory, BatchLayout::Right> param_tag_type; - test_batched_gemm, - 
Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { @@ -72,9 +62,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { @@ -82,9 +70,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { @@ -92,9 +78,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif @@ -106,9 +90,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { @@ -116,9 +98,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { @@ -126,9 +106,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { 
BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { @@ -136,9 +114,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ @@ -148,9 +124,7 @@ TEST_F(TestCategory, BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { @@ -158,9 +132,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { @@ -168,9 +140,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { @@ -178,9 +148,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, - Kokkos::complex, param_tag_type>(); - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp 
b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp index 3bc48c1aaf..92ea8bcc67 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp @@ -13,16 +13,19 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#if defined(KOKKOS_BHALF_T_IS_FLOAT) + +// We do not ETI half-types. Only test this if ETI ONLY is off +// and bhalf_t is not an alias to float. +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ + defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { @@ -30,9 +33,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { @@ -40,9 +41,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { @@ -50,9 +49,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } /********************* BatchLayout::Right *********************/ @@ -61,9 +58,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { @@ -71,9 +66,7 @@ TEST_F(TestCategory, 
batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { @@ -81,9 +74,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { @@ -91,23 +82,23 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT -#if defined(KOKKOS_HALF_T_IS_FLOAT) +// We do not ETI half-types. Only test this if ETI ONLY is off +// and half_t is not an alias to float. +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) && \ + defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { @@ -115,9 +106,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { @@ -125,9 +114,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { @@ -135,9 +122,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } /********************* BatchLayout::Right 
*********************/ @@ -146,9 +131,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { @@ -156,9 +139,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { @@ -166,9 +147,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { @@ -176,9 +155,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT @@ -190,32 +167,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { @@ -223,68 +196,60 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { 
BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) /********************* BatchLayout::Left *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_left) { - typedef ::Test::SharedParamTag - param_tag_type; + using param_tag_type = + ::Test::SharedParamTag; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { @@ -292,31 +257,27 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, 
batched_scalar_batched_gemm_t_nt_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); - test_batched_gemm(); + test_batched_gemm(); } #endif diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..30be7867c2 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..e8603023fa --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..2f97a26f55 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..db9b9aacc2 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..1bc954e943 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..ab533445a8 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..46a84f1e09 --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLL_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..da388cbfac --- /dev/null +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in @@ -0,0 +1,23 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosBatched_Gemm_Decl.hpp" +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLR_ETI_INST_BLOCK@ +} // namespace Impl +} // namespace KokkosBatched \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in 
b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in 
b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_common/Test_Cuda.hpp b/test_common/Test_Cuda.hpp index 0bfe35718b..83c1aa7d80 100644 --- a/test_common/Test_Cuda.hpp +++ b/test_common/Test_Cuda.hpp @@ -33,5 +33,7 @@ class Cuda : public ::testing::Test { #define TestCategory Cuda #define TestExecSpace Kokkos::Cuda +#define TestDeviceType \ + Kokkos::Device #endif // TEST_CUDA_HPP diff --git a/test_common/Test_HIP.hpp b/test_common/Test_HIP.hpp index 7e61bfc9c3..8cfae41cc4 100644 --- a/test_common/Test_HIP.hpp +++ b/test_common/Test_HIP.hpp @@ -33,5 +33,7 @@ class hip : public ::testing::Test { #define TestCategory hip #define TestExecSpace Kokkos::Experimental::HIP +#define TestDeviceType \ + Kokkos::Device #endif // TEST_HIP_HPP diff --git a/test_common/Test_OpenMP.hpp b/test_common/Test_OpenMP.hpp index 8b4f90730e..43ca0e2627 100644 --- a/test_common/Test_OpenMP.hpp +++ b/test_common/Test_OpenMP.hpp @@ -33,5 +33,7 @@ class openmp : public ::testing::Test { #define TestCategory openmp #define TestExecSpace Kokkos::OpenMP +#define TestDeviceType \ + Kokkos::Device #endif // TEST_OPENMP_HPP diff --git a/test_common/Test_OpenMPTarget.hpp b/test_common/Test_OpenMPTarget.hpp index 2056d8be01..1cd901c332 100644 --- a/test_common/Test_OpenMPTarget.hpp +++ b/test_common/Test_OpenMPTarget.hpp @@ -33,5 +33,7 @@ class openmptarget : public ::testing::Test { #define TestCategory openmptarget #define TestExecSpace Kokkos::Experimental::OpenMPTarget +#define TestDeviceType \ + Kokkos::Device #endif // TEST_OPENMPTARGET_HPP diff --git a/test_common/Test_SYCL.hpp b/test_common/Test_SYCL.hpp index c7022f35d1..e85ce3865f 100644 --- 
a/test_common/Test_SYCL.hpp +++ b/test_common/Test_SYCL.hpp @@ -30,3 +30,5 @@ class sycl_test : public ::testing::Test { #define TestCategory sycl_test #define TestExecSpace Kokkos::Experimental::SYCL +#define TestDeviceType \ + Kokkos::Device diff --git a/test_common/Test_Serial.hpp b/test_common/Test_Serial.hpp index fe2917937b..ba31c8d417 100644 --- a/test_common/Test_Serial.hpp +++ b/test_common/Test_Serial.hpp @@ -33,5 +33,7 @@ class serial : public ::testing::Test { #define TestCategory serial #define TestExecSpace Kokkos::Serial +#define TestDeviceType \ + Kokkos::Device #endif // TEST_SERIAL_HPP diff --git a/test_common/Test_Threads.hpp b/test_common/Test_Threads.hpp index 1e2919b68f..4e39fb16bc 100644 --- a/test_common/Test_Threads.hpp +++ b/test_common/Test_Threads.hpp @@ -33,5 +33,7 @@ class threads : public ::testing::Test { #define TestCategory threads #define TestExecSpace Kokkos::Threads +#define TestDeviceType \ + Kokkos::Device #endif // TEST_THREADS_HPP From 60ddbb25af6d7206f13e6ad597c4750dcb29f876 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 May 2023 14:36:45 -0600 Subject: [PATCH 380/442] Fix constexpr branch --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 57 +++++++++++-------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index b532fd1d0f..cd289227ff 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -336,6 +336,7 @@ struct BatchedGemmWrapperInner { typename exec_space::memory_space>(); constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< typename exec_space::memory_space>(); + bool out_of_range = false; if (handle->enableDebug) { std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() @@ -384,55 +385,63 @@ struct BatchedGemmWrapperInner { << std::endl; } - if constexpr (on_gpu && - 
((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { - handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); - constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); - constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); - constexpr size_t alpha_in_fma_thresh = - Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); - - if (c_m % 32 == 0) { // No bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = Impl::BatchedDblBufGemm::value) + ? (c_m >= 16) + : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { + handle->teamSz = handle->vecLen = 8; + constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); + constexpr size_t alpha_in_fma_thresh = + Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); + + if (c_m % 32 == 0) { // No bounds checking + if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma + ret = + Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - } else { // apply alpha in mul - ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - } - } else { // bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = Impl::BatchedDblBufGemm= alpha_in_fma_thresh) { // apply alpha in fma + ret = + Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - } else { // apply alpha in mul - ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); + } } + } else { + out_of_range = true; } - } else { + } + if (!on_gpu || out_of_range) { ret = Impl::BatchedSerialGemm Date: Wed, 10 May 2023 16:47:32 -0600 Subject: [PATCH 381/442] minor cleanup --- batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp | 1 - batched/dense/src/KokkosBatched_Gemm_Decl.hpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp 
b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index cd289227ff..cbf35b6931 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -592,7 +592,6 @@ struct BatchedGemmWrapper Date: Wed, 10 May 2023 16:58:52 -0600 Subject: [PATCH 382/442] Start moving into HostLevel headers --- .../impl/KokkosBatched_Gemm_Serial_Impl.hpp | 111 --------------- ...kkosBatched_HostLevel_Gemm_Armpl_Impl.hpp} | 4 +- ...kosBatched_HostLevel_Gemm_DblBuf_Impl.hpp} | 4 +- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 8 +- ...kkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 127 ++++++++++++++++++ 5 files changed, 134 insertions(+), 120 deletions(-) rename batched/dense/impl/{KokkosBatched_Gemm_Armpl_Impl.hpp => KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp} (98%) rename batched/dense/impl/{KokkosBatched_Gemm_DblBuf_Impl.hpp => KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp} (99%) create mode 100644 batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp index 97d3d70e9d..6b3cec25da 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp @@ -20,7 +20,6 @@ #include "KokkosBatched_Gemm_Serial_Internal.hpp" namespace KokkosBatched { -/********************* BEGIN functor-level routines *********************/ /// /// Serial Impl /// =========== @@ -352,116 +351,6 @@ SerialGemm::invoke( A.stride_0(), B.data(), B.stride_1(), B.stride_0(), beta, C.data(), C.stride_0(), C.stride_1()); } -/********************* END functor-level routines *********************/ - -namespace Impl { -/********************* BEGIN non-functor-level routines *********************/ -template -class BatchedSerialGemm { - private: - AViewType A; - BViewType B; - CViewType C; - ScalarType alpha, beta; - size_t divisor, c_cols, batch_size; - 
ArgBatchSzDim batch_layout_tag; - ArgTransA transA_tag; - ArgTransB transB_tag; - - void run() { - using execution_space = typename CViewType::device_type::execution_space; - using policy_type = - Kokkos::RangePolicy; - Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size), - *this); - } - - public: - int invoke() { - if (std::is_same::value) { - // Set members for ResultsPerThread::Rank0 operator; these members allow - // each thread to calculate its C output index - if (std::is_same::value) { - batch_size = C.extent(0); - divisor = C.extent(1) * C.extent(2); - c_cols = C.extent(2); - } else { - batch_size = C.extent(2); - divisor = C.extent(0) * C.extent(1); - c_cols = C.extent(1); - } - - // Increase the number of threads by the divisor - batch_size *= divisor; - - run(); - } else if (std::is_same::value) { - if (std::is_same::value) - batch_size = C.extent(0); - else - batch_size = C.extent(2); - - run(); - } else { - std::cerr << "Error: ArgResultsPerThread not supported" << std::endl; - return -1; - } - return 0; - } - - BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, - ScalarType _beta, CViewType _C) - : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const ResultsPerThread::Rank0 &, const int &i) const { - // Here, the batch_idx is strided by c_rows * c_cols - auto batch_idx = i / divisor; - // For every batch, we need mod in [0, c_rows*c_cols-1] - auto mod = i % divisor; - // For every mod, we need a column index in [0, c_cols-1] - auto col_idx = mod % c_cols; - // For every mod, we need a row index in [0, c_rows-1] - auto row_idx = mod / c_cols; - - // Due to taking 1-rank subviews out, we must handle transpose here. - // Use overloads of subview_wrapper to handle transpose at compile time. 
- auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), - batch_layout_tag, transA_tag); - auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, - batch_layout_tag, transB_tag); - auto svC_ele = - subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); - - // Kokkos::subview(scalar, ALL) or Kokkos::subview(ALL, scalar) always - // returns a column vector. Since the subviews above handle the - // matrix transpositions, here we must perform the GEMM on: - // row_vec x col_vec, which is svA_row' x svB_col to compute the element - // of C. - KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, - svC_ele); - } - - KOKKOS_INLINE_FUNCTION - void operator()(const ResultsPerThread::Rank2 &, const int &i) const { - auto svA = - subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svB = - subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - auto svC = - subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); - - KokkosBatched::SerialGemm::invoke( - alpha, svA, svB, beta, svC); - } -}; -/********************* END non-functor-level routines *********************/ -} // namespace Impl - } // namespace KokkosBatched #endif diff --git a/batched/dense/impl/KokkosBatched_Gemm_Armpl_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp similarity index 98% rename from batched/dense/impl/KokkosBatched_Gemm_Armpl_Impl.hpp rename to batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp index 16355654c1..2974587fa8 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Armpl_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_ARMPL_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_ARMPL_IMPL_HPP__ +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ #if 
defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" diff --git a/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp similarity index 99% rename from batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp rename to batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp index 301b962fcb..c45935ce09 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_DBLBUF_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_DBLBUF_IMPL_HPP__ +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index cbf35b6931..f908011628 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -680,9 +680,7 @@ struct BatchedGemmWrapper +class BatchedSerialGemm { + private: + AViewType A; + BViewType B; + CViewType C; + ScalarType alpha, beta; + size_t divisor, c_cols, batch_size; + ArgBatchSzDim batch_layout_tag; + ArgTransA transA_tag; + ArgTransB transB_tag; + + void run() { + using execution_space = typename CViewType::device_type::execution_space; + using policy_type = + Kokkos::RangePolicy; + Kokkos::parallel_for("BatchedSerialGemm", policy_type(0, batch_size), + *this); + } + + public: + int invoke() { + if (std::is_same::value) { + // Set members for ResultsPerThread::Rank0 operator; these members allow + // each thread to calculate its C output index + if (std::is_same::value) { + batch_size = C.extent(0); + 
divisor = C.extent(1) * C.extent(2); + c_cols = C.extent(2); + } else { + batch_size = C.extent(2); + divisor = C.extent(0) * C.extent(1); + c_cols = C.extent(1); + } + + // Increase the number of threads by the divisor + batch_size *= divisor; + + run(); + } else if (std::is_same::value) { + if (std::is_same::value) + batch_size = C.extent(0); + else + batch_size = C.extent(2); + + run(); + } else { + std::cerr << "Error: ArgResultsPerThread not supported" << std::endl; + return -1; + } + return 0; + } + + BatchedSerialGemm(ScalarType _alpha, AViewType _A, BViewType _B, + ScalarType _beta, CViewType _C) + : A(_A), B(_B), C(_C), alpha(_alpha), beta(_beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ResultsPerThread::Rank0 &, const int &i) const { + // Here, the batch_idx is strided by c_rows * c_cols + auto batch_idx = i / divisor; + // For every batch, we need mod in [0, c_rows*c_cols-1] + auto mod = i % divisor; + // For every mod, we need a column index in [0, c_cols-1] + auto col_idx = mod % c_cols; + // For every mod, we need a row index in [0, c_rows-1] + auto row_idx = mod / c_cols; + + // Due to taking 1-rank subviews out, we must handle transpose here. + // Use overloads of subview_wrapper to handle transpose at compile time. + auto svA_row = subview_wrapper(A, batch_idx, row_idx, Kokkos::ALL(), + batch_layout_tag, transA_tag); + auto svB_col = subview_wrapper(B, batch_idx, Kokkos::ALL(), col_idx, + batch_layout_tag, transB_tag); + auto svC_ele = + subview_wrapper(C, batch_idx, row_idx, col_idx, batch_layout_tag); + + // Kokkos::subview(scalar, ALL) or Kokkos::subview(ALL, scalar) always + // returns a column vector. Since the subviews above handle the + // matrix transpositions, here we must perform the GEMM on: + // row_vec x col_vec, which is svA_row' x svB_col to compute the element + // of C. 
+ KokkosBatched::SerialGemm::invoke(alpha, svA_row, svB_col, beta, + svC_ele); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ResultsPerThread::Rank2 &, const int &i) const { + auto svA = + subview_wrapper(A, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svB = + subview_wrapper(B, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + auto svC = + subview_wrapper(C, i, Kokkos::ALL(), Kokkos::ALL(), batch_layout_tag); + + KokkosBatched::SerialGemm::invoke( + alpha, svA, svB, beta, svC); + } +}; +} // namespace Impl +} // namespace KokkosBatched +#endif \ No newline at end of file From 57bfb3f0b63b8ae23d6d965515492ec7f11d9ecf Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 May 2023 12:33:43 -0600 Subject: [PATCH 383/442] batched/dense/unit_test: Run tests if ETI_ONLY is disabled --- batched/dense/unit_test/Test_Batched_BatchedGemm.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 4295d880fa..ad702273c5 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -374,7 +374,9 @@ void test_batched_gemm_with_layout(int N) { template int test_batched_gemm() { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) if constexpr (std::is_same_v) { using param_tag_type = ::Test::SharedParamTag) { using param_tag_type = ::Test::SharedParamTag Date: Wed, 17 May 2023 12:34:22 -0600 Subject: [PATCH 384/442] perf_test/blas/blas3: Add compile-time checks for BatchLayout --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 61 ++++++++++++------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 
40bc80d0df..4ec2f1fd9a 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -466,80 +466,99 @@ void __do_gemm_parallel_batched_heuristic_template(options_t options, STATUS; if (a == 'N' && b == 'N') { - if (options.blas_args.batch_size_last_dim) - if (options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - + } } else if (a == 'N' && b == 'T') { - if (options.blas_args.batch_size_last_dim) - if (options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + } //} else if (a == 'N' && b == 'C') 
{ // __do_gemm_serial_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { - if (options.blas_args.batch_size_last_dim) - if (options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + } } else if (a == 'T' && b == 'T') { - if (options.blas_args.batch_size_last_dim) - if (options.use_simd) + if constexpr (std::is_same_v) { + if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); - else if (options.use_simd) + } + } else if (options.use_simd) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.Av.vec_3d, gemm_args.Bv.vec_3d, gemm_args.beta, gemm_args.Cv.vec_3d); - else + } else if constexpr (std::is_same_v) { KokkosBatched::BatchedGemm( &batchedGemmHandle, gemm_args.alpha, gemm_args.A, gemm_args.B, gemm_args.beta, gemm_args.C); + } //} else if (a == 'T' && b == 'C') { // __do_gemm_serial_batched_template(options, gemm_args); //} else if (a == 'C' && b == 'N') { From dca6ee561a4d46198286aa9c41121553ffcad8f4 Mon Sep 17 00:00:00 2001 From: Evan 
Harvey Date: Wed, 17 May 2023 13:17:09 -0600 Subject: [PATCH 385/442] batched/dense/src: Add KokkosBatched_HostLevel_Gemm.hpp --- ...kkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 2 +- batched/dense/src/KokkosBatched_Gemm_Decl.hpp | 94 +--------------- .../src/KokkosBatched_HostLevel_Gemm.hpp | 102 ++++++++++++++++++ .../unit_test/Test_Batched_BatchedGemm.hpp | 2 +- ...atched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in | 2 +- ...atched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in | 2 +- ...Batched_Gemm_nt_t_bll_eti_spec_inst.cpp.in | 2 +- ...Batched_Gemm_nt_t_blr_eti_spec_inst.cpp.in | 2 +- ...Batched_Gemm_t_nt_bll_eti_spec_inst.cpp.in | 2 +- ...Batched_Gemm_t_nt_blr_eti_spec_inst.cpp.in | 2 +- ...sBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in | 2 +- ...sBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in | 2 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 4 +- 13 files changed, 114 insertions(+), 106 deletions(-) create mode 100644 batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index d8337714ce..09e587b28e 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -15,7 +15,7 @@ //@HEADER #ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ #define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ -#include "KokkosBatched_Gemm_Serial_Impl.hpp" +#include "KokkosBatched_Gemm_Decl.hpp" namespace KokkosBatched { namespace Impl { diff --git a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp index b212cfba1e..1febcaa771 100644 --- a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp @@ -18,9 +18,7 @@ #include "KokkosBatched_Vector.hpp" -/********************* BEGIN non-functor-level routines *********************/ namespace KokkosBatched { -/********************* BEGIN 
functor-level routines *********************/ /// /// Serial Gemm /// @@ -86,100 +84,10 @@ struct Gemm { return r_val; } }; -/********************* END functor-level routines *********************/ - -namespace Impl { -template -struct BatchedGemmWrapper; -} // namespace Impl - -// clang-format off -/// \brief Non-blocking solve of general matrix multiply on a batch of -/// uniform matrices. -/// -/// Note: If a TPL is selected, this interface follows the blocking -/// behavior (either blocking or non-blocking) of the TPL vendor's API. -/// -/// Note: To leverage SIMD instructions, 4-rank views must be selected via the -/// template parameters documented below. -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchLayout::Left Batch dimension is leftmost -/// BatchLayout::Right Batch dimension is rightmost -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// -/// \param handle [in] A handle which specifies how to invoke the batched -/// gemm. -/// See struct BatchedGemmHandle for details. 
-/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchLayout::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchLayout::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchLayout::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchLayout::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchLayout::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchLayout::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedGemm(handle, alpha, A, B, beta, C); -// clang-format on -template -inline int BatchedGemm(BatchedGemmHandleType *const handle, - const ScalarType alpha, const AViewType &A, - const BViewType &B, const ScalarType beta, - const CViewType &C) { - // If either this is being processed by a *.cpp.in file or KK ETI_ONLY - // is defined, use the ETI specialization. Defer till link time - // for which specialization will be used from - // KokkosBatched_HostLevel_Gemm_Impl.hpp. -#if defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - return Impl::BatchedGemmWrapper::run(handle, - alpha, A, B, - beta, C); -#else - // Use the non-ETI specialization. 
- return Impl::BatchedGemmWrapper::run(handle, - alpha, A, B, - beta, C); -#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -} } // namespace KokkosBatched -/********************* END non-functor-level routines *********************/ -#include "KokkosBatched_HostLevel_Gemm_Impl.hpp" #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_Gemm_TeamVector_Impl.hpp" -#endif +#endif // __KOKKOSBATCHED_GEMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp new file mode 100644 index 0000000000..915414b81e --- /dev/null +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -0,0 +1,102 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ + +#include "KokkosBatched_HostLevel_Gemm_Impl.hpp" + +namespace KokkosBatched { +// clang-format off +/// \brief Non-blocking solve of general matrix multiply on a batch of +/// uniform matrices. +/// +/// Note: If a TPL is selected, this interface follows the blocking +/// behavior (either blocking or non-blocking) of the TPL vendor's API. +/// +/// Note: To leverage SIMD instructions, 4-rank views must be selected via the +/// template parameters documented below. 
+/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose +/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// AViewType, BViewType, and CViewType: +/// BatchLayout::Left Batch dimension is leftmost +/// BatchLayout::Right Batch dimension is rightmost +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank +/// Kokkos::View or a 4-rank Kokkos::View for SIMD +/// operations. +/// +/// \param handle [in] A handle which specifies how to invoke the batched +/// gemm. +/// See struct BatchedGemmHandle for details. 
+/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchLayout::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchLayout::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchLayout::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchLayout::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchLayout::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchLayout::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// +/// Usage Example: +/// BatchedGemm(handle, alpha, A, B, beta, C); +// clang-format on +template +inline int BatchedGemm(BatchedGemmHandleType *const handle, + const ScalarType alpha, const AViewType &A, + const BViewType &B, const ScalarType beta, + const CViewType &C) { + // If either this is being processed by a *.cpp.in file or KK ETI_ONLY + // is defined, use the ETI specialization. Defer till link time + // for which specialization will be used from + // KokkosBatched_HostLevel_Gemm_Impl.hpp. +#if defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + return Impl::BatchedGemmWrapper::run(handle, + alpha, A, B, + beta, C); +#else + // Use the non-ETI specialization. 
+ return Impl::BatchedGemmWrapper::run(handle, + alpha, A, B, + beta, C); +#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +} +} // namespace KokkosBatched +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index ad702273c5..2be26c2c45 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -17,7 +17,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" #include "KokkosKernels_TestUtils.hpp" diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in index 30be7867c2..a8a05850e4 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_bll_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" namespace KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in index e8603023fa..01525f4031 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_nt_blr_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" namespace 
KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in index 2f97a26f55..c026119b97 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_bll_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" namespace KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in index db9b9aacc2..9e1eba730d 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_nt_t_blr_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" namespace KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in index 1bc954e943..86aa818b42 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_bll_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include "KokkosBatched_Gemm_Decl.hpp" +#include 
"KokkosBatched_HostLevel_Gemm.hpp" namespace KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in index ab533445a8..bd8d246708 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_nt_blr_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" namespace KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in index 46a84f1e09..450d7bb5f6 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_bll_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" namespace KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in index da388cbfac..95a2faf3d7 100644 --- a/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in +++ b/batched/eti/generated_specializations_cpp/Gemm/KokkosBatched_Gemm_t_t_blr_eti_spec_inst.cpp.in @@ -14,7 +14,7 @@ // //@HEADER #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true -#include 
"KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_HostLevel_Gemm.hpp" namespace KokkosBatched { namespace Impl { using KokkosBlas::Trans; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 4ec2f1fd9a..8fe23d2515 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -25,10 +25,8 @@ #include +#include "KokkosBatched_HostLevel_Gemm.hpp" #include "KokkosBatched_Gemm_Decl.hpp" -#include "KokkosBatched_Gemm_Serial_Impl.hpp" -//#include "KokkosBatched_Gemm_Team_Impl.hpp" -//#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" #include "gtest/gtest.h" // EXPECT_NEAR #include "KokkosKernels_TestUtils.hpp" From 62b863de5a7ecfbbc135303ab2c50dce17a86446 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 May 2023 13:29:10 -0600 Subject: [PATCH 386/442] batched/dense/impl: Remove forward decls --- ...okkosBatched_HostLevel_Gemm_Armpl_Impl.hpp | 45 ++++ ...kkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp | 69 +++++++ .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 194 +----------------- ...kkosBatched_HostLevel_Gemm_Serial_Impl.hpp | 57 +++++ 4 files changed, 175 insertions(+), 190 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp index 2974587fa8..971fb36081 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp @@ -22,6 +22,51 @@ namespace KokkosBatched { namespace Impl { /********************* BEGIN non-functor-level routines *********************/ + +// clang-format off +/// \brief Blocking general matrix multiply on a batch of uniform matrices. 
+/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam HandleType Specifies the handle type of the kernel handle +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as a 3-rank Kokkos::View +/// \tparam BViewType Input matrix, as a 3-rank Kokkos::View +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as a 3-rank +/// Kokkos::View +/// +/// See struct BatchedGemmHandle for details +/// \param handle [in] A handle which specifies how to invoke the batched +/// gemm. handle->get_tpl_params() returns &ninter. +/// ninter: The number of matrices to interleave. 
+/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// + +/// Usage Example: +/// BatchedArmplGemm +/// (handle, alpha, A, B, beta, C).invoke(); +// clang-format on template diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp index c45935ce09..50d662b281 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp @@ -57,6 +57,75 @@ using TagFromLayout = typename TagFromLayoutHelper::tag; // Option 2: Fix league_size and have single team solve full tile followed // by same team solving extra rows/cols (without multiplying by the // zero rows/cols) + +// clang-format off +/// \brief Non-blocking general matrix multiply on a batch of +/// uniform matrices with an algorithm based on: +/// B. P. D. J. Kunkel, Julian, “Performance, design, and autotuning of batched gemm for GPUs,” +/// in Lecture Notes in Computer Science, ser. ISC High Performance Computing ’16, vol. 9697, 06 2016. 
+/// +/// +/// C = alpha * op(A) * op(B) + beta * C +/// +/// \tparam ArgTransA Specifies what op does to A: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgTransB Specifies what op does to B: +/// Trans::NoTranspose for non-transpose +/// Trans::Transpose for transpose +/// Trans::ConjTranspose for conjugate transpose (unsupported) +/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in +/// AViewType, BViewType, and CViewType: +/// BatchSzDim::Left Batch dimension is leftmost +/// BatchSzDim::Right Batch dimension is rightmost +/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For +/// this serial interface, each rank specifies how +/// much work to assign a single thread. +/// ResultsPerThread::Rank0 Each thread computes a scalar of C +/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C +/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C +/// \tparam HandleType Specifies the handle type of the kernel handle +/// \tparam ScalarType Specifies the scalar type of alpha and beta +/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a +/// 4-rank Kokkos::View for SIMD operations. +/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank +/// Kokkos::View or a 4-rank Kokkos::View for SIMD +/// operations. +/// \tparam ArgBoundsCheck Specifies whether to perform global memory access +/// bounds checks within the functor. Bounds checks +/// are required when matrix sizes are not evenly divisible +/// by tile sizes. +/// BoundsCheck::Yes The functor will perform bound checks (recommended) +/// BoundsCheck::No The functor will NOT perform bound checks +/// \tparam ArgAlphaFmaTag Specifies whether to apply alpha during fmas. 
+/// AlphaFmaTag::Yes alpha will be applied during fma (C = C * alpha + AB). +/// AlphaFmaTag::No alpha will be applied during mul (A * B * alpha). +/// \tparam TILE_M Specifies the number of rows in each tile. +/// \tparam TILE_N Specifies the number of cols in each tile. +/// \tparam TILE_K Specifies the number of cols or rows in a tile of A or tile of B, respectively. +/// +/// See struct BatchedGemmHandle for details. +/// \param alpha [in] Input coefficient used for multiplication with A +/// \param A [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK +/// \param B [in] Input matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN +/// \param beta [in] Input coefficient used for multiplication with C +/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View +/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB +/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN +/// \return 0 upon success, non-zero otherwise +/// +/// Usage Example: +/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); +// clang-format on template #include +#include "KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp" +#include "KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp" +#include "KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp" + namespace KokkosBatched { namespace Impl { -/********************* BEGIN forward declarations *********************/ -// clang-format off -/// \brief Non-blocking general matrix multiply on a batch of -/// uniform matrices. 
-/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose -/// \tparam ArgMode Specifies algorithm mode to use for serial work: -/// Algo::Gemm::Unblocked for no register blocking -/// Algo::Gemm::Blocked for register blocking -/// Algo::Gemm::CompactMKL for mkl compact tpl interface -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchSzDim::Left Batch dimension is leftmost -/// BatchSzDim::Right Batch dimension is rightmost -/// \tparam ArgResultsPerThread Specifies how to divide work among threads. For -/// this serial interface, each rank specifies how -/// much work to assign a single thread. -/// ResultsPerThread::Rank0 Each thread computes a scalar of C -/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C -/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// -/// See struct BatchedGemmHandle for details. 
-/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedSerialGemm; - -// clang-format off -/// \brief Non-blocking general matrix multiply on a batch of -/// uniform matrices with an algorithm based on: -/// B. P. D. J. Kunkel, Julian, “Performance, design, and autotuning of batched gemm for GPUs,” -/// in Lecture Notes in Computer Science, ser. ISC High Performance Computing ’16, vol. 9697, 06 2016. -/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgBatchSzDim Specifies where the batch dimension is allocated in -/// AViewType, BViewType, and CViewType: -/// BatchSzDim::Left Batch dimension is leftmost -/// BatchSzDim::Right Batch dimension is rightmost -/// \tparam ArgResultsPerThread Specifies how to divide work among threads. 
For -/// this serial interface, each rank specifies how -/// much work to assign a single thread. -/// ResultsPerThread::Rank0 Each thread computes a scalar of C -/// ResultsPerThread::Rank1 Each thread computes a 1-rank chunk of C -/// ResultsPerThread::Rank2 Each thread computes a 2-rank chunk of C -/// \tparam HandleType Specifies the handle type of the kernel handle -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam BViewType Input matrix, as either a 3-rank Kokkos::View or a -/// 4-rank Kokkos::View for SIMD operations. -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as either a 3-rank -/// Kokkos::View or a 4-rank Kokkos::View for SIMD -/// operations. -/// \tparam ArgBoundsCheck Specifies whether to perform global memory access -/// bounds checks within the functor. Bounds checks -/// are required when matrix sizes are not evenly divisible -/// by tile sizes. -/// BoundsCheck::Yes The functor will perform bound checks (recommended) -/// BoundsCheck::No The functor will NOT perform bound checks -/// \tparam ArgAlphaFmaTag Specifies whether to apply alpha during fmas. -/// AlphaFmaTag::Yes alpha will be applied during fma (C = C * alpha + AB). -/// AlphaFmaTag::No alpha will be applied during mul (A * B * alpha). -/// \tparam TILE_M Specifies the number of rows in each tile. -/// \tparam TILE_N Specifies the number of cols in each tile. -/// \tparam TILE_K Specifies the number of cols or rows in a tile of A or tile of B, respectively. -/// -/// See struct BatchedGemmHandle for details. 
-/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// -/// Usage Example: -/// BatchedSerialGemm(alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedDblBufGemm; - //////////////////////////////// tile_m ////////////////////////////////// template constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() { @@ -196,56 +63,6 @@ constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { #endif // __CUDAACC_RDC__ } -// clang-format off -/// \brief Blocking general matrix multiply on a batch of uniform matrices. 
-/// -/// -/// C = alpha * op(A) * op(B) + beta * C -/// -/// \tparam ArgTransA Specifies what op does to A: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam ArgTransB Specifies what op does to B: -/// Trans::NoTranspose for non-transpose -/// Trans::Transpose for transpose -/// Trans::ConjTranspose for conjugate transpose (unsupported) -/// \tparam HandleType Specifies the handle type of the kernel handle -/// \tparam ScalarType Specifies the scalar type of alpha and beta -/// \tparam AViewType Input matrix, as a 3-rank Kokkos::View -/// \tparam BViewType Input matrix, as a 3-rank Kokkos::View -/// \tparam CViewType Input(RHS)/Output(LHS) matrix, as a 3-rank -/// Kokkos::View -/// -/// See struct BatchedGemmHandle for details -/// \param handle [in] A handle which specifies how to invoke the batched -/// gemm. handle->get_tpl_params() returns &ninter. -/// ninter: The number of matrices to interleave. 
-/// \param alpha [in] Input coefficient used for multiplication with A -/// \param A [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix A is MxKxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix A is BxMxK -/// \param B [in] Input matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix B is KxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix B is BxKxN -/// \param beta [in] Input coefficient used for multiplication with C -/// \param C [in/out] Input/Output matrix, as a 3-rank Kokkos::View -/// If ArgBatchSzDim == "BatchSzDim::Right", matrix C is MxNxB -/// If ArgBatchSzDim == "BatchSzDim::Left", matrix C is BxMxN -/// \return 0 upon success, non-zero otherwise -/// - -/// Usage Example: -/// BatchedArmplGemm -/// (handle, alpha, A, B, beta, C).invoke(); -// clang-format on -template -class BatchedArmplGemm; -/********************* END forward declarations *********************/ - template @@ -680,7 +497,4 @@ struct BatchedGemmWrapper(alpha, A, B, beta, C).invoke(); +// clang-format on template From f64d6361ad19aa77fe2322ccab41a95e6df1bbd0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 May 2023 08:54:31 -0600 Subject: [PATCH 387/442] batched/dense: Add HostLevel Gemm unification layer --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 19 ++++++----- .../src/KokkosBatched_HostLevel_Gemm.hpp | 32 ++++++++++++++----- 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 46d88c6f85..4f9c23b223 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -409,14 +409,17 @@ struct BatchedGemmWrapper>, \ - Kokkos::View>, \ - Kokkos::View>, \ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, 
MEM_SPACE) \ + template struct BatchedGemmWrapper< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ true>; #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 915414b81e..b29c553e0d 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -79,23 +79,39 @@ inline int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { + // Minimize the number of ImplBatchedGemmWrapper instantiations, by + // standardizing on particular View specializations for its template + // parameters. + using UnifiedAVT = Kokkos::View< + typename AViewType::value_type ***, typename AViewType::array_layout, + typename AViewType::device_type, Kokkos::MemoryTraits>; + using UnifiedBVT = Kokkos::View< + typename BViewType::value_type ***, typename BViewType::array_layout, + typename BViewType::device_type, Kokkos::MemoryTraits>; + using UnifiedCVT = Kokkos::View>; + // If either this is being processed by a *.cpp.in file or KK ETI_ONLY // is defined, use the ETI specialization. Defer till link time // for which specialization will be used from // KokkosBatched_HostLevel_Gemm_Impl.hpp. #if defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY return Impl::BatchedGemmWrapper::run(handle, - alpha, A, B, - beta, C); + BatchedGemmHandleType, ScalarType, UnifiedAVT, + UnifiedBVT, UnifiedCVT, true>::run(handle, + alpha, A, + B, beta, + C); #else // Use the non-ETI specialization. 
return Impl::BatchedGemmWrapper::run(handle, - alpha, A, B, - beta, C); + BatchedGemmHandleType, ScalarType, UnifiedAVT, + UnifiedBVT, UnifiedCVT, false>::run(handle, + alpha, A, + B, beta, + C); #endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY } } // namespace KokkosBatched From d55fb1054e3040a47b444d07fea8c127f0321508 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 May 2023 08:55:21 -0600 Subject: [PATCH 388/442] .github/workflows: Print out arch in osx CI --- .github/workflows/osx.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 769957b953..820af0e6fd 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -77,6 +77,7 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. + grep -i Kokkos_ARCH CMakeCache.txt | grep ON - name: build_and_install_kokkos working-directory: kokkos/build From 721f388f961fd98a78cfc3dbd3a26d7e7bbea2fc Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 May 2023 14:54:59 -0600 Subject: [PATCH 389/442] batched: Populate avail eti files --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 172 +++++++++++++----- .../src/KokkosBatched_HostLevel_Gemm.hpp | 20 +- ...tched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in | 25 +++ ...tched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in | 25 +++ ...atched_Gemm_nt_t_bll_eti_spec_avail.hpp.in | 25 +++ ...atched_Gemm_nt_t_blr_eti_spec_avail.hpp.in | 25 +++ ...atched_Gemm_t_nt_bll_eti_spec_avail.hpp.in | 25 +++ ...atched_Gemm_t_nt_blr_eti_spec_avail.hpp.in | 25 +++ ...Batched_Gemm_t_t_bll_eti_spec_avail.hpp.in | 25 +++ ...Batched_Gemm_t_t_blr_eti_spec_avail.hpp.in | 25 +++ 10 files changed, 329 insertions(+), 63 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 4f9c23b223..077f326ba0 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ 
b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -340,30 +340,126 @@ struct BatchedGemmWrapperInner { } }; -// Primary template -template -struct BatchedGemmWrapper { - static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C); +// Specialization struct which defines whether a specialization exists +template +struct batched_gemm_tpl_spec_avail { + enum : bool { value = false }; +}; + +template +struct batched_gemm_eti_spec_avail { + enum : bool { value = false }; }; -// ETI specialization -// KOKKOSKERNELS_IMPL_COMPILE_LIBRARY should only be set -// when the *.cpp.in files include this. -#if KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct batched_gemm_eti_spec_avail< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, 
Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + 
Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + template -struct BatchedGemmWrapper { + typename AViewType, typename BViewType, typename CViewType, + bool tpl_spec_avail = batched_gemm_tpl_spec_avail< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, + ScalarType, AViewType, BViewType, CViewType>::value, + bool eti_spec_avail = batched_gemm_eti_spec_avail< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, + ScalarType, AViewType, BViewType, CViewType>::value> +struct BatchedGemmWrapper { static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, const AViewType &A, const BViewType &B, const ScalarType beta, const CViewType &C) { +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION printf( "KokkosBatched::BatchedGemm<> ETI specialization for < %s, %s, %s, " @@ -372,28 +468,7 @@ struct BatchedGemmWrapper::run(handle, alpha, A, B, beta, C); - } -}; -#endif - -// If KOKKOSKERNELS_ETI_ONLY is defined, restrict invocations to ETI types only -// via above ETI specialization. 
-#if !defined(KOKKOSKERNELS_ETI_ONLY) -// Non-ETI specialization -template -struct BatchedGemmWrapper { - static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C) { -#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION +#else printf( "KokkosBatched::BatchedGemm<> non-ETI specialization for < %s, %s, " "%s, %s, %s, %s, %s, %s >\n", @@ -401,13 +476,20 @@ struct BatchedGemmWrapper::run(handle, alpha, A, B, beta, C); +#endif // !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support the provided template " + "parameters." + << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + return -1; } }; -#endif #define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ ARG_BATCH_LAYOUT, SCALAR, \ @@ -415,12 +497,12 @@ struct BatchedGemmWrapper, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true>; + Kokkos::MemoryTraits>, \ + false, true>; #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) #define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index b29c553e0d..25473e3c3a 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -93,26 +93,10 @@ inline int BatchedGemm(BatchedGemmHandleType *const handle, typename CViewType::device_type, Kokkos::MemoryTraits>; - // If either this is being processed by a *.cpp.in file or KK ETI_ONLY - // is defined, use the ETI specialization. Defer till link time - // for which specialization will be used from - // KokkosBatched_HostLevel_Gemm_Impl.hpp. 
-#if defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY return Impl::BatchedGemmWrapper::run(handle, - alpha, A, - B, beta, - C); -#else - // Use the non-ETI specialization. - return Impl::BatchedGemmWrapper::run(handle, - alpha, A, - B, beta, - C); -#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + UnifiedBVT, UnifiedCVT>::run(handle, alpha, A, + B, beta, C); } } // namespace KokkosBatched #endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in index e69de29bb2..590b16c025 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLL_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in index e69de29bb2..4839cb5c57 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLR_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in index e69de29bb2..b65daecc14 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLL_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in index e69de29bb2..fe9603c8d0 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLR_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in index e69de29bb2..6e10908e3f 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLL_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in index e69de29bb2..3f6764a69b 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLR_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in index e69de29bb2..a3c871492a 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLL_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in index e69de29bb2..e873867bc0 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLR_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file From f663066d6b6fadb30720116351dec519f0ed625f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 May 2023 15:15:01 -0600 Subject: [PATCH 390/442] batched: Remove empty decl ETI files --- .../KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in | 0 .../KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in | 0 .../KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in | 0 .../KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in | 0 .../KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in | 0 .../KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in | 0 .../KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in | 0 .../KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in | 0 8 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in delete mode 100644 batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in diff --git 
a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in deleted file mode 100644 index e69de29bb2..0000000000 
From 9ad25c9b982eace59312da2bfb6beaabc6d3cb09 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 May 2023 15:15:38 -0600 Subject: [PATCH 391/442] batched: note that tpl struct is unused --- batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 077f326ba0..046025c8df 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -341,6 +341,7 @@ struct BatchedGemmWrapperInner { }; // Specialization struct which defines whether a specialization exists +// This struct is currently never specialized. template @@ -348,6 +349,7 @@ struct batched_gemm_tpl_spec_avail { enum : bool { value = false }; }; +// Specialization struct which defines whether a specialization exists template From 9d4de5dbec0555ba01804ea6b8e782e23848502b Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 23 May 2023 15:27:48 -0600 Subject: [PATCH 392/442] add rocblas and rocsparse to --spot-check-tpls --- scripts/cm_test_all_sandia | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 5dcd70d885..8d7a2cf2eb 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -1052,6 +1052,9 @@ setup_env() { NEW_TPL_LIST="cublas,cusparse," export KOKKOS_CUDA_OPTIONS="${KOKKOS_CUDA_OPTIONS},enable_lambda" fi + if [[ "$compiler" == rocm* ]]; then + NEW_TPL_LIST="rocblas,rocsparse," + fi # host tpls - use mkl with intel, else use host blas if [[ "$compiler" == intel* ]]; then NEW_TPL_LIST="mkl," From 6116419961426a4a9b2cb8bfc68b813350bae80b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 May 2023 15:38:16 -0600 Subject: [PATCH 393/442] .github/workflows: Print out arch in osx CI --- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 820af0e6fd..df9c81afc8 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -77,7 +77,7 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. - grep -i Kokkos_ARCH CMakeCache.txt | grep ON + grep -i Kokkos_ARCH CMakeCache.txt | grep ON || true - name: build_and_install_kokkos working-directory: kokkos/build From c0349db3ac02f1830a70cf71ab046cfeb5ae1a33 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 May 2023 15:47:47 -0600 Subject: [PATCH 394/442] .github/workflows: Print out arch in osx CI --- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index df9c81afc8..3808c75d87 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -77,7 +77,7 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. 
- grep -i Kokkos_ARCH CMakeCache.txt | grep ON || true + grep -i Kokkos_ARCH CMakeCache.txt || true - name: build_and_install_kokkos working-directory: kokkos/build From 1c256b1a3e0a3192b4f78df2887f90076d5eb08d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 11:23:16 -0600 Subject: [PATCH 395/442] batched: fix eti avail and wrapper --- batched/CMakeLists.txt | 1 + .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 251 ++++++++++-------- .../unit_test/Test_Batched_BatchedGemm.hpp | 2 +- ...tched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in | 8 +- ...tched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in | 8 +- ...atched_Gemm_nt_t_bll_eti_spec_avail.hpp.in | 8 +- ...atched_Gemm_nt_t_blr_eti_spec_avail.hpp.in | 8 +- ...atched_Gemm_t_nt_bll_eti_spec_avail.hpp.in | 8 +- ...atched_Gemm_t_nt_blr_eti_spec_avail.hpp.in | 8 +- ...Batched_Gemm_t_t_bll_eti_spec_avail.hpp.in | 8 +- ...Batched_Gemm_t_t_blr_eti_spec_avail.hpp.in | 8 +- 11 files changed, 168 insertions(+), 150 deletions(-) diff --git a/batched/CMakeLists.txt b/batched/CMakeLists.txt index 8d4319e63d..053ed96012 100644 --- a/batched/CMakeLists.txt +++ b/batched/CMakeLists.txt @@ -1,5 +1,6 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched) +LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/batched/eti) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/src) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/batched/dense/impl) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 046025c8df..9b505e72f6 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -63,6 +63,131 @@ constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { #endif // __CUDAACC_RDC__ } +// Specialization struct which defines whether a specialization exists +// This struct is currently 
never specialized. +template +struct batched_gemm_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Specialization struct which defines whether a specialization exists +template +struct batched_gemm_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBatched + +// ETI specalization macros, consumed by generated *_eti_spec_avail.hpp files +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct batched_gemm_eti_spec_avail< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, 
MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +// Include the BLL ETI specalizations +#include +#include +#include +#include + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, 
Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +// Include the BLR ETI specalizations +#include +#include +#include +#include + +namespace KokkosBatched { +namespace Impl { template @@ -340,117 +465,9 @@ struct BatchedGemmWrapperInner { } }; -// Specialization struct which defines whether a specialization exists -// This struct is currently never specialized. template -struct batched_gemm_tpl_spec_avail { - enum : bool { value = false }; -}; - -// Specialization struct which defines whether a specialization exists -template -struct batched_gemm_eti_spec_avail { - enum : bool { value = false }; -}; - -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ - ARG_BATCH_LAYOUT, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct batched_gemm_eti_spec_avail< \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ - }; - -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ - EXEC_SPACE, MEM_SPACE) -#else -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ - EXEC_SPACE, MEM_SPACE) -#else -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - 
ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) -#endif - -///////////////// BatchLayout::Left Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -///////////////// BatchLayout::Right Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define 
KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -template ::value, @@ -460,9 +477,11 @@ template ETI specialization for < %s, %s, %s, " "%s, %s, %s, %s, %s >\n", @@ -478,21 +497,19 @@ struct BatchedGemmWrapper { typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), typeid(ScalarType).name(), typeid(AViewType).name(), typeid(BViewType).name(), typeid(CViewType).name()); +#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #endif // KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION return Impl::BatchedGemmWrapperInner< ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, AViewType, BViewType, CViewType>::run(handle, alpha, A, B, beta, C); + } +#else + ; #endif // !defined(KOKKOSKERNELS_ETI_ONLY) || // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support the provided template " - "parameters." 
- << std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - return -1; - } }; +// ETI instantiation macros, consumed by *.cpp.in files #define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ ARG_BATCH_LAYOUT, SCALAR, \ LAYOUT, EXEC_SPACE, MEM_SPACE) \ diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 2be26c2c45..3c9a3f3c07 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -378,7 +378,7 @@ int test_batched_gemm() { (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) if constexpr (std::is_same_v) { + typename BatchLayout::Right>) { using param_tag_type = ::Test::SharedParamTag; diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in index 590b16c025..070f64f648 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; @BATCHED_GEMM_NT_NT_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in index 4839cb5c57..ab333ad536 100644 --- 
a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; @BATCHED_GEMM_NT_NT_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in index b65daecc14..da1649ecd7 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; @BATCHED_GEMM_NT_T_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in index fe9603c8d0..01a52d1e6d 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ 
-#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; @BATCHED_GEMM_NT_T_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in index 6e10908e3f..2b794505a5 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; @BATCHED_GEMM_T_NT_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in index 3f6764a69b..ee72422e11 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; 
@BATCHED_GEMM_T_NT_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in index a3c871492a..fa2bcbde47 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; @BATCHED_GEMM_T_T_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in index e873867bc0..4dfb2b8c9a 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_HPP_ -namespace KokkosBlas { +#ifndef KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBatched { namespace Impl { -using KokkosBlas::Trans; +using KokkosBatched::Trans; @BATCHED_GEMM_T_T_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos From d55ba7bf391b98eaf4b2ea538a1e2d77e6f7bc0a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 11:39:44 -0600 Subject: [PATCH 396/442] batched/dense/unit_test: Add TEST SKIPPED prints --- 
batched/dense/unit_test/Test_Batched_BatchedGemm.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 3c9a3f3c07..8329b8863d 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -393,7 +393,11 @@ int test_batched_gemm() { param_tag_type>(8); test_batched_gemm_with_layout(16); + } else { + std::cerr << "TEST SKIPPED since BatchLayout is not Right." << std::endl; } +#else + std::cerr << "TEST SKIPPED since LayoutLeft is not ETI'd." << std::endl; #endif // KOKKOSKERNELS_INST_LAYOUTLEFT #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ @@ -415,7 +419,11 @@ int test_batched_gemm() { param_tag_type>(8); test_batched_gemm_with_layout(16); + } else { + std::cerr << "TEST SKIPPED since BatchLayout is not Left." << std::endl; } +#else + std::cerr << "TEST SKIPPED since LayoutRight is not ETI'd." << std::endl; #endif // KOKKOSKERNELS_INST_LAYOUTRIGHT return 0; } From 557002b53c17c7df5a83ca74ca74f69314b406ad Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 12:34:49 -0600 Subject: [PATCH 397/442] batched/CMakeLists.txt: ETI valid args only --- batched/CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/batched/CMakeLists.txt b/batched/CMakeLists.txt index 053ed96012..6f196b6867 100644 --- a/batched/CMakeLists.txt +++ b/batched/CMakeLists.txt @@ -25,48 +25,48 @@ KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_bll Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_bll Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_bll Gemm COMPONENTS batched 
HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_bll Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTLEFT DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTLEFT DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTLEFT DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTS DEVICES + TYPE_LISTS FLOATS LAYOUTLEFT DEVICES ) \ No newline at end of file From 48d647966e27d80a9f4c2385fe934d40eebab1af Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 12:47:33 -0600 Subject: [PATCH 398/442] cmake: Fix batched eti args --- batched/CMakeLists.txt | 16 ++++++++-------- cmake/kokkoskernels_eti_layouts.cmake | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/batched/CMakeLists.txt b/batched/CMakeLists.txt index 6f196b6867..3f13ac5084 100644 --- a/batched/CMakeLists.txt +++ b/batched/CMakeLists.txt @@ -25,48 +25,48 @@ KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_bll Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES + TYPE_LISTS FLOATS RIGHT_LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_bll Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES + TYPE_LISTS FLOATS 
RIGHT_LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_bll Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES + TYPE_LISTS FLOATS RIGHT_LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_bll Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTRIGHT DEVICES + TYPE_LISTS FLOATS RIGHT_LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTLEFT DEVICES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_t_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTLEFT DEVICES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_nt_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTLEFT DEVICES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES ) KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_t_t_blr Gemm COMPONENTS batched HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS LAYOUTLEFT DEVICES + TYPE_LISTS FLOATS LEFT_LAYOUTS DEVICES ) \ No newline at end of file diff --git a/cmake/kokkoskernels_eti_layouts.cmake b/cmake/kokkoskernels_eti_layouts.cmake index 38835c129d..647d835353 100644 --- a/cmake/kokkoskernels_eti_layouts.cmake +++ b/cmake/kokkoskernels_eti_layouts.cmake @@ -1,3 +1,7 @@ +SET(RIGHT_LAYOUTS + LAYOUTRIGHT) +SET(LEFT_LAYOUTS + LAYOUTLEFT) SET(LAYOUTS LAYOUTLEFT LAYOUTRIGHT) From 90c8a5ed1fe86496f8564701edad7c3acf55d5b6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 13:37:10 -0600 Subject: [PATCH 399/442] batched/eti: Use Trans from KokkosBlas --- .../KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in | 2 +- .../KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in | 2 +- .../KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in | 2 +- 
.../KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in | 2 +- .../KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in | 2 +- .../KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in | 2 +- .../KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in | 2 +- .../KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in index 070f64f648..6bdcb095f4 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_NT_NT_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in index ab333ad536..f0098ff1f0 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_NT_NT_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in index da1649ecd7..b7efe9f5d4 100644 --- 
a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_NT_T_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in index 01a52d1e6d..4ef39901f9 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_NT_T_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in index 2b794505a5..f40acc60b1 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_T_NT_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in index ee72422e11..a8e23a5169 100644 --- 
a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_T_NT_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in index fa2bcbde47..33e865fceb 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_T_T_BLL_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in index 4dfb2b8c9a..f81d3d6c53 100644 --- a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_avail.hpp.in @@ -18,7 +18,7 @@ #define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL_HPP_ namespace KokkosBatched { namespace Impl { -using KokkosBatched::Trans; +using KokkosBlas::Trans; @BATCHED_GEMM_T_T_BLR_ETI_AVAIL_BLOCK@ } //IMPL } //Kokkos From 5ece26d89989b3dc4aa72f4812bf9de33a101d4f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 13:58:20 -0600 Subject: [PATCH 400/442] batched/dense: cleanup and move ETI into spec file --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 713 
++++++------------ .../KokkosBatched_HostLevel_Gemm_Spec.hpp | 287 +++++++ .../src/KokkosBatched_HostLevel_Gemm.hpp | 15 +- ...> KokkosBatched_HostLevel_Gemm_Handle.hpp} | 0 .../dense/src/KokkosBatched_Kernel_Handle.hpp | 5 +- .../unit_test/Test_Batched_BatchedGemm.hpp | 1 + 6 files changed, 521 insertions(+), 500 deletions(-) create mode 100644 batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp rename batched/dense/src/{KokkosBatched_Gemm_Handle.hpp => KokkosBatched_HostLevel_Gemm_Handle.hpp} (100%) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 9b505e72f6..db837f298a 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -15,12 +15,12 @@ //@HEADER #ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ #define __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ -#include "Kokkos_Core.hpp" // KOKKOS_INLINE_FUNCTION -#include "KokkosBatched_Util.hpp" // Trans, BatchLayout -#include +#include +#include // Trans, BatchLayout #include #include +#include "KokkosBatched_HostLevel_Gemm_Handle.hpp" // BatchedGemmHandle #include "KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp" #include "KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp" #include "KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp" @@ -63,347 +63,215 @@ constexpr KOKKOS_INLINE_FUNCTION size_t kk_gemm_dbl_buf_alpha_in_fma_thresh() { #endif // __CUDAACC_RDC__ } -// Specialization struct which defines whether a specialization exists -// This struct is currently never specialized. 
-template -struct batched_gemm_tpl_spec_avail { - enum : bool { value = false }; -}; - -// Specialization struct which defines whether a specialization exists -template -struct batched_gemm_eti_spec_avail { - enum : bool { value = false }; -}; -} // namespace Impl -} // namespace KokkosBatched - -// ETI specalization macros, consumed by generated *_eti_spec_avail.hpp files -#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ - ARG_BATCH_LAYOUT, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template <> \ - struct batched_gemm_eti_spec_avail< \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ - }; - -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ - EXEC_SPACE, MEM_SPACE) -#else -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ - EXEC_SPACE, MEM_SPACE) -#else -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) -#endif - -///////////////// BatchLayout::Left Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - 
KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -// Include the BLL ETI specalizations -#include -#include -#include -#include - -///////////////// BatchLayout::Right Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ - Trans::Transpose, Trans::Transpose, 
BatchLayout::Right, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -// Include the BLR ETI specalizations -#include -#include -#include -#include - -namespace KokkosBatched { -namespace Impl { template -struct BatchedGemmWrapperInner { - static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C) { - int ret = 0; - size_t c_m, c_n; - using ViewValueType = typename CViewType::value_type; - // Check for valid input views - static_assert(Kokkos::is_view::value, - "AViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "BViewType must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "CViewType must be a Kokkos::View."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); - static_assert( - std::is_same::value || - std::is_same::value, - "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); - if constexpr (is_vector::value) { - // Check ranks of view with underlying SIMD value types - // For SIMD views, we can have either 3-rank or 4-ranks inputs. - switch (handle->get_kernel_algo_type()) { - case BaseKokkosBatchedAlgos::KK_SERIAL: - case BaseHeuristicAlgos::SQUARE: - case BaseTplAlgos::ARMPL: - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); - break; - default: - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) - << " with SIMD views." 
<< std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - break; - } - } else { - // Check ranks of views with underlying scalar value types - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); - } - - // Check for valid data access patterns - // Skip checking a_layout == b_layout == c_layout - // Skip checking for LayoutStride - using c_layout = typename CViewType::array_layout; - static_assert(!(std::is_same::value && - !std::is_same::value), - "LayoutLeft views require BatchLayout::Right"); - static_assert(!(std::is_same::value && - !std::is_same::value), - "LayoutRight views require BatchLayout::Left"); - - if constexpr (std::is_same::value) { - // c_b = C.extent(0); - c_m = C.extent(1); - c_n = C.extent(2); - } else { - // c_b = C.extent(2); - c_m = C.extent(0); - c_n = C.extent(1); - } - - // Begin checking conditions for optimal BatchedGemm invocation. 
- using view_scalar_type = typename CViewType::value_type; - using layout_type = typename CViewType::array_layout; - using exec_space = typename CViewType::execution_space; - constexpr bool is_vector = - KokkosBatched::is_vector::value; - constexpr bool on_gpu = - KokkosKernels::Impl::kk_is_gpu_exec_space(); - constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< - typename exec_space::memory_space>(); - constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< - typename exec_space::memory_space>(); - bool out_of_range = false; - - if (handle->enableDebug) { - std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() - << std::endl - << "execution_space:" << typeid(exec_space).name() << std::endl - << std::endl - << "is_vector:" << is_vector << std::endl - << "on_gpu:" << on_gpu << std::endl - << "on_x86_64:" << on_x86_64 << std::endl - << "on_a64fx:" << on_a64fx << std::endl; - } - +int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, + const ScalarType beta, const CViewType &C) { + int ret = 0; + size_t c_m, c_n; + using ViewValueType = typename CViewType::value_type; + // Check for valid input views + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "BViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "CViewType must be a Kokkos::View."); + static_assert( + std::is_same::value || + std::is_same::value, + "ArgTransA must be either Trans::Transpose or Trans::NoTranspose."); + static_assert( + std::is_same::value || + std::is_same::value, + "ArgTransB must be either Trans::Transpose or Trans::NoTranspose."); + if constexpr (is_vector::value) { + // Check ranks of view with underlying SIMD value types + // For SIMD views, we can have either 3-rank or 4-ranks inputs. 
switch (handle->get_kernel_algo_type()) { - ////////////// HEURISTIC ALGOS ////////////// + case BaseKokkosBatchedAlgos::KK_SERIAL: case BaseHeuristicAlgos::SQUARE: - if (c_m != c_n) { - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" - << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" - << std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } + case BaseTplAlgos::ARMPL: + static_assert(static_cast(AViewType::rank) == 3, + "AViewType must have rank 3."); + static_assert(static_cast(BViewType::rank) == 3, + "BViewType must have rank 3."); + static_assert(static_cast(CViewType::rank) == 3, + "CViewType must have rank 3."); + break; + default: + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) + << " with SIMD views." << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + break; + } + } else { + // Check ranks of views with underlying scalar value types + static_assert(static_cast(AViewType::rank) == 3, + "AViewType must have rank 3."); + static_assert(static_cast(BViewType::rank) == 3, + "BViewType must have rank 3."); + static_assert(static_cast(CViewType::rank) == 3, + "CViewType must have rank 3."); + } - // Select optimal resultsPerThread param for BatchedSerialGemm - using bsgResultsPerThread = - typename std::conditional::type; + // Check for valid data access patterns + // Skip checking a_layout == b_layout == c_layout + // Skip checking for LayoutStride + using c_layout = typename CViewType::array_layout; + static_assert(!(std::is_same::value && + !std::is_same::value), + "LayoutLeft views require BatchLayout::Right"); + static_assert(!(std::is_same::value && + !std::is_same::value), + "LayoutRight views require BatchLayout::Left"); + + if constexpr (std::is_same::value) { + // c_b = 
C.extent(0); + c_m = C.extent(1); + c_n = C.extent(2); + } else { + // c_b = C.extent(2); + c_m = C.extent(0); + c_n = C.extent(1); + } - // Select optimal mode param for SerialGemm. - using bsgModeType = typename std::conditional< - is_vector, - typename std::conditional::type, - typename std::conditional< - on_gpu, Algo::Gemm::Unblocked, - typename std::conditional::type>::type>:: - type; + // Begin checking conditions for optimal BatchedGemm invocation. + using view_scalar_type = typename CViewType::value_type; + using layout_type = typename CViewType::array_layout; + using exec_space = typename CViewType::execution_space; + constexpr bool is_vector = KokkosBatched::is_vector::value; + constexpr bool on_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space< + typename exec_space::memory_space>(); + constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space< + typename exec_space::memory_space>(); + bool out_of_range = false; + + if (handle->enableDebug) { + std::cout << "view_scalar_type:" << typeid(view_scalar_type).name() + << std::endl + << "execution_space:" << typeid(exec_space).name() << std::endl + << std::endl + << "is_vector:" << is_vector << std::endl + << "on_gpu:" << on_gpu << std::endl + << "on_x86_64:" << on_x86_64 << std::endl + << "on_a64fx:" << on_a64fx << std::endl; + } - if (handle->enableDebug) { - std::cout << "bsgResultsPerThread: " - << typeid(bsgResultsPerThread).name() << std::endl - << "bsgModeType: " << typeid(bsgModeType).name() - << std::endl; - } + switch (handle->get_kernel_algo_type()) { + ////////////// HEURISTIC ALGOS ////////////// + case BaseHeuristicAlgos::SQUARE: + if (c_m != c_n) { + std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) << " when c_m(" + << std::to_string(c_m) << ") != c_n(" << std::to_string(c_n) << ")" + << std::endl; + 
KokkosKernels::Impl::throw_runtime_exception(os.str()); + } - if constexpr (on_gpu) { - if (((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { - handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); - constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); - constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); - constexpr size_t alpha_in_fma_thresh = - Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); + // Select optimal resultsPerThread param for BatchedSerialGemm + using bsgResultsPerThread = + typename std::conditional::type; + + // Select optimal mode param for SerialGemm. + using bsgModeType = typename std::conditional< + is_vector, + typename std::conditional::type, + typename std::conditional< + on_gpu, Algo::Gemm::Unblocked, + typename std::conditional::type>::type>:: + type; + + if (handle->enableDebug) { + std::cout << "bsgResultsPerThread: " + << typeid(bsgResultsPerThread).name() << std::endl + << "bsgModeType: " << typeid(bsgModeType).name() << std::endl; + } - if (c_m % 32 == 0) { // No bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = - Impl::BatchedDblBufGemm::value) + ? 
(c_m >= 16) + : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { + handle->teamSz = handle->vecLen = 8; + constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); + constexpr size_t alpha_in_fma_thresh = + Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); + + if (c_m % 32 == 0) { // No bounds checking + if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma + ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - } else { // apply alpha in mul - ret = - Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - } - } else { // bounds checking - if (c_m >= alpha_in_fma_thresh) { // apply alpha in fma - ret = - Impl::BatchedDblBufGemm= alpha_in_fma_thresh) { // apply alpha in fma + ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - } else { // apply alpha in mul - ret = - Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - } } - } else { - out_of_range = true; } + } else { + out_of_range = true; } - if (!on_gpu || out_of_range) { - ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) - .invoke(); - } - break; + } + if (!on_gpu || out_of_range) { + ret = Impl::BatchedSerialGemm(alpha, A, B, beta, C) + .invoke(); + } + break; - // case BaseHeuristicAlgos::TALL: - // - // case BaseHeuristicAlgos::WIDE: - ////////////// TPL ALGOS ////////////// + // case BaseHeuristicAlgos::TALL: + // + // case BaseHeuristicAlgos::WIDE: + ////////////// TPL ALGOS ////////////// #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 - case BaseTplAlgos::ARMPL: - ret = Impl::BatchedArmplGemm( - handle, alpha, A, B, beta, C) - .invoke(); - break; + case BaseTplAlgos::ARMPL: + ret = Impl::BatchedArmplGemm(handle, alpha, A, B, + beta, C) + .invoke(); + break; #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL // case BaseTplAlgos::MKL: // @@ -411,194 +279,57 @@ struct BatchedGemmWrapperInner { // // case GemmTplAlgos::MAGMA: - 
////////////// KokkosBatched ALGOS ////////////// - case BaseKokkosBatchedAlgos::KK_SERIAL: - ret = - Impl::BatchedSerialGemm(alpha, A, B, beta, C) - .invoke(); - break; - - // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: - - case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: - ret = - Impl::BatchedSerialGemm(alpha, A, B, beta, C) + ////////////// KokkosBatched ALGOS ////////////// + case BaseKokkosBatchedAlgos::KK_SERIAL: + ret = + Impl::BatchedSerialGemm( + alpha, A, B, beta, C) + .invoke(); + break; + + // case GemmKokkosBatchedAlgos::KK_SERIALSIMD: + + case GemmKokkosBatchedAlgos::KK_SERIAL_RANK0: + ret = + Impl::BatchedSerialGemm( + alpha, A, B, beta, C) + .invoke(); + break; + + // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: + // case GemmKokkosBatchedAlgos::KK_TEAM: + // case GemmKokkosBatchedAlgos::KK_TEAMVECTOR: + // case GemmKokkosBatchedAlgos::KK_TEAMSIMD: + + case GemmKokkosBatchedAlgos::KK_DBLBUF: + // Note: The tile sizes of 1x1x1 here will not perform well but must be + // selected in order to function on all devices since the serial + // execution space has a max team size of 1. KokkosKernels API users + // will need to follow an approach similar to KK_SQUARE above for best + // performance. + + // TODO: Add auto-selection of tile size based on inputs and device type + ret = Impl::BatchedDblBufGemm( + handle, alpha, A, B, beta, C) .invoke(); - break; - - // case GemmKokkosBatchedAlgos::KK_SERIAL_SHMEM: - // case GemmKokkosBatchedAlgos::KK_TEAM: - // case GemmKokkosBatchedAlgos::KK_TEAMVECTOR: - // case GemmKokkosBatchedAlgos::KK_TEAMSIMD: - - case GemmKokkosBatchedAlgos::KK_DBLBUF: - // Note: The tile sizes of 1x1x1 here will not perform well but must be - // selected in order to function on all devices since the serial - // execution space has a max team size of 1. KokkosKernels API users - // will need to follow an approach similar to KK_SQUARE above for best - // performance. 
- - // TODO: Add auto-selection of tile size based on inputs and device type - ret = Impl::BatchedDblBufGemm( - handle, alpha, A, B, beta, C) - .invoke(); - break; - - default: - std::ostringstream os; - os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " - << std::to_string(handle->get_kernel_algo_type()) << "." - << std::endl; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - break; - } - return ret; - } -}; - -template ::value, - bool eti_spec_avail = batched_gemm_eti_spec_avail< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, - ScalarType, AViewType, BViewType, CViewType>::value> -struct BatchedGemmWrapper { - static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, - const AViewType &A, const BViewType &B, const ScalarType beta, - const CViewType &C) -#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - { -#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION -#if KOKKOSKERNELS_IMPL_COMPILE_LIBRARY - printf( - "KokkosBatched::BatchedGemm<> ETI specialization for < %s, %s, %s, " - "%s, %s, %s, %s, %s >\n", - typeid(ArgTransA).name(), typeid(ArgTransB).name(), - typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), - typeid(ScalarType).name(), typeid(AViewType).name(), - typeid(BViewType).name(), typeid(CViewType).name()); -#else - printf( - "KokkosBatched::BatchedGemm<> non-ETI specialization for < %s, %s, " - "%s, %s, %s, %s, %s, %s >\n", - typeid(ArgTransA).name(), typeid(ArgTransB).name(), - typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), - typeid(ScalarType).name(), typeid(AViewType).name(), - typeid(BViewType).name(), typeid(CViewType).name()); -#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -#endif // KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION - return Impl::BatchedGemmWrapperInner< - ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, - AViewType, BViewType, CViewType>::run(handle, alpha, A, B, beta, C); + break; + + default: 
+ std::ostringstream os; + os << "KokkosBatched::BatchedGemm does not support kernelAlgoType = " + << std::to_string(handle->get_kernel_algo_type()) << "." << std::endl; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + break; } -#else - ; -#endif // !defined(KOKKOSKERNELS_ETI_ONLY) || - // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -}; - -// ETI instantiation macros, consumed by *.cpp.in files -#define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ - ARG_BATCH_LAYOUT, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) \ - template struct BatchedGemmWrapper< \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - false, true>; - -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ - EXEC_SPACE, MEM_SPACE) -#else -#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ - EXEC_SPACE, MEM_SPACE) -#else -#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) -#endif - -///////////////// BatchLayout::Left Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - 
KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - -///////////////// BatchLayout::Right Permutations ///////////////// -#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ - LAYOUT, EXEC_SPACE, MEM_SPACE) - -#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ - MEM_SPACE) \ - KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ - Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ - EXEC_SPACE, MEM_SPACE) - + return ret; +} } // namespace Impl } // 
namespace KokkosBatched #endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp new file mode 100644 index 0000000000..23d561f53b --- /dev/null +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp @@ -0,0 +1,287 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ + +#include // BatchedGemmHandle + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosBatched_HostLevel_Gemm_Impl.hpp" +#endif + +namespace KokkosBatched { +namespace Impl { +// Specialization struct which defines whether a specialization exists +// This struct is currently never specialized. 
+template +struct batched_gemm_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Specialization struct which defines whether a specialization exists +template +struct batched_gemm_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBatched + +// ETI specalization macros, consumed by generated *_eti_spec_avail.hpp files +#define KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct batched_gemm_eti_spec_avail< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + 
KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +// Include the BLL ETI specalizations +#include +#include +#include +#include + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_AVAIL_INNER( \ + Trans::Transpose, Trans::Transpose, 
BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +// Include the BLR ETI specalizations +#include +#include +#include +#include + +namespace KokkosBatched { +namespace Impl { +template ::value, + bool eti_spec_avail = batched_gemm_eti_spec_avail< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, + ScalarType, AViewType, BViewType, CViewType>::value> +struct BatchedGemmSpec { + static int run(BatchedGemmHandleType *const handle, const ScalarType alpha, + const AViewType &A, const BViewType &B, const ScalarType beta, + const CViewType &C) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { +#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION +#if KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + printf( + "KokkosBatched::BatchedGemm<> ETI specialization for < %s, %s, %s, " + "%s, %s, %s, %s, %s >\n", + typeid(ArgTransA).name(), typeid(ArgTransB).name(), + typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), + typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(BViewType).name(), typeid(CViewType).name()); +#else + printf( + "KokkosBatched::BatchedGemm<> non-ETI specialization for < %s, %s, " + "%s, %s, %s, %s, %s, %s >\n", + typeid(ArgTransA).name(), typeid(ArgTransB).name(), + typeid(ArgBatchSzDim).name(), typeid(BatchedGemmHandleType).name(), + typeid(ScalarType).name(), typeid(AViewType).name(), + typeid(BViewType).name(), typeid(CViewType).name()); +#endif // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#endif // KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION + return KokkosBatched::Impl::BatchedGemmImpl< + ArgTransA, ArgTransB, ArgBatchSzDim, BatchedGemmHandleType, ScalarType, + AViewType, BViewType, CViewType>(handle, alpha, A, B, beta, C); + } +#else + ; +#endif // !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; +} // namespace Impl +} // namespace KokkosBatched + +// ETI instantiation macros, consumed by *.cpp.in files +#define 
KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, \ + ARG_BATCH_LAYOUT, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct BatchedGemmSpec< \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, BatchedGemmHandle, SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define 
KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER( \ + Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 25473e3c3a..4f62d0b0d4 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -16,7 +16,11 @@ #ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ #define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ -#include 
"KokkosBatched_HostLevel_Gemm_Impl.hpp" +// Include explicit specializations of BatchedGemm. +// If ETI_ONLY is disabled, the primary template will +// be inlined into each caller's invocation using non- +// ETI'd template arguments. +#include "KokkosBatched_HostLevel_Gemm_Spec.hpp" namespace KokkosBatched { // clang-format off @@ -93,10 +97,11 @@ inline int BatchedGemm(BatchedGemmHandleType *const handle, typename CViewType::device_type, Kokkos::MemoryTraits>; - return Impl::BatchedGemmWrapper::run(handle, alpha, A, - B, beta, C); + // Go through specialization layer in case ETI'd symbols are available. + return Impl::BatchedGemmSpec::run(handle, alpha, A, B, + beta, C); } } // namespace KokkosBatched #endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ diff --git a/batched/dense/src/KokkosBatched_Gemm_Handle.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp similarity index 100% rename from batched/dense/src/KokkosBatched_Gemm_Handle.hpp rename to batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp diff --git a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp index cc8d1ff480..051f78979d 100644 --- a/batched/dense/src/KokkosBatched_Kernel_Handle.hpp +++ b/batched/dense/src/KokkosBatched_Kernel_Handle.hpp @@ -14,13 +14,10 @@ // //@HEADER -// -// Created by Harvey, Evan on 7/13/21. 
-// - #ifndef KOKKOSKERNELS_KOKKOSBATCHED_KERNEL_HEADER_HPP #define KOKKOSKERNELS_KOKKOSBATCHED_KERNEL_HEADER_HPP +#include #include "KokkosKernels_Error.hpp" #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp index 8329b8863d..d57e671908 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm.hpp @@ -18,6 +18,7 @@ #include "Kokkos_Random.hpp" #include "KokkosBatched_HostLevel_Gemm.hpp" +#include "KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" From 3a1ea766bdced2cecec43b3a85496709282b2935 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 13:59:39 -0600 Subject: [PATCH 401/442] batched/dense: cleanup gemm handle --- .../dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp index 51dfd3b5e3..95e8f36bc2 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp @@ -14,12 +14,8 @@ // //@HEADER -// -// Created by Harvey, Evan on 7/13/21. 
-// - -#ifndef KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP -#define KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP +#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ +#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ #include "KokkosBatched_Kernel_Handle.hpp" @@ -166,4 +162,4 @@ class BatchedGemmHandle : public BatchedKernelHandle { } // namespace KokkosBatched -#endif // KOKKOSKERNELS_KOKKOSBATCHED_GEMM_HANDLE_HPP +#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ From 9292be86df7c1b286abbf8c61f52dbb13e98e4b1 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 24 May 2023 17:42:10 -0600 Subject: [PATCH 402/442] batched/dense/impl: Fix headers --- batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp index 23d561f53b..6ec792172b 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp @@ -16,10 +16,13 @@ #ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ #define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ +#include +#include #include // BatchedGemmHandle #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include "KokkosBatched_HostLevel_Gemm_Impl.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #endif namespace KokkosBatched { From c5302a1cadbe28fa8911fdc62cc6ac8bc2e0167c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 30 May 2023 16:52:56 -0600 Subject: [PATCH 403/442] docs: Add profiling for compile times --- docs/developer/index.rst | 4 +++- docs/developer/profiling.rst | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 docs/developer/profiling.rst diff --git a/docs/developer/index.rst b/docs/developer/index.rst index 7ee05f98ae..58f89084ac 100644 --- a/docs/developer/index.rst +++ b/docs/developer/index.rst @@ -7,4 +7,6 
@@ Developer Manual Source Code Documentation Building the Documentation Code Style Guide - Contributing \ No newline at end of file + Contributing + Profiling + \ No newline at end of file diff --git a/docs/developer/profiling.rst b/docs/developer/profiling.rst new file mode 100644 index 0000000000..d5f3e4ceb2 --- /dev/null +++ b/docs/developer/profiling.rst @@ -0,0 +1,24 @@ +Profiling +========= + +Compile Times +------------- +1. Select a clang compiler +2. Configure and include `-ftime-trace` in your CXX FLAGS (this works with clang+cuda). +3. Clone and build https://github.com/aras-p/ClangBuildAnalyzer. Put the binary directory in your `PATH`. +4. Compile Kokkos and KokkosKernels +5. Create a directory called `ftime-trace-artifacts` in your build directory +6. Copy the json files you care about in this directory, for example: + +.. code-block:: + + cp ./{sparse,blas}/unit_test/CMakeFiles/*.dir/backends/*.json ftime-trace-artifacts/ + +7. Run `ClangBuildAnalyzer: + +.. code-block:: + + ClangBuildAnalyzer --all ftime-trace-artifacts/ profile.txt + ClangBuildAnalyzer --analyze profile.txt > analyze.txt + +8. 
Open `analyze.txt` \ No newline at end of file From 723ab23aa445b3232494df4e459ce5f30ea28ff7 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 31 May 2023 10:57:28 -0600 Subject: [PATCH 404/442] blas/tpls: Fix gemm include guard typo --- blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index f130432978..69146baf4f 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOSBLAS3_GEMV_TPL_SPEC_AVAIL_HPP_ -#define KOKKOSBLAS3_GEMV_TPL_SPEC_AVAIL_HPP_ +#ifndef KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_HPP_ namespace KokkosBlas { namespace Impl { From 5b369abef3fcdca162ea4e8a0199ff0038515c49 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 31 May 2023 12:10:10 -0600 Subject: [PATCH 405/442] Update cmake option naming in docs/comments Kokkos_ENABLE_OpenMP* to Kokkos_ENALBE_OPENMP* Related to trilinos/Trilinos#11930 and kokkos/kokkos#6138 --- BUILD.md | 2 +- cmake/kokkoskernels_eti_devices.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/BUILD.md b/BUILD.md index 6c91042b78..5be269bd7c 100644 --- a/BUILD.md +++ b/BUILD.md @@ -192,7 +192,7 @@ endif() * Whether to pre instantiate kernels for the scalar type double. This option is KokkosKernels_INST_DOUBLE=ON by default. Disabling this may increase build times. * Default: ON * KokkosKernels_INST_EXECSPACE_OPENMP: BOOL - * Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OpenMP is enabled may increase build times. + * Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OPENMP is enabled may increase build times. * Default: ON if Kokkos is OpenMP-enabled, OFF otherwise. 
* KokkosKernels_INST_EXECSPACE_SERIAL: BOOL * Whether to build kernels for the execution space Kokkos::Serial. If explicit template instantiation (ETI) is enabled in Trilinos, disabling this when Kokkos_ENABLE_SERIAL is enabled may increase build times. diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index e6a72123a4..d223e00171 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -127,7 +127,7 @@ IF(KOKKOS_ENABLE_OPENMPTARGET) INST_EXECSPACE_OPENMPTARGET ${KOKKOSKERNELS_INST_EXECSPACE_OPENMPTARGET_DEFAULT} BOOL - "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::OpenMPTarget. Disabling this when Kokkos_ENABLE_OpenMPTarget is enabled may increase build times. Default: ON if Kokkos is OpenMPTarget-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::OpenMPTarget. Disabling this when Kokkos_ENABLE_OPENMPTARGET is enabled may increase build times. Default: ON if Kokkos is OpenMPTarget-enabled, OFF otherwise." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_OPENMPTARGETSPACE @@ -163,7 +163,7 @@ KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_OPENMP ${KOKKOSKERNELS_INST_EXECSPACE_OPENMP_DEFAULT} BOOL - "Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OpenMP is enabled may increase build times. Default: ON if Kokkos is OpenMP-enabled, OFF otherwise." + "Whether to pre instantiate kernels for the execution space Kokkos::OpenMP. Disabling this when Kokkos_ENABLE_OPENMP is enabled may increase build times. Default: ON if Kokkos is OpenMP-enabled, OFF otherwise." ) IF(KOKKOSKERNELS_INST_EXECSPACE_OPENMP AND KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE) LIST(APPEND DEVICE_LIST "") From b3328390e70cac18f38e2c0c787d4a5aa181e43b Mon Sep 17 00:00:00 2001 From: "Roscoe A. 
Bartlett" Date: Wed, 31 May 2023 07:12:53 -0600 Subject: [PATCH 406/442] KokkosKernels: Remove non-existent common/src/[impl,tpls] include dirs (trilinos/Trilinos#11545) Listing these non-existent directories: kokkos-kernels/common/src/impl kokkos-kernels/common/src/tpls as include directories for the KokkosKernels targets actually causes the new Trilinos test: TrilinosInstallTests_simpleBuildAgainstTrilinos_by_package_build_tree to fail in Trilinos PR testing for Trilinos PR trilinos/Trilinos#11545 (and local testing when building KokkosKernels as part of Trilinos). These lines in the CMakeLists.txt file were added in the Trilinos commit: 62672e4b4e5 "Snapshot of kokkos-kernels.git from commit 518efd270918e739d4e0106f5bc1aedd8c99d464" Author: Nathan Ellingwood Date: Thu Feb 23 21:19:24 2023 -0700 (3 months ago) A packages/kokkos-kernels/common/CMakeLists.txt According to the kokkos-kernels git history, these lines got added in the kokkos-kernels repo commit: dc77279f8 "Modular build: allowing to build "common" only" Author: Luc Berger-Vergiat Date: Wed Aug 24 16:49:41 2022 -0600 (9 months ago) A common/CMakeLists.txt There is no evidence that those directories ever existed in either the kokkos-kernels or Trilinos git repos. 
--- common/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 72972b5cd7..13e445226f 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,7 +1,5 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) -LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src/impl) -LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src/tpls) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) # Adding unit-tests From d7c9a07714eebdd3174dd9ac4ac468f841d02663 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 7 Jun 2023 14:25:19 -0600 Subject: [PATCH 407/442] docs/developer: Add Experimental namespace --- docs/developer/apidocs/sparse.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 83299f5764..15375086d6 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -82,7 +82,7 @@ par_ilut -------- .. doxygenfunction:: par_ilut_symbolic(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, LRowMapType& L_rowmap, URowMapType& U_rowmap) .. doxygenfunction:: par_ilut_numeric(KernelHandle* handle, ARowMapType& A_rowmap, AEntriesType& A_entries, AValuesType& A_values, LRowMapType& L_rowmap, LEntriesType& L_entries, LValuesType& L_values, URowMapType& U_rowmap, UEntriesType& U_entries, UValuesType& U_values) -.. doxygenclass:: KokkosSparse::PAR_ILUTHandle +.. doxygenclass:: KokkosSparse::Experimental::PAR_ILUTHandle :members: gmres From 127c28198114978e31d74015083732dd23f6c77e Mon Sep 17 00:00:00 2001 From: "Roscoe A. 
Bartlett" Date: Tue, 6 Jun 2023 19:54:08 -0600 Subject: [PATCH 408/442] Remove non-existant subdir kokkos-kernels/common/common (#11921, #11863) The new test `TrilinosInstallTests_simpleBuildAgainstTrilinos_by_package_build_tree` merged from PR #11863 fails because the subdirs ${CMAKE_CURRENT_BINARY_DIR}/common and ${CMAKE_CURRENT_SOURCE_DIR}/common because this CMakeLists.txt file already sits in the kokkos-kernels/common/ subdir. I don't know why this error did not happen with PR testing for PR #11863 but this is clearly the right thing to do. --- common/CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 13e445226f..88bf237274 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -1,7 +1,3 @@ # Adding source directory to the build LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) - -# Adding unit-tests -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/common) -KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/common) From a8884845a4bbd358e9a06ee1af9c64a1ff3d9c12 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 7 Jun 2023 16:52:03 -0600 Subject: [PATCH 409/442] CMakeLists.txt: Add alias to match what is exported from Trilinos Partially addresses issue #1749 --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 646c89c813..8df113a6a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,6 +321,8 @@ ELSE() #no linking commands required - tribits does this ELSE() ADD_LIBRARY(Kokkos::kokkoskernels ALIAS kokkoskernels) + # Address kokkos/kokkos-kernels#1749 + ADD_LIBRARY(KokkosKernels::kokkoskernels ALIAS kokkoskernels) TARGET_LINK_LIBRARIES(kokkoskernels PUBLIC Kokkos::kokkos) FOREACH(DIR ${KK_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(kokkoskernels PUBLIC $) From 
48d67ff62f77b3640b0464fd4e00c4abc834816d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 7 Jun 2023 16:56:48 -0600 Subject: [PATCH 410/442] CMakeLists.txt: Add all_libs alias Required for TriBITS-compliance Discussion at https://github.com/trilinos/Trilinos/issues/8866#issuecomment-796969671 --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8df113a6a8..5bc71af5bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -323,6 +323,8 @@ ELSE() ADD_LIBRARY(Kokkos::kokkoskernels ALIAS kokkoskernels) # Address kokkos/kokkos-kernels#1749 ADD_LIBRARY(KokkosKernels::kokkoskernels ALIAS kokkoskernels) + # all_libs target is required for TriBITS-compliance + ADD_LIBRARY(KokkosKernels::all_libs ALIAS kokkoskernels) TARGET_LINK_LIBRARIES(kokkoskernels PUBLIC Kokkos::kokkos) FOREACH(DIR ${KK_INCLUDE_DIRS}) TARGET_INCLUDE_DIRECTORIES(kokkoskernels PUBLIC $) From 91c0b606ae000f60b93d78247fbc38eac34b308d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 7 Jun 2023 17:32:43 -0600 Subject: [PATCH 411/442] Revert ".github/workflows: Print out arch in osx CI" This reverts commit c0349db3ac02f1830a70cf71ab046cfeb5ae1a33. --- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 3808c75d87..df9c81afc8 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -77,7 +77,7 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. - grep -i Kokkos_ARCH CMakeCache.txt || true + grep -i Kokkos_ARCH CMakeCache.txt | grep ON || true - name: build_and_install_kokkos working-directory: kokkos/build From 341a4779f9d01a433adbcd983d6bbddc21d9a031 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 7 Jun 2023 17:33:06 -0600 Subject: [PATCH 412/442] Revert ".github/workflows: Print out arch in osx CI" This reverts commit 6116419961426a4a9b2cb8bfc68b813350bae80b.
--- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index df9c81afc8..820af0e6fd 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -77,7 +77,7 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. - grep -i Kokkos_ARCH CMakeCache.txt | grep ON || true + grep -i Kokkos_ARCH CMakeCache.txt | grep ON - name: build_and_install_kokkos working-directory: kokkos/build From fea22d883a4ae42f71cac06a78bdfb4d2720aac5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 7 Jun 2023 17:33:28 -0600 Subject: [PATCH 413/442] Revert ".github/workflows: Print out arch in osx CI" This reverts commit d55fb1054e3040a47b444d07fea8c127f0321508. --- .github/workflows/osx.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 820af0e6fd..769957b953 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -77,7 +77,6 @@ jobs: -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ .. 
- grep -i Kokkos_ARCH CMakeCache.txt | grep ON - name: build_and_install_kokkos working-directory: kokkos/build From 87a384657f2e0a38a76d90510092d366b5833906 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 7 Jun 2023 17:42:04 -0600 Subject: [PATCH 414/442] Address PR feedback --- .../KokkosBatched_HostLevel_Gemm_Impl.hpp | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index db837f298a..7a93309e65 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -29,17 +29,17 @@ namespace KokkosBatched { namespace Impl { //////////////////////////////// tile_m ////////////////////////////////// template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_m() { +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_m() { return 32; } //////////////////////////////// tile_n ////////////////////////////////// template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_n() { +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_n() { return 32; } //////////////////////////////// tile_k ////////////////////////////////// template -constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() { +constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dbl_buf_tile_k() { return 8; } @@ -50,7 +50,7 @@ constexpr KOKKOS_INLINE_FUNCTION int kk_gemm_dlb_buf_tile_k() { #if defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_VEGA908) template <> constexpr KOKKOS_INLINE_FUNCTION int -kk_gemm_dlb_buf_tile_k() { +kk_gemm_dbl_buf_tile_k() { return 16; } #endif @@ -94,12 +94,9 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, case BaseKokkosBatchedAlgos::KK_SERIAL: case BaseHeuristicAlgos::SQUARE: case BaseTplAlgos::ARMPL: - static_assert(static_cast(AViewType::rank) == 3, - "AViewType must have rank 3."); - 
static_assert(static_cast(BViewType::rank) == 3, - "BViewType must have rank 3."); - static_assert(static_cast(CViewType::rank) == 3, - "CViewType must have rank 3."); + assert(A.rank_dynamic() == 3 && "AViewType must have rank 3."); + assert(B.rank_dynamic() == 3 && "BViewType must have rank 3."); + assert(C.rank_dynamic() == 3 && "CViewType must have rank 3."); break; default: std::ostringstream os; @@ -178,9 +175,8 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, // Select optimal resultsPerThread param for BatchedSerialGemm using bsgResultsPerThread = - typename std::conditional::type; + std::conditional_t; // Select optimal mode param for SerialGemm. using bsgModeType = typename std::conditional< @@ -204,9 +200,9 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, ? (c_m >= 16) : (c_m >= 24 && c_m <= 32) || c_m >= 40)) { handle->teamSz = handle->vecLen = 8; - constexpr int tile_m = Impl::kk_gemm_dlb_buf_tile_m(); - constexpr int tile_n = Impl::kk_gemm_dlb_buf_tile_n(); - constexpr int tile_k = Impl::kk_gemm_dlb_buf_tile_k(); + constexpr int tile_m = Impl::kk_gemm_dbl_buf_tile_m(); + constexpr int tile_n = Impl::kk_gemm_dbl_buf_tile_n(); + constexpr int tile_k = Impl::kk_gemm_dbl_buf_tile_k(); constexpr size_t alpha_in_fma_thresh = Impl::kk_gemm_dbl_buf_alpha_in_fma_thresh(); From 9d723f6feee01e4ec6242dc1f0aeaa130762d333 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 5 Jun 2023 11:24:37 -0600 Subject: [PATCH 415/442] batched/dense: Add gesv DynRankView runtime checks --- .../impl/KokkosBatched_Gemv_Team_Impl.hpp | 50 ++++++++++++++----- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 1407bf43b4..48627aaf30 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -21,6 +21,7 @@ #include 
"KokkosBatched_Util.hpp" #include "KokkosBatched_Gemv_Team_Internal.hpp" #include "KokkosBlas2_team_gemv.hpp" +#include namespace KokkosBatched { @@ -46,9 +47,16 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + if constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } + if (A.extent(0) == 1) { KokkosBlas::TeamGemv< MemberType, Trans::NoTranspose, @@ -79,9 +87,15 @@ struct TeamGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + /* if constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } */ Kokkos::abort( "KokkosBlas::TeamGemv for rank-3 matrix is NOT " "implemented"); @@ -99,9 +113,15 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - static_assert(AViewType::rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + if 
constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } if (A.extent(0) == 1) { KokkosBlas:: TeamGemv::invoke( @@ -129,9 +149,15 @@ struct TeamGemv { const xViewType & /*x*/, const ScalarType /*beta*/, const yViewType & /*y*/) { - static_assert(AViewType::rank == 3, - "Batched TeamGemv requires rank-3 A matrix (use " - "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + /* if constexpr (Kokkos::is_dyn_rank_view::value) { + assert(A.rank_dynamic() == 3 && + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } else { + static_assert(AViewType::rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + } */ Kokkos::abort( "KokkosBlas::TeamGemv for rank-3 matrix is NOT " "implemented"); From d88ad352337f903e24f6723dd9968ab984a5908c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 8 Jun 2023 12:48:06 -0600 Subject: [PATCH 416/442] sparse: Various doxygen fixes --- sparse/src/KokkosKernels_Handle.hpp | 8 +++++--- sparse/src/KokkosSparse_BsrMatrix.hpp | 7 ++++--- sparse/src/KokkosSparse_coo2crs.hpp | 7 ++++--- sparse/src/KokkosSparse_spmv.hpp | 12 +++++++----- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index dc76ee23d7..dae3f12462 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -422,9 +422,11 @@ class KokkosKernelsHandle { /// set_team_work_size, it will return the set value. Otherwise it will return /// the teamsize. /// \param team_size input, team size used by the kernel. 
- /// \param nnz_lno_t filler for overall_work_size - int get_team_work_size(const int team_size, const int /* concurrency */, - const nnz_lno_t /* overall_work_size */) { + /// \param concurrency filler for concurrency + /// \param overall_work_size filler for overall_work_size + int get_team_work_size(const int team_size, + [[maybe_unused]] const int concurrency, + [[maybe_unused]] const nnz_lno_t overall_work_size) { if (this->team_work_size != -1) { return this->team_work_size; } else { diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index ff9eace826..b077215635 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -631,16 +631,17 @@ class BsrMatrix { /// The matrix will store and use the row map, indices, and values /// directly (by view, not by deep copy). /// + /// \param label /// \param nrows [in] The number of rows. /// \param ncols [in] The number of columns. - /// \param size_type [in] Filler for annz + /// \param annz [in] Filler for annz. /// \param vals [in/out] The entries. /// \param rows [in/out] The row map (containing the offsets to the /// data in each row). /// \param cols [in/out] The column indices. /// \param blockDimIn [in] The block dimensions. 
- BsrMatrix(const std::string& /*label*/, const OrdinalType nrows, - const OrdinalType ncols, const size_type /*annz*/, + BsrMatrix([[maybe_unused]] const std::string& label, const OrdinalType nrows, + const OrdinalType ncols, [[maybe_unused]] const size_type annz, const values_type& vals, const row_map_type& rows, const index_type& cols, const OrdinalType blockDimIn) : graph(cols, rows), diff --git a/sparse/src/KokkosSparse_coo2crs.hpp b/sparse/src/KokkosSparse_coo2crs.hpp index 1fe122f30e..45e54ce474 100644 --- a/sparse/src/KokkosSparse_coo2crs.hpp +++ b/sparse/src/KokkosSparse_coo2crs.hpp @@ -20,7 +20,7 @@ // have not made it into Kokkos 4.0.00 pr 4.0.01 will // need to see if it happens in 4.1.00 to have a final // version check here. -#if KOKKOS_VERSION >= 40099 +#if KOKKOS_VERSION >= 40099 || defined(DOXY) #include "KokkosSparse_CooMatrix.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -90,6 +90,7 @@ auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, /// \tparam SizeType The KokkosSparse::CooMatrix::size_type /// \param cooMatrix The sparse matrix stored in coordinate ("Coo") format. /// \return A KokkosSparse::CrsMatrix. +// clang-format on template auto coo2crs(KokkosSparse::CooMatrix= 40099 -#endif // _KOKKOSSPARSE_COO2CRS_HPP +#endif // KOKKOS_VERSION >= 40099 || defined(DOXY) +#endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index c18c0bfeb4..60fb5331cf 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -53,7 +53,7 @@ struct RANK_TWO {}; /// \param x [in] A vector. /// \param beta [in] Scalar multiplier for the multivector y. /// \param y [in/out] vector. 
-/// \param RANK_ONE tag dispatch +/// \param tag RANK_ONE dispatch /// #ifdef DOXY // documentation version template Date: Thu, 8 Jun 2023 12:50:36 -0600 Subject: [PATCH 417/442] docs: Fix sphinx warnings --- docs/Doxyfile.in | 7 ++++--- docs/conf.py | 2 +- docs/developer/apidocs/blas1.rst | 2 +- docs/developer/apidocs/sparse.rst | 5 +---- docs/developer/profiling.rst | 2 +- docs/developer/testing.rst.parked | 16 ++++++++++++++++ ...er_doc.rst => write_developer_doc.rst.parked} | 0 ...te_user_doc.rst => write_user_doc.rst.parked} | 0 docs/index.rst | 2 +- 9 files changed, 25 insertions(+), 11 deletions(-) create mode 100644 docs/developer/testing.rst.parked rename docs/developer/{write_developer_doc.rst => write_developer_doc.rst.parked} (100%) rename docs/developer/{write_user_doc.rst => write_user_doc.rst.parked} (100%) diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index deb47d9d2b..b9bc8ba6e3 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -2199,7 +2199,7 @@ ENABLE_PREPROCESSING = YES # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -2207,7 +2207,7 @@ MACRO_EXPANSION = NO # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -EXPAND_ONLY_PREDEF = NO +EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES, the include files in the # INCLUDE_PATH will be searched if a #include is found. @@ -2239,7 +2239,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. 
-PREDEFINED = DOXY +PREDEFINED = DOXY \ + "KOKKOS_INLINE_FUNCTION=" # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/conf.py b/docs/conf.py index ce7385acad..f7027880c5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -79,4 +79,4 @@ def configureDoxyfile(input_dir, output_dir, doxyfile_in, doxyfile_out): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +# html_static_path = ['_static'] diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst index f800f6e5ce..1b1ea0c640 100644 --- a/docs/developer/apidocs/blas1.rst +++ b/docs/developer/apidocs/blas1.rst @@ -63,7 +63,7 @@ sum swap ---- .. doxygenfunction:: KokkosBlas::swap(execution_space const& space, XVector const& X, YVector const& Y) -.. doxygenfunction:: KokkosBlas::swap(XVector const& X, YVector const& Y) +.. doxygenfunction:: KokkosBlas::swap(const XVector&, const YVector&) update ------ diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 15375086d6..f73b507439 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -28,7 +28,7 @@ ccs2crs coo2crs ------- -.. doxygenfunction:: KokkosSparse::coo2crs(DimType, DimType, RowViewType, ColViewType, DataViewType) +.. doxygenfunction:: KokkosSparse::coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, DataViewType data) .. doxygenfunction:: KokkosSparse::coo2crs(KokkosSparse::CooMatrix &cooMatrix) crs2coo @@ -38,9 +38,6 @@ crs2coo spmv ---- - -.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char[], const AlphaType&, const AMatrix&, const XVector&, const BetaType&, const YVector&) -.. 
doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) .. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) .. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) .. doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) diff --git a/docs/developer/profiling.rst b/docs/developer/profiling.rst index d5f3e4ceb2..326281ab83 100644 --- a/docs/developer/profiling.rst +++ b/docs/developer/profiling.rst @@ -14,7 +14,7 @@ Compile Times cp ./{sparse,blas}/unit_test/CMakeFiles/*.dir/backends/*.json ftime-trace-artifacts/ -7. Run `ClangBuildAnalyzer: +7. Run `ClangBuildAnalyzer`: .. code-block:: diff --git a/docs/developer/testing.rst.parked b/docs/developer/testing.rst.parked new file mode 100644 index 0000000000..c6bb810dc3 --- /dev/null +++ b/docs/developer/testing.rst.parked @@ -0,0 +1,16 @@ +Testing +======= + +Test matrix +----------------------------------- + +Unless noted otherwise in `Types`, we test with `float`. + +Format of name column: COMPILERS_BACKENDS_LAYOUTS_TPLS. + +.. 
csv-table:: :rst:dir:`Test Matrix` + :header: "Name", "Architectures", "Compilers", "Backends", "Layouts", "Types" + + "NIGHTLY_GCC930_CUDA11_OPENMP_CUDA_LEFT_BLAS_LAPACK", "Power9, Volta70, Pascal60", "GNU 9.3.0, NVCC 11.0.1", "OpenMP, Cuda", "Left" + "PR_GCC1030_INTEL20_SERIAL_OPENMP_RIGHT_BLAS_LAPACK", "Broadwell", "GCC, INTEL", "Serial, OpenMP", "Right" + "PR_ARMPL2110_SERIAL_OPENMP_LEFT_BLAS_LAPACK", "A64FX", "ARMPL 21.1.0", "Serial, OpenMP", "LayoutLeft" \ No newline at end of file diff --git a/docs/developer/write_developer_doc.rst b/docs/developer/write_developer_doc.rst.parked similarity index 100% rename from docs/developer/write_developer_doc.rst rename to docs/developer/write_developer_doc.rst.parked diff --git a/docs/developer/write_user_doc.rst b/docs/developer/write_user_doc.rst.parked similarity index 100% rename from docs/developer/write_user_doc.rst rename to docs/developer/write_user_doc.rst.parked diff --git a/docs/index.rst b/docs/index.rst index db873e9a3b..cd8a174ff9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ Kokkos Kernels documentation: Under Construction -========================================== +================================================ .. 
toctree:: :maxdepth: 2 From c5b2305aa0016fd564a1d0fbace5c15931b5a3b2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 8 Jun 2023 12:50:57 -0600 Subject: [PATCH 418/442] docs: Enable sphinx -werror --- docs/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index 34f54edbbb..41be4736c3 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -41,7 +41,7 @@ if (Sphinx_FOUND) set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/docs/sphinx) add_custom_target(Sphinx ALL - COMMAND ${SPHINX_EXECUTABLE} -b html + COMMAND ${SPHINX_EXECUTABLE} -W --keep-going -b html # Tell Breathe where to find the Doxygen output -Dbreathe_projects.${PROJECT_NAME}=${DOXYGEN_OUTPUT_DIR}/xml ${SPHINX_SOURCE} ${SPHINX_BUILD} From dec2bcb8dfdc68b19f14ffd0faa9804dea019104 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 8 Jun 2023 13:03:10 -0600 Subject: [PATCH 419/442] Remove TestDeviceType --- .../Test_Batched_BatchedGemm_Complex.hpp | 32 +++++----- .../Test_Batched_BatchedGemm_Real.hpp | 64 +++++++++---------- test_common/Test_Cuda.hpp | 2 - test_common/Test_HIP.hpp | 2 - test_common/Test_OpenMP.hpp | 2 - test_common/Test_OpenMPTarget.hpp | 2 - test_common/Test_SYCL.hpp | 2 - test_common/Test_Serial.hpp | 2 - test_common/Test_Threads.hpp | 2 - 9 files changed, 48 insertions(+), 62 deletions(-) diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp index 01622258ab..a2b9edf1e6 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Complex.hpp @@ -20,7 +20,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { @@ -28,7 +28,7 @@ 
TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { @@ -36,7 +36,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { @@ -44,7 +44,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ @@ -54,7 +54,7 @@ TEST_F(TestCategory, BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { @@ -62,7 +62,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_scomplex_scomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { @@ -70,7 +70,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_scomplex_scomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { @@ -78,7 +78,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_scomplex_scomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif @@ -90,7 +90,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_dcomplex_dcomplex_left) { 
BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { @@ -98,7 +98,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { @@ -106,7 +106,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { @@ -114,7 +114,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } /********************* BatchLayout::Right *********************/ @@ -124,7 +124,7 @@ TEST_F(TestCategory, BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { @@ -132,7 +132,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { @@ -140,7 +140,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_dcomplex_dcomplex_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { @@ -148,7 +148,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_dcomplex_dcomplex_right) { 
BatchLayout::Right> param_tag_type; - test_batched_gemm, + test_batched_gemm, Kokkos::complex, param_tag_type>(); } #endif diff --git a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp index 92ea8bcc67..00561e0317 100644 --- a/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp +++ b/batched/dense/unit_test/Test_Batched_BatchedGemm_Real.hpp @@ -25,7 +25,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { @@ -33,7 +33,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { @@ -41,7 +41,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { @@ -49,7 +49,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } /********************* BatchLayout::Right *********************/ @@ -58,7 +58,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { @@ -66,7 +66,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { @@ -74,7 +74,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { 
@@ -82,7 +82,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_bhalf_bhalf_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } #endif // KOKKOS_BHALF_T_IS_FLOAT @@ -98,7 +98,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { @@ -106,7 +106,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { @@ -114,7 +114,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { @@ -122,7 +122,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); } /********************* BatchLayout::Right *********************/ @@ -131,7 +131,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { @@ -139,7 +139,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { @@ -147,7 +147,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { @@ -155,7 +155,7 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_half_half_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); } #endif // KOKKOS_HALF_T_IS_FLOAT @@ -167,28 +167,28 @@ TEST_F(TestCategory, 
batched_scalar_batched_gemm_nt_nt_float_float_left) { BatchLayout::Left> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { @@ -196,28 +196,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_float_float_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_float_float_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif @@ -228,28 +228,28 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_left) { ::Test::SharedParamTag; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } 
TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_left) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } /********************* BatchLayout::Right *********************/ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { @@ -257,27 +257,27 @@ TEST_F(TestCategory, batched_scalar_batched_gemm_nt_nt_double_double_right) { BatchLayout::Right> param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_nt_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_nt_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } TEST_F(TestCategory, batched_scalar_batched_gemm_t_t_double_double_right) { typedef ::Test::SharedParamTag param_tag_type; - test_batched_gemm(); + test_batched_gemm(); } #endif diff --git a/test_common/Test_Cuda.hpp b/test_common/Test_Cuda.hpp index 83c1aa7d80..0bfe35718b 100644 --- a/test_common/Test_Cuda.hpp +++ b/test_common/Test_Cuda.hpp @@ -33,7 +33,5 @@ class Cuda : public ::testing::Test { #define TestCategory Cuda #define TestExecSpace Kokkos::Cuda -#define TestDeviceType \ - Kokkos::Device #endif // TEST_CUDA_HPP diff --git a/test_common/Test_HIP.hpp b/test_common/Test_HIP.hpp index 8cfae41cc4..7e61bfc9c3 100644 --- a/test_common/Test_HIP.hpp +++ b/test_common/Test_HIP.hpp @@ -33,7 +33,5 @@ class hip : public ::testing::Test { #define TestCategory hip #define TestExecSpace Kokkos::Experimental::HIP -#define TestDeviceType \ - Kokkos::Device #endif // TEST_HIP_HPP diff --git a/test_common/Test_OpenMP.hpp b/test_common/Test_OpenMP.hpp index 43ca0e2627..8b4f90730e 100644 --- a/test_common/Test_OpenMP.hpp +++ b/test_common/Test_OpenMP.hpp @@ -33,7 +33,5 @@ class openmp : public ::testing::Test { #define TestCategory openmp #define 
TestExecSpace Kokkos::OpenMP -#define TestDeviceType \ - Kokkos::Device #endif // TEST_OPENMP_HPP diff --git a/test_common/Test_OpenMPTarget.hpp b/test_common/Test_OpenMPTarget.hpp index 1cd901c332..2056d8be01 100644 --- a/test_common/Test_OpenMPTarget.hpp +++ b/test_common/Test_OpenMPTarget.hpp @@ -33,7 +33,5 @@ class openmptarget : public ::testing::Test { #define TestCategory openmptarget #define TestExecSpace Kokkos::Experimental::OpenMPTarget -#define TestDeviceType \ - Kokkos::Device #endif // TEST_OPENMPTARGET_HPP diff --git a/test_common/Test_SYCL.hpp b/test_common/Test_SYCL.hpp index e85ce3865f..c7022f35d1 100644 --- a/test_common/Test_SYCL.hpp +++ b/test_common/Test_SYCL.hpp @@ -30,5 +30,3 @@ class sycl_test : public ::testing::Test { #define TestCategory sycl_test #define TestExecSpace Kokkos::Experimental::SYCL -#define TestDeviceType \ - Kokkos::Device diff --git a/test_common/Test_Serial.hpp b/test_common/Test_Serial.hpp index ba31c8d417..fe2917937b 100644 --- a/test_common/Test_Serial.hpp +++ b/test_common/Test_Serial.hpp @@ -33,7 +33,5 @@ class serial : public ::testing::Test { #define TestCategory serial #define TestExecSpace Kokkos::Serial -#define TestDeviceType \ - Kokkos::Device #endif // TEST_SERIAL_HPP diff --git a/test_common/Test_Threads.hpp b/test_common/Test_Threads.hpp index 4e39fb16bc..1e2919b68f 100644 --- a/test_common/Test_Threads.hpp +++ b/test_common/Test_Threads.hpp @@ -33,7 +33,5 @@ class threads : public ::testing::Test { #define TestCategory threads #define TestExecSpace Kokkos::Threads -#define TestDeviceType \ - Kokkos::Device #endif // TEST_THREADS_HPP From 24d259b0d24ce29768dc745bc9330e37f99bb94f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 8 Jun 2023 13:39:15 -0600 Subject: [PATCH 420/442] docs: Fix blas rst files --- docs/developer/apidocs/blas1.rst | 4 ++-- docs/developer/apidocs/blas3.rst | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/developer/apidocs/blas1.rst 
b/docs/developer/apidocs/blas1.rst index 1b1ea0c640..72c2612c7f 100644 --- a/docs/developer/apidocs/blas1.rst +++ b/docs/developer/apidocs/blas1.rst @@ -2,7 +2,7 @@ BLAS1 -- KokkosKernels blas1 interfaces ======================================= abs -___ +--- .. doxygenfunction:: KokkosBlas::abs(const execution_space& space, const RMV& R, const XMV& X) .. doxygenfunction:: KokkosBlas::abs(const RMV& R, const XMV& X) @@ -62,7 +62,7 @@ sum swap ---- -.. doxygenfunction:: KokkosBlas::swap(execution_space const& space, XVector const& X, YVector const& Y) +.. doxygenfunction:: KokkosBlas::swap(execution_space const&, XVector const&, YVector const&) .. doxygenfunction:: KokkosBlas::swap(const XVector&, const YVector&) update diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst index 8303c9f17e..cac0dd6bc2 100644 --- a/docs/developer/apidocs/blas3.rst +++ b/docs/developer/apidocs/blas3.rst @@ -5,7 +5,13 @@ gemm ---- .. doxygenfunction:: KokkosBlas::gemm(const execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) .. doxygenfunction:: KokkosBlas::gemm(const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) + +trmm +---- .. doxygenfunction:: KokkosBlas::trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) .. doxygenfunction:: KokkosBlas::trmm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) + +trsm +---- .. 
doxygenfunction:: KokkosBlas::trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) .. doxygenfunction:: KokkosBlas::trsm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) From 558dbe4a92d4ab113425e59017e05484718787e9 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 8 Jun 2023 13:50:12 -0600 Subject: [PATCH 421/442] docs: Update trmm. Add trtri. --- blas/src/KokkosBlas3_trmm.hpp | 4 +++- blas/src/KokkosBlas_trtri.hpp | 1 + docs/developer/apidocs/blas3.rst | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/blas/src/KokkosBlas3_trmm.hpp b/blas/src/KokkosBlas3_trmm.hpp index eb45da0f90..bdc86d4d9e 100644 --- a/blas/src/KokkosBlas3_trmm.hpp +++ b/blas/src/KokkosBlas3_trmm.hpp @@ -27,9 +27,11 @@ namespace KokkosBlas { -/// \brief Solve triangular linear system with multiple RHSs: +/// \brief Triangular matrix multiply: +/// /// B = alpha * op(A) * B if side == "L" or "l" /// B = alpha * B * op(A) if side == "R" or "r" +/// /// This function is currently blocking when running the native implementation /// which only has a serial implementation. /// diff --git a/blas/src/KokkosBlas_trtri.hpp b/blas/src/KokkosBlas_trtri.hpp index 22556bc35a..b1a34f0483 100644 --- a/blas/src/KokkosBlas_trtri.hpp +++ b/blas/src/KokkosBlas_trtri.hpp @@ -28,6 +28,7 @@ namespace KokkosBlas { /// \brief Find the inverse of the triangular matrix, A +/// /// A = inv(A) /// /// \tparam AViewType Input matrix, as a 2-D Kokkos::View diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst index cac0dd6bc2..fea3dc252a 100644 --- a/docs/developer/apidocs/blas3.rst +++ b/docs/developer/apidocs/blas3.rst @@ -11,6 +11,10 @@ trmm .. 
doxygenfunction:: KokkosBlas::trmm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) .. doxygenfunction:: KokkosBlas::trmm(const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) +trtri +----- +.. doxygenfunction:: KokkosBlas::trtri + trsm ---- .. doxygenfunction:: KokkosBlas::trsm(const execution_space& space, const char side[], const char uplo[], const char trans[], const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, const BViewType& B) From eb92728a666d0ee944012b35a747bfde5ff668ed Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 8 Jun 2023 13:33:51 -0600 Subject: [PATCH 422/442] batched/unit_test: Optionally skip simd dcomplex4 --- batched/dense/unit_test/Test_Batched_VectorView.hpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/batched/dense/unit_test/Test_Batched_VectorView.hpp b/batched/dense/unit_test/Test_Batched_VectorView.hpp index 9c0e9845d9..793c4ac3f3 100644 --- a/batched/dense/unit_test/Test_Batched_VectorView.hpp +++ b/batched/dense/unit_test/Test_Batched_VectorView.hpp @@ -382,9 +382,19 @@ TEST_F(TestCategory, batched_vector_view_simd_scomplex8) { TEST_F(TestCategory, batched_vector_view_simd_dcomplex2) { test_batched_vector_view >, 2>(); } + +#if defined(KOKKOS_COMPILER_INTEL) && \ + ((KOKKOS_COMPILER_INTEL > 1900) && (KOKKOS_COMPILER_INTEL <= 2021)) +TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { + printf( + "Skipped: intel compiler version > 19.0.05 && <= 2021\n" + "See https://github.com/kokkos/kokkos-kernels/issues/1673."); +} +#else TEST_F(TestCategory, batched_vector_view_simd_dcomplex4) { test_batched_vector_view >, 4>(); } -#endif +#endif // KOKKOS_COMPILER_INTEL +#endif // KOKKOSKERNELS_INST_COMPLEX_DOUBLE #endif // 
check to not include this in a device test From 478a56b53b77f172276c3a54574ea1bce24d3e73 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Mon, 12 Jun 2023 09:41:02 -0600 Subject: [PATCH 423/442] use host pointer mode in rocBLAS scal --- blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 4771f0862e..da11555f7b 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -299,7 +299,7 @@ namespace Impl { KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN( \ s.handle, N, reinterpret_cast(&alpha), \ reinterpret_cast(R.data()), one)); \ From 6d79eaf5d126cafb97c40b764aa3ab871a6ee602 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 12 Jun 2023 10:24:13 -0600 Subject: [PATCH 424/442] sparse/src: Work around gnu compiler bug --- sparse/src/KokkosSparse_BsrMatrix.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index b077215635..bb45d6070f 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -640,7 +640,7 @@ class BsrMatrix { /// data in each row). /// \param cols [in/out] The column indices. /// \param blockDimIn [in] The block dimensions. 
- BsrMatrix([[maybe_unused]] const std::string& label, const OrdinalType nrows, + BsrMatrix(const std::string& label [[maybe_unused]], const OrdinalType nrows, const OrdinalType ncols, [[maybe_unused]] const size_type annz, const values_type& vals, const row_map_type& rows, const index_type& cols, const OrdinalType blockDimIn) From ce8bb989f2100cafafd4d9529203c00209ca57e1 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 12 Jun 2023 10:34:43 -0600 Subject: [PATCH 425/442] Benchmark cleanup for par_ilut and spmv (#1853) 1) Make better use of google benchmark in par_ilut benchmark 2) Re-enable par_ilut benchmark, warnings should be fixed 3) Add new KK_USER_REQUIRE macros to KokkosKernels_Error. Makes it convenient to check and throw a good exception for user errors. 4) Fix option help in spmv benchmark (was saying -s instead of -n) 5) Fix option parsing in spmv benchmark. Bad options were printed to cerr but program execution was not halted. 6) Fix google bm argument list in spmv benchmark. It was always reporting n=100000 even when that was not the case. It will still report an incorrect n if an input A is used, but this is a step in the right direction. This requires run_spmv to take a spmv_parameters object instead of argc/argv. 7) Use register_benchmark_real_time in spmv benchmark. The code there was identical to it. --- common/src/KokkosKernels_Error.hpp | 6 + perf_test/sparse/CMakeLists.txt | 28 ++-- perf_test/sparse/KokkosSparse_par_ilut.cpp | 149 ++++++------------ .../sparse/KokkosSparse_spmv_benchmark.cpp | 41 ++--- 4 files changed, 83 insertions(+), 141 deletions(-) diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 9ebb104378..4d732a8437 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -62,6 +62,8 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, * * For _MSG checks, the msg argument can contain '<<' if not a kernel check.
* + * KK_USER_REQUIRE* are for checking user inputs + * * This code is adapted from EKAT/src/ekat/ekat_assert.hpp */ @@ -103,6 +105,10 @@ inline void hip_internal_safe_call(hipError_t e, const char *name, #define KK_REQUIRE_MSG(condition, msg) \ IMPL_THROW(condition, msg, std::logic_error) +#define KK_USER_REQUIRE(condition) IMPL_THROW(condition, "", std::runtime_error) +#define KK_USER_REQUIRE_MSG(condition, msg) \ + IMPL_THROW(condition, msg, std::runtime_error) + #define KK_KERNEL_REQUIRE(condition) IMPL_KERNEL_THROW(condition, "") #define KK_KERNEL_REQUIRE_MSG(condition, msg) IMPL_KERNEL_THROW(condition, msg) diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index c9bd79c92f..263f59671a 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -117,20 +117,20 @@ KOKKOSKERNELS_ADD_EXECUTABLE( ) if (KokkosKernels_ENABLE_BENCHMARK) -# KOKKOSKERNELS_ADD_BENCHMARK( -# sparse_par_ilut -# SOURCES KokkosSparse_par_ilut.cpp -# ) - -# # Provide -DGinkgo_DIR to cmake to enable the ginkgo test in sparse_par_ilut. Ginkgo_DIR should -# # point to the dir in the ginkgo install area that contains the GinkgoConfig.cmake file. -# # For me, this was $gingko_install_dir/lib64/cmake/Ginkgo -# if (Ginkgo_DIR) -# find_package(Ginkgo REQUIRED) - -# target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO") -# target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo) -# endif() + KOKKOSKERNELS_ADD_BENCHMARK( + sparse_par_ilut + SOURCES KokkosSparse_par_ilut.cpp + ) + + # Provide -DGinkgo_DIR to cmake to enable the ginkgo test in sparse_par_ilut. Ginkgo_DIR should + # point to the dir in the ginkgo install area that contains the GinkgoConfig.cmake file. 
+ # For me, this was $gingko_install_dir/lib64/cmake/Ginkgo + if (Ginkgo_DIR) + find_package(Ginkgo REQUIRED) + + target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO") + target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo) + endif() KOKKOSKERNELS_ADD_BENCHMARK( sparse_spmv_benchmark SOURCES KokkosSparse_spmv_benchmark.cpp diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp index 44557a5a51..ef144f2817 100644 --- a/perf_test/sparse/KokkosSparse_par_ilut.cpp +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -35,6 +35,7 @@ #include "KokkosKernels_default_types.hpp" #include #include +#include "KokkosKernels_perf_test_utilities.hpp" #include "Benchmark_Context.hpp" #include @@ -70,36 +71,8 @@ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; using float_t = typename Kokkos::ArithTraits::mag_type; -/////////////////////////////////////////////////////////////////////////////// -template -void time_call(L& lam, State& state, const std::string& name) -/////////////////////////////////////////////////////////////////////////////// -{ - Kokkos::Timer timer; - double min_time = std::numeric_limits::infinity(); - double max_time = 0.0; - double ave_time = 0.0; - - for (auto _ : state) { - // Run timable thing - double time = lam(); - - // Record time - ave_time += time; - if (time > max_time) max_time = time; - if (time < min_time) min_time = time; - state.SetIterationTime(time); - - // Report run so user knows something is happening - std::cout << name << " Finished a run in: " << time << " seconds" - << std::endl; - } - - std::cout << name << " LOOP_AVG_TIME: " << ave_time / state.iterations() - << std::endl; - std::cout << name << " LOOP_MAX_TIME: " << max_time << std::endl; - std::cout << name << " LOOP_MIN_TIME: " << min_time << std::endl; -} +static constexpr bool IS_GPU = + 
KokkosKernels::Impl::kk_is_gpu_exec_space(); /////////////////////////////////////////////////////////////////////////////// void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, @@ -125,9 +98,8 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, EntriesType U_entries("U_entries", 0); ValuesType U_values("U_values", 0); - auto plambda = [&]() { - Kokkos::Timer timer; - timer.reset(); + for (auto _ : state) { + state.ResumeTiming(); par_ilut_symbolic(&kh, A_row_map, A_entries, L_row_map, U_row_map); size_type nnzL = par_ilut_handle->get_nnzL(); @@ -145,7 +117,7 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, par_ilut_numeric(&kh, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); Kokkos::fence(); - const double time = timer.seconds(); + state.PauseTiming(); // Check worked num_iters = par_ilut_handle->get_num_iters(); @@ -155,30 +127,26 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, // Reset inputs Kokkos::deep_copy(L_row_map, 0); Kokkos::deep_copy(U_row_map, 0); - - // Return time - return time; - }; - - time_call(plambda, state, "PAR_ILUT"); + } } #ifdef USE_GINKGO /////////////////////////////////////////////////////////////////////////////// using ginkgo_exec = - std::conditional_t(), - gko::CudaExecutor, gko::OmpExecutor>; + std::conditional_t; template std::shared_ptr get_ginkgo_exec() { return GinkgoT::create(); } +#ifdef KOKKOS_ENABLE_CUDA template <> std::shared_ptr get_ginkgo_exec() { auto ref_exec = gko::ReferenceExecutor::create(); return gko::CudaExecutor::create(0 /*device id*/, ref_exec); } +#endif /////////////////////////////////////////////////////////////////////////////// @@ -217,22 +185,14 @@ void run_par_ilut_test_ginkgo(benchmark::State& state, KernelHandle& kh, std::shared_ptr a_mtx = std::move(a_mtx_uniq); - auto plambda = [&]() { - Kokkos::Timer timer; - timer.reset(); - + for (auto _ : state) { auto fact = 
gko::factorization::ParIlut::build() .with_fill_in_limit(par_ilut_handle->get_fill_in_limit()) .with_approximate_select(false) .with_iterations(num_iters) .on(exec) ->generate(a_mtx); - - // Return time - return timer.seconds(); - }; - - time_call(plambda, state, "GINKGO"); + } } #endif @@ -267,16 +227,14 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh, EntriesType U_entries("U_entries", handle_nnz); ValuesType U_values("U_values", handle_nnz); - auto plambda = [&]() { - Kokkos::Timer timer; - double time; - timer.reset(); + for (auto _ : state) { + if (measure_symbolic) { + state.ResumeTiming(); + } spiluk_symbolic(&kh, fill_lev, A_row_map, A_entries, L_row_map, L_entries, U_row_map, U_entries); Kokkos::fence(); - if (measure_symbolic) { - time = timer.seconds(); - } + state.PauseTiming(); const size_type nnzL = spiluk_handle->get_nnzL(); const size_type nnzU = spiluk_handle->get_nnzU(); @@ -287,11 +245,11 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh, Kokkos::resize(U_values, nnzU); if (!measure_symbolic) { - timer.reset(); + state.ResumeTiming(); spiluk_numeric(&kh, fill_lev, A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); Kokkos::fence(); - time = timer.seconds(); + state.PauseTiming(); } // Reset inputs @@ -305,13 +263,7 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh, Kokkos::resize(U_entries, handle_nnz); spiluk_handle->reset_handle(rows, handle_nnz, handle_nnz); - - return time; - }; - - std::string name = - std::string("SPILUK_") + (measure_symbolic ? 
"SYM" : "NUM"); - time_call(plambda, state, name); + } } /////////////////////////////////////////////////////////////////////////////// @@ -379,8 +331,8 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, auto plambda = [&](benchmark::State& state) { run_par_ilut_test(state, kh, A, num_iters); }; - KokkosKernelsBenchmark::register_benchmark((name + "_par_ilut").c_str(), - plambda, arg_names, args, loop); + KokkosKernelsBenchmark::register_benchmark_real_time( + (name + "_par_ilut").c_str(), plambda, arg_names, args, loop); } #ifdef USE_GINKGO @@ -388,8 +340,8 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, auto glambda = [&](benchmark::State& state) { run_par_ilut_test_ginkgo(state, kh, A, num_iters); }; - KokkosKernelsBenchmark::register_benchmark((name + "_gingko").c_str(), - glambda, arg_names, args, loop); + KokkosKernelsBenchmark::register_benchmark_real_time( + (name + "_gingko").c_str(), glambda, arg_names, args, loop); } #endif @@ -400,10 +352,10 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, auto s2lambda = [&](benchmark::State& state) { run_spiluk_test(state, kh, A, team_size, false); }; - KokkosKernelsBenchmark::register_benchmark( + KokkosKernelsBenchmark::register_benchmark_real_time( (name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop); - KokkosKernelsBenchmark::register_benchmark( + KokkosKernelsBenchmark::register_benchmark_real_time( (name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop); } @@ -413,8 +365,6 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, return 0; } -} // namespace - /////////////////////////////////////////////////////////////////////////////// void print_help_par_ilut() /////////////////////////////////////////////////////////////////////////////// @@ -443,16 +393,13 @@ void handle_int_arg(int argc, char** argv, int& i, { std::string arg = argv[i]; auto it = option_map.find(arg); - if (it == option_map.end()) { - throw 
std::runtime_error(std::string("Unknown option: ") + arg); - } - if (i + 1 == argc) { - throw std::runtime_error(std::string("Missing option value for option: ") + - arg); - } + KK_USER_REQUIRE_MSG(it != option_map.end(), "Unknown option: " << arg); + KK_USER_REQUIRE_MSG(i + 1 < argc, "Missing option value for option: " << arg); *(it->second) = atoi(argv[++i]); } +} // namespace + /////////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) /////////////////////////////////////////////////////////////////////////////// @@ -463,18 +410,23 @@ int main(int argc, char** argv) -1; // depends on other options, so don't set to default yet int bandwidth = -1; int team_size = -1; - int loop = 4; int test = 7; - std::map option_map = { - {"-n", &rows}, {"-z", &nnz_per_row}, {"-b", &bandwidth}, - {"-ts", &team_size}, {"-l", &loop}, {"-t", &test}}; + std::map option_map = {{"-n", &rows}, + {"-z", &nnz_per_row}, + {"-b", &bandwidth}, + {"-ts", &team_size}, + {"-t", &test}}; if (argc == 1) { print_help_par_ilut(); return 0; } + // Handle common params + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + // Handle user options for (int i = 1; i < argc; i++) { if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { @@ -490,19 +442,16 @@ int main(int argc, char** argv) // Determine where A is coming from if (rows != -1) { // We are randomly generating the input A - if (rows < 100) { - throw std::runtime_error("Need to have at least 100 rows"); - } - if (mfile != "") { - throw std::runtime_error( - "Need provide either -n or -f argument to this program, not both"); - } + KK_USER_REQUIRE_MSG(rows >= 100, "Need to have at least 100 rows"); + + KK_USER_REQUIRE_MSG( + mfile == "", + "Need provide either -n or -f argument to this program, not both"); } else { // We are reading A from a file - if (mfile == "") { - throw std::runtime_error( - "Need provide either -n 
or -f argument to this program"); - } + KK_USER_REQUIRE_MSG( + mfile != "", + "Need provide either -n or -f argument to this program, not both"); } // Set dependent defaults. Default team_size cannot be set @@ -520,8 +469,8 @@ int main(int argc, char** argv) benchmark::SetDefaultTimeUnit(benchmark::kSecond); KokkosKernelsBenchmark::add_benchmark_context(true); - test_par_ilut_perf(mfile, rows, nnz_per_row, bandwidth, team_size, loop, - test); + test_par_ilut_perf(mfile, rows, nnz_per_row, bandwidth, team_size, + common_params.repeat, test); benchmark::Shutdown(); } diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 523e33b72d..aeaa37db96 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -52,7 +52,7 @@ void print_options() { std::cerr << "\t[Optional] --repeat :: how many times to repeat overall test" << std::endl; - std::cerr << " -s [N] :: generate a semi-random banded (band size " + std::cerr << " -n [N] :: generate a semi-random banded (band size " "0.01xN)\n" "NxN matrix with average of 10 entries per row." 
<< std::endl; @@ -74,7 +74,7 @@ void print_options() { << std::endl; } // print_options -int parse_inputs(int argc, char** argv, spmv_parameters& params) { +void parse_inputs(int argc, char** argv, spmv_parameters& params) { for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { ++i; @@ -94,27 +94,19 @@ int parse_inputs(int argc, char** argv, spmv_parameters& params) { params.offset)) { ++i; } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; print_options(); - return 1; + KK_USER_REQUIRE_MSG(false, "Unrecognized command line argument #" + << i << ": " << argv[i]); } } - return 0; } // parse_inputs -} // namespace - template -void run_spmv(benchmark::State& state, int argc, char** argv) { +void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { using matrix_type = KokkosSparse::CrsMatrix; using mv_type = Kokkos::View; - // Set input parameters - spmv_parameters inputs(state.range(0)); - parse_inputs(argc, argv, inputs); - KokkosKernels::Experimental::Controls controls; if ((inputs.alg == "default") || (inputs.alg == "native") || (inputs.alg == "merge")) { @@ -148,6 +140,8 @@ void run_spmv(benchmark::State& state, int argc, char** argv) { } } +} // namespace + int main(int argc, char** argv) { Kokkos::initialize(argc, argv); @@ -160,21 +154,14 @@ int main(int argc, char** argv) { std::string bench_name = "KokkosSparse_spmv"; - if (0 < common_params.repeat) { - benchmark::RegisterBenchmark( - bench_name.c_str(), run_spmv, argc, argv) - ->UseRealTime() - ->ArgNames({"n"}) - ->Args({100000}) - ->Iterations(common_params.repeat); - } else { - benchmark::RegisterBenchmark( - bench_name.c_str(), run_spmv, argc, argv) - ->UseRealTime() - ->ArgNames({"n"}) - ->Args({100000}); - } + // Set input parameters, default to random 100000x100000 + spmv_parameters inputs(100000); + parse_inputs(argc, argv, inputs); + // Google benchmark will report the wrong n if an input file 
matrix is used. + KokkosKernelsBenchmark::register_benchmark_real_time( + bench_name.c_str(), run_spmv, {"n"}, + {inputs.N}, common_params.repeat, inputs); benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); From f75ec31ce4faee34bc2a788bf1aab021a72f6616 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 12 Jun 2023 12:12:39 -0600 Subject: [PATCH 426/442] sparse/src: Add ifdef for doxygen < v1.9.7 --- sparse/src/KokkosSparse_BsrMatrix.hpp | 44 +++++++++++++++------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index bb45d6070f..a366245a86 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -625,25 +625,31 @@ class BsrMatrix { graph = staticcrsgraph_type(entries_device, row_map_device); } - /// \brief Constructor that accepts a row map, column indices, and - /// values. - /// - /// The matrix will store and use the row map, indices, and values - /// directly (by view, not by deep copy). - /// - /// \param label - /// \param nrows [in] The number of rows. - /// \param ncols [in] The number of columns. - /// \param annz [in] Filler for annz. - /// \param vals [in/out] The entries. - /// \param rows [in/out] The row map (containing the offsets to the - /// data in each row). - /// \param cols [in/out] The column indices. - /// \param blockDimIn [in] The block dimensions. - BsrMatrix(const std::string& label [[maybe_unused]], const OrdinalType nrows, - const OrdinalType ncols, [[maybe_unused]] const size_type annz, - const values_type& vals, const row_map_type& rows, - const index_type& cols, const OrdinalType blockDimIn) +/// \brief Constructor that accepts a row map, column indices, and +/// values. +/// +/// The matrix will store and use the row map, indices, and values +/// directly (by view, not by deep copy). +/// +/// \param label +/// \param nrows [in] The number of rows.
+/// \param ncols [in] The number of columns. +/// \param annz [in] Filler for annz. +/// \param vals [in/out] The entries. +/// \param rows [in/out] The row map (containing the offsets to the +/// data in each row). +/// \param cols [in/out] The column indices. +/// \param blockDimIn [in] The block dimensions. +#if defined(DOXY) + BsrMatrix([[maybe_unused]] const std::string& label, +#else + // Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81429. + BsrMatrix(const std::string& label [[maybe_unused]], +#endif + const OrdinalType nrows, const OrdinalType ncols, + [[maybe_unused]] const size_type annz, const values_type& vals, + const row_map_type& rows, const index_type& cols, + const OrdinalType blockDimIn) : graph(cols, rows), values(vals), numCols_(ncols), From 71f04ce8a2da0a0e93705c278f6f80aaf51d9718 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 12 Jun 2023 12:44:56 -0700 Subject: [PATCH 427/442] Workaround checking OMP_NUM_THREADS with number of streams --- sparse/src/KokkosSparse_BsrMatrix.hpp | 6 +- sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG | 1085 ++++++++++++++++++++ sparse/unit_test/Test_Sparse_spiluk.hpp | 17 + sparse/unit_test/Test_Sparse_sptrsv.hpp | 17 + 4 files changed, 1124 insertions(+), 1 deletion(-) create mode 100644 sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index b077215635..f073c93336 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -640,7 +640,11 @@ class BsrMatrix { /// data in each row). /// \param cols [in/out] The column indices. /// \param blockDimIn [in] The block dimensions. 
- BsrMatrix([[maybe_unused]] const std::string& label, const OrdinalType nrows, + //BsrMatrix(/*[[maybe_unused]]*/ const std::string& label, const OrdinalType nrows, + // const OrdinalType ncols, /*[[maybe_unused]]*/ const size_type annz, + // const values_type& vals, const row_map_type& rows, + // const index_type& cols, const OrdinalType blockDimIn) + BsrMatrix(const std::string& label [[maybe_unused]], const OrdinalType nrows, const OrdinalType ncols, [[maybe_unused]] const size_type annz, const values_type& vals, const row_map_type& rows, const index_type& cols, const OrdinalType blockDimIn) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG b/sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG new file mode 100644 index 0000000000..b077215635 --- /dev/null +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG @@ -0,0 +1,1085 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosSparse_BsrMatrix.hpp +/// \brief Local sparse matrix interface +/// +/// This file provides KokkosSparse::Experimental::BsrMatrix. +/// This implements a local (no MPI) sparse matrix stored in block-by-block +/// compressed row sparse format. 
+ +#ifndef KOKKOS_SPARSE_BSRMATRIX_HPP_ +#define KOKKOS_SPARSE_BSRMATRIX_HPP_ + +#include +#include +#include +#include + +#include "Kokkos_Core.hpp" +#include "Kokkos_StaticCrsGraph.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosKernels_Error.hpp" + +namespace KokkosSparse { + +namespace Experimental { + +template +struct BsrRowView { + //! The type of the values in the row. + typedef typename MatrixType::value_type value_type; + //! The type of the column indices in the row. + typedef typename MatrixType::ordinal_type ordinal_type; + //! The type for returned block of values. + typedef Kokkos::View + block_values_type; + + private: + //! Array of values in the row. + value_type* values_; + //! Array of (local) column indices in the row. + ordinal_type* colidx_; + /// \brief Stride between successive rows in a block. + /// + const ordinal_type blockDim_; + + public: + /// \brief Constructor + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param blockDim [in] (Constant) stride between block rows + /// within a block-row in the above arrays. + /// \param count [in] Number of blocks in the desired block-row. + // + // Assumes values and colidx already offset to the correct location + KOKKOS_INLINE_FUNCTION + BsrRowView(value_type* const values, ordinal_type* const colidx, + const ordinal_type& blockDim, const ordinal_type& count) + : values_(values), colidx_(colidx), blockDim_(blockDim), length(count) {} + + /// \brief Constructor with offset into \c colidx array + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param blockDim [in] (Constant) stride between rows in + /// within a block in the above arrays. + /// \param count [in] Number of blocks in the desired block-row + /// \param start [in] Offset into values and colidx of the desired block-row + /// start. 
+ /// Note: The offset into the values array for a block-row equals + /// num_blocks_prior_to_block-row*blockDim*blockDim + /// + /// \tparam OffsetType The type of \c start (see above). Must be a + /// built-in integer type. This may differ from ordinal_type. + /// For example, the matrix may have dimensions that fit in int, + /// but a number of entries that does not fit in int. + template + KOKKOS_INLINE_FUNCTION BsrRowView( + const typename MatrixType::values_type& values, + const typename MatrixType::index_type& colidx, + const ordinal_type& blockDim, const ordinal_type& count, + const OffsetType& start, + const typename std::enable_if::value, + int>::type& = 0) + : values_(&values(start * blockDim * blockDim)), + colidx_(&colidx(start)), + blockDim_(blockDim), + length(count) {} + + /// \brief Number of entries (i.e. blocks) in the row. + /// + /// This is a public const field rather than a public const method, + /// in order to avoid possible overhead of a method call if the + /// compiler is unable to inline that method call. + /// + /// We assume that rows contain no duplicate entries (i.e., entries + /// with the same column index). Thus, a row may have up to + /// A.numCols() entries. This means that the correct type of + /// 'length' is ordinal_type. 
+ /// Here, length refers to the number of blocks in a block-row + const ordinal_type length; + + /// /brief Return a pointer offset to local row i of block K of values_ array; + /// user responsible for indexing into this pointer correctly + /// \param K [in] must be the LOCAL block index within this block-row + /// \param i [in] must be the LOCAL row index offset within this block-row + /// + /// Output: pointer to values_ array at start of local row within block K + /// + /// Pointer interfaces are NOT guaranteed for backward compatibility + /// This interface is intended for performant kernels, not common usage + KOKKOS_INLINE_FUNCTION + value_type* local_row_in_block(const ordinal_type& K, + const ordinal_type& i) const { + return (values_ + (K * blockDim_ * blockDim_ + i * blockDim_)); + } + + /// \brief Return the value at a specified block K of block-row + /// with local row and col offset (i,j) + /// \param K [in] must be the LOCAL block index within this block-row + /// \param i [in] must be the LOCAL row index offset within this block-row + /// \param j [in] must be the LOCAL col index offset within this block-row + /// + /// Output: reference to value_type at the given (K, i, j) offset into values_ + KOKKOS_INLINE_FUNCTION + value_type& local_block_value(const ordinal_type& K, const ordinal_type& i, + const ordinal_type& j) const { + return values_[K * blockDim_ * blockDim_ + i * blockDim_ + j]; + } + + /// \brief Return unmanaged 2D strided View wrapping local block K from this + /// block-row + /// \param K [in] must be the LOCAL block index within this + /// block-row + KOKKOS_INLINE_FUNCTION + block_values_type block(const ordinal_type& K) const { + return block_values_type(&(values_[K * blockDim_ * blockDim_]), + Kokkos::LayoutRight(blockDim_, blockDim_)); + } + + /// \brief Return offset into colidx_ for the requested block idx + /// If none found, return Kokkos::ArithTraits::max + /// \param idx_to_match [in] local block idx within block-row + 
KOKKOS_INLINE_FUNCTION + ordinal_type findRelBlockOffset(const ordinal_type idx_to_match, + bool /*is_sorted*/ = false) const { + ordinal_type offset = Kokkos::ArithTraits::max(); + for (ordinal_type blk_offset = 0; blk_offset < length; ++blk_offset) { + ordinal_type idx = colidx_[blk_offset]; + if (idx == idx_to_match) { + offset = blk_offset; + break; + } // return relative offset + } + return offset; + } +}; + +template +struct BsrRowViewConst { + //! The type of the values in the row. + typedef const typename MatrixType::non_const_value_type value_type; + //! The type of the column indices in the row. + typedef const typename MatrixType::non_const_ordinal_type ordinal_type; + //! The type for returned block of values. + typedef Kokkos::View + block_values_type; + + private: + //! Array of values in the row. + value_type* values_; + //! Array of (local) column indices in the row. + ordinal_type* colidx_; + /// \brief Stride between successive rows in a block + const ordinal_type blockDim_; + + public: + /// \brief Constructor + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param blockDim [in] (Constant) stride between block rows + /// within a block-row in the above arrays. + /// \param count [in] Number of entries in the row. + // + // Assumes values and colidx already offset to the correct location + KOKKOS_INLINE_FUNCTION + BsrRowViewConst(value_type* const values, ordinal_type* const colidx, + const ordinal_type& blockDim, const ordinal_type& count) + : values_(values), colidx_(colidx), blockDim_(blockDim), length(count) {} + + /// \brief Constructor with offset into \c colidx array + /// + /// \param values [in] Array of the row's values. + /// \param colidx [in] Array of the row's column indices. + /// \param blockDim [in] The block dimensions. + /// \param count [in] Number of entries in the row. 
+ /// \param start [in] Offset into values and colidx of the desired block-row + /// start. + /// Note: The offset into the values array for a block-row equals + /// num_blocks_prior_to_block-row*blockDim*blockDim + /// + /// \tparam OffsetType The type of \c start (see above). Must be a + /// built-in integer type. This may differ from ordinal_type. + /// For example, the matrix may have dimensions that fit in int, + /// but a number of entries that does not fit in int. + template + KOKKOS_INLINE_FUNCTION BsrRowViewConst( + const typename MatrixType::values_type& values, + const typename MatrixType::index_type& colidx, + const ordinal_type& blockDim, const ordinal_type& count, + const OffsetType& start, + const typename std::enable_if::value, + int>::type& = 0) + : values_(&values(start * blockDim * blockDim)), + colidx_(&colidx(start)), + blockDim_(blockDim), + length(count) {} + + /// \brief Number of entries (i.e. blocks) in the row. + /// + /// This is a public const field rather than a public const method, + /// in order to avoid possible overhead of a method call if the + /// compiler is unable to inline that method call. + /// + /// We assume that rows contain no duplicate entries (i.e., entries + /// with the same column index). Thus, a row may have up to + /// A.numCols() entries. This means that the correct type of + /// 'length' is ordinal_type. 
+ const ordinal_type length; + + /// /brief Return a pointer offset to local row i of block K of values_ array; + /// user responsible for indexing into this pointer correctly + /// \param K [in] must be the LOCAL block index within this block-row + /// \param i [in] must be the LOCAL row index offset within this block-row + /// + /// Output: pointer to values_ array at start of local row within block K + /// + /// Pointer interfaces are NOT guaranteed for backward compatibility + /// This interface is intended for performant kernels, not common usage + KOKKOS_INLINE_FUNCTION + value_type* local_row_in_block(const ordinal_type& K, + const ordinal_type& i) const { + return (values_ + (K * blockDim_ * blockDim_ + i * blockDim_)); + } + + /// \brief Return the value at a specified block K with local row and col ids + /// (i,j) \param K [in] must be the LOCAL block index within this block-row + /// \param i [in] must be the LOCAL row index offset within this block-row + /// \param j [in] must be the LOCAL col index offset within this block-row + /// + /// Output: reference to value_type at the given (K, i, j) offset into values_ + KOKKOS_INLINE_FUNCTION + value_type& local_block_value(const ordinal_type& K, const ordinal_type& i, + const ordinal_type& j) const { + return values_[K * blockDim_ * blockDim_ + i * blockDim_ + j]; + } + + /// \brief Return the block column index for a specified block K + /// + /// \param K [in] must be the LOCAL block index within this block-row + /// \return Block column index for "uncompressed" block row + KOKKOS_INLINE_FUNCTION + ordinal_type block_colidx(const ordinal_type K) const { return colidx_[K]; } + + /// \brief Return unmanaged 2D strided View wrapping local block K from this + /// block-row \param K [in] must be the LOCAL block index within this + /// block-row + KOKKOS_INLINE_FUNCTION + block_values_type block(const ordinal_type& K) const { + return block_values_type(&(values_[K * blockDim_ * blockDim_]), + 
Kokkos::LayoutRight(blockDim_, blockDim_)); + } + + /// \brief Return offset into colidx_ for the requested block idx + /// If none found, return Kokkos::ArithTraits::max + /// \param idx_to_match [in] local block idx within block-row + KOKKOS_INLINE_FUNCTION + ordinal_type findRelBlockOffset(const ordinal_type& idx_to_match, + bool /*is_sorted*/ = false) const { + typedef typename std::remove_cv::type non_const_ordinal_type; + non_const_ordinal_type offset = + Kokkos::ArithTraits::max(); + for (non_const_ordinal_type blk_offset = 0; blk_offset < length; + ++blk_offset) { + ordinal_type idx = colidx_[blk_offset]; + if (idx == idx_to_match) { + offset = blk_offset; + break; + } // return relative offset + } + return offset; + } +}; + +/// \class BsrMatrix +/// \brief Compressed sparse row implementation of a sparse matrix. +/// \tparam ScalarType The type of entries in the sparse matrix. +/// \tparam OrdinalType The type of column indices in the sparse matrix. +/// \tparam Device The Kokkos Device type. +/// \tparam MemoryTraits Traits describing how Kokkos manages and +/// accesses data. The default parameter suffices for most users. +/// +/// "Crs" stands for "compressed row sparse." This is the phrase +/// Trilinos traditionally uses to describe compressed sparse row +/// storage for sparse matrices, as described, for example, in Saad +/// (2nd ed.). +template ::size_type> +class BsrMatrix { + static_assert( + std::is_signed::value, + "BsrMatrix requires that OrdinalType is a signed integer type."); + static_assert(Kokkos::is_memory_traits_v || + std::is_void_v, + "BsrMatrix: MemoryTraits (4th template param) must be a Kokkos " + "MemoryTraits or void"); + + private: + typedef + typename Kokkos::ViewTraits::host_mirror_space host_mirror_space; + + public: + //! Type of the matrix's execution space. + typedef typename Device::execution_space execution_space; + //! Type of the matrix's memory space. + typedef typename Device::memory_space memory_space; + //! 
Type of the matrix's device type. + typedef Kokkos::Device device_type; + + //! Type of each value in the matrix. + typedef ScalarType value_type; + //! Type of each (column) index in the matrix. + typedef OrdinalType ordinal_type; + typedef MemoryTraits memory_traits; + /// \brief Type of each entry of the "row map." + /// + /// The "row map" corresponds to the \c ptr array of row offsets in + /// compressed sparse row (CSR) storage. + typedef SizeType size_type; + + //! Type of a host-memory mirror of the sparse matrix. + typedef BsrMatrix + HostMirror; + //! Type of the graph structure of the sparse matrix. + typedef Kokkos::StaticCrsGraph + StaticCrsGraphType; + //! Type of the graph structure of the sparse matrix - consistent with Kokkos. + typedef Kokkos::StaticCrsGraph + staticcrsgraph_type; + //! Type of column indices in the sparse matrix. + typedef typename staticcrsgraph_type::entries_type index_type; + //! Const version of the type of column indices in the sparse matrix. + typedef typename index_type::const_value_type const_ordinal_type; + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename index_type::non_const_value_type non_const_ordinal_type; + //! Type of the "row map" (which contains the offset for each row's data). + typedef typename staticcrsgraph_type::row_map_type row_map_type; + //! Const version of the type of row offsets in the sparse matrix. + typedef typename row_map_type::const_value_type const_size_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename row_map_type::non_const_value_type non_const_size_type; + //! Kokkos Array type of the entries (values) in the sparse matrix. + typedef Kokkos::View + values_type; + //! Const version of the type of the entries in the sparse matrix. + typedef typename values_type::const_value_type const_value_type; + //! Nonconst version of the type of the entries in the sparse matrix. 
+ typedef typename values_type::non_const_value_type non_const_value_type; + + // block values are actually a 1-D view, however they are implicitly + // arranged in LayoutRight, e.g. consecutive entries in the values view + // are consecutive entries within a row inside a block + using block_layout = Kokkos::LayoutRight; + + /// \name Storage of the actual sparsity structure and values. + /// + /// BsrMatrix uses the compressed sparse row (CSR) storage format to + /// store the sparse matrix. CSR is also called "compressed row + /// storage"; hence the name, which it inherits from Tpetra and from + /// Epetra before it. + //@{ + //! The graph (sparsity structure) of the sparse matrix. + staticcrsgraph_type graph; + //! The 1-D array of values of the sparse matrix. + values_type values; + //@} + + /// \brief Launch configuration that can be used by + /// overloads/specializations of MV_multiply(). + /// + /// This is a hack and needs to be replaced by a general + /// state mechanism. + DeviceConfig dev_config; + + /// \brief Default constructor; constructs an empty sparse matrix. + /// + /// mfh: numCols and nnz should be properties of the graph, not the matrix. + /// Then BsrMatrix needs methods to get these from the graph. + BsrMatrix() : graph(), values(), dev_config(), numCols_(0), blockDim_(1) {} + + //! Copy constructor (shallow copy). + template + explicit BsrMatrix(const BsrMatrix& B) + : graph(B.graph.entries, B.graph.row_map), + values(B.values), + dev_config(B.dev_config), + numCols_(B.numCols()), + blockDim_(B.blockDim()) { + graph.row_block_offsets = B.graph.row_block_offsets; + // MD: Changed the copy constructor of graph + // as the constructor of StaticCrsGraph does not allow copy from non const + // version. + } + + /// \brief Construct with a graph that will be shared. + /// + /// \param[in] arg_label The sparse matrix's label. + /// \param[in] arg_graph The graph between the blocks. + /// \param[in] blockDimIn The block size. 
+ /// + /// Allocate the values array for subsequent fill. + BsrMatrix(const std::string& arg_label, const staticcrsgraph_type& arg_graph, + const OrdinalType& blockDimIn) + : graph(arg_graph), + values(arg_label, + arg_graph.entries.extent(0) * blockDimIn * blockDimIn), + numCols_(maximum_entry(arg_graph) + 1), + blockDim_(blockDimIn) { + if (blockDim_ < 1) { + std::ostringstream os; + os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + } + + /// \brief Construct BsrMatrix from host data in COO format. + /// + /// The COO matrix must already have a block structure. + /// Each entry k of the input sparse matrix has a value stored in val[k], + /// row index in rows[k] and column index in cols[k]. + /// The COO data must be sorted by increasing row index + /// + /// This constructor is mainly useful for benchmarking or for + /// reading the sparse matrix's data from a file. + /// + /// \param label [in] The sparse matrix's label. + /// \param nrows [in] The number of rows. + /// \param ncols [in] The number of columns. + /// \param annz [in] The number of entries. + /// \param vals [in] The entries. + /// \param rows [in] The row indices. rows[k] is the row index of + /// val[k]. + /// \param cols [in] The column indices. cols[k] is the column + /// index of val[k]. + /// \param blockdim [in] The block size of the constructed BsrMatrix. + /// \param pad [in] If true, pad the sparse matrix's storage with + /// zeros in order to improve cache alignment and / or + /// vectorization. + /// + /// The \c pad argument is currently not used. 
+ BsrMatrix(const std::string& label, OrdinalType nrows, OrdinalType ncols, + size_type annz, ScalarType* vals, OrdinalType* rows, + OrdinalType* cols, OrdinalType blockdim, bool pad = false) { + (void)label; + (void)pad; + blockDim_ = blockdim; + + if (blockDim_ < 1) { + std::ostringstream os; + os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if (ncols % blockDim_) { + std::ostringstream os; + os << "BsrMatrix: " << ncols + << " input CrsMatrix columns is not a multiple of block size " + << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + if (nrows % blockDim_) { + std::ostringstream os; + os << "BsrMatrix: " << nrows + << " input CrsMatrix rows is not a multiple of block size " + << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + if (annz % (blockDim_ * blockDim_)) { + throw std::runtime_error( + "BsrMatrix:: annz should be a multiple of the number of entries in a " + "block"); + } + + using Coord = std::pair; // row, col + using CoordComp = std::function; // type that can order Coords + using Entry = std::pair; // (row, col), val + using Blocks = std::map, + CoordComp>; // map a block to its non-zeros, sorted + // by row, then col + + numCols_ = ncols / blockDim_; + ordinal_type numRows = nrows / blockDim_; + size_type numBlocks = annz / (blockDim_ * blockDim_); + + // device data + typename row_map_type::non_const_type row_map_device( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_device"), + numRows + 1); + index_type entries_device("entries_device", numBlocks); + Kokkos::resize(values, annz); + + // mirror views on host + auto row_map_host = Kokkos::create_mirror_view(row_map_device); + auto entries_host = Kokkos::create_mirror_view(entries_device); + auto values_host = Kokkos::create_mirror_view(values); + + auto coord_by_row_col = [](const Coord& a, const Coord& b) { + const auto& arow = std::get<0>(a); 
+ const auto& brow = std::get<0>(b); + const auto& acol = std::get<1>(a); + const auto& bcol = std::get<1>(b); + if (arow < brow) { + return true; + } else if (arow > brow) { + return false; + } else { + return acol < bcol; + } + }; + + auto entry_by_row_col = [coord_by_row_col](const Entry& a, const Entry& b) { + return coord_by_row_col(std::get<0>(a), std::get<0>(b)); + }; + + // organize all blocks and their entries + Blocks blocks(coord_by_row_col); + for (size_type i = 0; i < annz; ++i) { + const ordinal_type row = rows[i]; + const ordinal_type col = cols[i]; + const ScalarType val = vals[i]; + const Coord block = Coord(row / blockDim_, col / blockDim_); + const Entry entry(Coord(row, col), val); + + // add entry to the correct block + auto it = blocks.find(block); + if (it == blocks.end()) { + std::vector entries = {entry}; + entries.reserve(blockDim_ * blockDim_); + blocks[block] = std::move(entries); // new block with entry + } else { + it->second.push_back(entry); // add entry to block + } + } + + // write block data out to BSR format + ordinal_type row = 0; // current row we're in + size_t bi = 0; // how many blocks so far + for (auto& kv : blocks) { // iterating through blocks in row/col order + const Coord& block = kv.first; // block's position + auto& entries = kv.second; // non-zeros in the block + + if (OrdinalType(entries.size()) != blockDim_ * blockDim_) { + std::stringstream ss; + ss << "BsrMatrix: block " << block.first << "," << block.second + << " had only " << entries.size() << " non-zeros, expected " + << blockDim_ * blockDim_; + KokkosKernels::Impl::throw_runtime_exception(ss.str()); + } + + // update row-map if block is in a new row + for (; row < block.first; ++row) { + row_map_host(row + 1) = bi; // `row` ends at bi + } + + // record column of block + entries_host(bi) = block.second; // block's column + + // add contiguous entries of block sorted by row/col + std::sort(entries.begin(), entries.end(), entry_by_row_col); + for (size_type ei 
= 0; ei < size_type(entries.size()); ++ei) { + values_host(bi * blockDim_ * blockDim_ + ei) = std::get<1>(entries[ei]); + } + + // next block + ++bi; + } + // complete row map if last blocks are empty + for (; row < numRows + 1; ++row) { + row_map_host(row) = bi; + } + + // move graph data to the requested device + Kokkos::deep_copy(row_map_device, row_map_host); + Kokkos::deep_copy(entries_device, entries_host); + Kokkos::deep_copy(values, values_host); + + graph = staticcrsgraph_type(entries_device, row_map_device); + } + + /// \brief Constructor that accepts a row map, column indices, and + /// values. + /// + /// The matrix will store and use the row map, indices, and values + /// directly (by view, not by deep copy). + /// + /// \param label + /// \param nrows [in] The number of rows. + /// \param ncols [in] The number of columns. + /// \param annz [in] Filler for annz. + /// \param vals [in/out] The entries. + /// \param rows [in/out] The row map (containing the offsets to the + /// data in each row). + /// \param cols [in/out] The column indices. + /// \param blockDimIn [in] The block dimensions. + BsrMatrix([[maybe_unused]] const std::string& label, const OrdinalType nrows, + const OrdinalType ncols, [[maybe_unused]] const size_type annz, + const values_type& vals, const row_map_type& rows, + const index_type& cols, const OrdinalType blockDimIn) + : graph(cols, rows), + values(vals), + numCols_(ncols), + blockDim_(blockDimIn) { + if (blockDim_ < 1) { + std::ostringstream os; + os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + const ordinal_type actualNumRows = + (rows.extent(0) != 0) ? 
static_cast( + rows.extent(0) - static_cast(1)) + : static_cast(0); + if (nrows != actualNumRows) { + std::ostringstream os; + os << "Input argument nrows = " << nrows + << " != the actual number of " + "rows " + << actualNumRows << " according to the 'rows' input argument."; + throw std::invalid_argument(os.str()); + } + // nnz returns graph.entries.extent(0) i.e. ptr[ nrows + 1 ] nnz entry + // input annz is nnz of values, not comparable with block ptr 'nnz' i.e. + // numBlocks + if (blockDim_ <= 0) { + std::ostringstream os; + os << "Input argument blockDim = " << blockDim_ + << " is not larger than 0."; + throw std::invalid_argument(os.str()); + } + } + + /// \brief Constructor that accepts a a static graph, and values. + /// + /// The matrix will store and use the row map, indices, and values + /// directly (by view, not by deep copy). + /// + /// \param ncols [in] The number of columns. + /// \param vals [in] The entries. + /// \param graph_ [in] The graph between the blocks. + /// \param blockDimIn [in] The block dimensions. + BsrMatrix(const std::string& /*label*/, const OrdinalType& ncols, + const values_type& vals, const staticcrsgraph_type& graph_, + const OrdinalType& blockDimIn) + : graph(graph_), values(vals), numCols_(ncols), blockDim_(blockDimIn) { + if (blockDim_ < 1) { + std::ostringstream os; + os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + } + + /// \brief Constructor that accepts a CrsMatrix and block dimension, + /// assuming the provided CrsMatrix has appropriate block structure. 
+ template + BsrMatrix(const KokkosSparse::CrsMatrix& + crs_mtx, + const OrdinalType blockDimIn) { + typedef typename KokkosSparse::CrsMatrix + crs_matrix_type; + typedef typename crs_matrix_type::staticcrsgraph_type crs_graph_type; + typedef typename crs_graph_type::entries_type crs_graph_entries_type; + typedef typename crs_graph_type::row_map_type crs_graph_row_map_type; + + blockDim_ = blockDimIn; + if (blockDim_ < 1) { + std::ostringstream os; + os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + assert( + (crs_mtx.numCols() % blockDim_ == 0) && + "BsrMatrix: input CrsMatrix columns is not a multiple of block size"); + assert((crs_mtx.numRows() % blockDim_ == 0) && + "BsrMatrix: input CrsMatrix rows is not a multiple of block size"); + + numCols_ = crs_mtx.numCols() / blockDim_; + + OrdinalType nbrows = + crs_mtx.numRows() / + blockDim_; // actual number of block rows; add 1 for ptr length + + // block_rows will accumulate the number of blocks per row - this is NOT the + // row_map with cum sum!! + std::vector block_rows(nbrows, 0); + + typename crs_graph_row_map_type::HostMirror h_crs_row_map = + Kokkos::create_mirror_view(crs_mtx.graph.row_map); + Kokkos::deep_copy(h_crs_row_map, crs_mtx.graph.row_map); + typename crs_graph_entries_type::HostMirror h_crs_entries = + Kokkos::create_mirror_view(crs_mtx.graph.entries); + Kokkos::deep_copy(h_crs_entries, crs_mtx.graph.entries); + + // determine size of block cols indices == number of blocks, + // i.e. 
nnz for the block CRS graph + OrdinalType numBlocks = 0; + for (OrdinalType i = 0; i < crs_mtx.numRows(); i += blockDim_) { + std::set col_set; + for (auto ie = h_crs_row_map(i); ie < h_crs_row_map(i + blockDim_); + ++ie) { + col_set.insert(h_crs_entries(ie) / blockDim_); + } + numBlocks += col_set.size(); // cum sum + block_rows[i / blockDim_] = col_set.size(); // frequency counts + } + + // create_staticcrsgraph takes the frequency of blocks per row + // and returns the cum sum pointer row_map with nbrows+1 size, and total + // numBlocks in the final entry + graph = Kokkos::create_staticcrsgraph("blockgraph", + block_rows); + typename row_map_type::HostMirror h_row_map = + Kokkos::create_mirror_view(graph.row_map); + Kokkos::deep_copy(h_row_map, graph.row_map); + + typename index_type::HostMirror h_entries = + Kokkos::create_mirror_view(graph.entries); + + OrdinalType ientry = 0; + for (OrdinalType ib = 0; ib < nbrows; ++ib) { + auto ir_start = ib * blockDim_; + auto ir_stop = (ib + 1) * blockDim_; + std::set col_set; + for (auto jk = h_crs_row_map(ir_start); jk < h_crs_row_map(ir_stop); + ++jk) { + col_set.insert(h_crs_entries(jk) / blockDim_); + } + for (auto col_block : col_set) { + h_entries(ientry++) = col_block; + } + } + Kokkos::deep_copy(graph.entries, h_entries); + + // Copy the numerical values + + typename values_type::HostMirror h_crs_values = + Kokkos::create_mirror_view(crs_mtx.values); + Kokkos::deep_copy(h_crs_values, crs_mtx.values); + + typename values_type::HostMirror h_values = + Kokkos::create_mirror_view(values); + if (h_values.extent(0) < size_t(numBlocks * blockDim_ * blockDim_)) { + Kokkos::resize(h_values, numBlocks * blockDim_ * blockDim_); + Kokkos::resize(values, numBlocks * blockDim_ * blockDim_); + } + Kokkos::deep_copy(h_values, 0); + + for (OrdinalType ir = 0; ir < crs_mtx.numRows(); ++ir) { + const auto iblock = ir / blockDim_; + const auto ilocal = ir % blockDim_; + for (auto jk = h_crs_row_map(ir); jk < h_crs_row_map(ir + 1); 
++jk) { + const auto jc = h_crs_entries(jk); + const auto jblock = jc / blockDim_; + const auto jlocal = jc % blockDim_; + for (auto jkb = h_row_map(iblock); jkb < h_row_map(iblock + 1); ++jkb) { + if (h_entries(jkb) == jblock) { + OrdinalType shift = jkb * blockDim_ * blockDim_; + h_values(shift + ilocal * blockDim_ + jlocal) = h_crs_values(jk); + break; + } + } + } + } + Kokkos::deep_copy(values, h_values); + } + + /// \brief Given an array of blocks, sum the values into corresponding + /// block in BsrMatrix + /// \param rowi [in] is a block-row index + /// \param cols[] [in] are block colidxs within the block-row to be summed + /// into ncol entries + /// \param ncol [in] is number of blocks referenced in cols[] array + /// \param vals[] [in] array containing 'block' of values + /// ncol*block_size*block_size entries + /// assume vals block is provided in 'LayoutRight' or 'Row Major' + /// format, that is e.g. 2x2 block [ a b ; c d ] provided as flattened + /// 1d array as [a b c d] Assume that each block is stored contiguously + /// in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i + /// in [0, ncols) for cols[] maps to i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] + KOKKOS_INLINE_FUNCTION + OrdinalType sumIntoValues(const OrdinalType rowi, const OrdinalType cols[], + const OrdinalType ncol, const ScalarType vals[], + const bool is_sorted = false, + const bool force_atomic = false) const { + return operateValues(BsrMatrix::valueOperation::ADD, rowi, cols, ncol, vals, + is_sorted, force_atomic); + } + + /// \brief Given an array of blocks, replace the values of corresponding + /// blocks in BsrMatrix + /// \param rowi [in] is a block-row index + /// \param cols[] [in] are block colidxs within the block-row to be summed + /// into ncol entries + /// \param ncol [in] is number of blocks referenced in cols[] array + /// \param vals[] [in] array containing 'block' of values + /// 
ncol*block_size*block_size entries + /// assume vals block is provided in 'LayoutRight' or 'Row + /// Major' format, that is e.g. 2x2 block [ a b ; c d ] provided + /// as flattened 1d array as [a b c d] Assume that each block is + /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c + /// d e f g h] If so, then i in [0, ncols) for cols[] maps to + /// i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] + KOKKOS_INLINE_FUNCTION + OrdinalType replaceValues(const OrdinalType rowi, const OrdinalType cols[], + const OrdinalType ncol, const ScalarType vals[], + const bool is_sorted = false, + const bool force_atomic = false) const { + return operateValues(BsrMatrix::valueOperation::ASSIGN, rowi, cols, ncol, + vals, is_sorted, force_atomic); + } + + //! Attempt to assign the input matrix to \c *this. + // Are the CUDA sparse handles needed to be copied here?? + template + BsrMatrix& operator=(const BsrMatrix& mtx) { + numCols_ = mtx.numCols(); + blockDim_ = mtx.blockDim(); + graph = mtx.graph; + values = mtx.values; + dev_config = mtx.dev_config; + return *this; + } + + //! The number of rows in the sparse matrix. + KOKKOS_INLINE_FUNCTION ordinal_type numRows() const { + return graph.numRows(); + } + + //! The number of columns in the sparse matrix. + KOKKOS_INLINE_FUNCTION ordinal_type numCols() const { return numCols_; } + + //! The block dimension in the sparse block matrix. + KOKKOS_INLINE_FUNCTION ordinal_type blockDim() const { return blockDim_; } + + //! The number of "point" (non-block) rows in the matrix. + // This is the dimension of the range of this matrix as a linear operator. + KOKKOS_INLINE_FUNCTION ordinal_type numPointRows() const { + return numRows() * blockDim(); + } + + //! The number of "point" (non-block) columns in the matrix. + // This is the dimension of the domain of this matrix as a linear operator. 
+ KOKKOS_INLINE_FUNCTION ordinal_type numPointCols() const { + return numCols() * blockDim(); + } + + //! The number of stored entries in the sparse matrix. + KOKKOS_INLINE_FUNCTION size_type nnz() const { + return graph.entries.extent(0); + } + + friend struct BsrRowView; + + /// \brief Return a BsrRowView of block-row i of the matrix. + /// + /// If row i does not belong to the matrix, return an empty view. + /// + /// The returned object \c view implements the following interface: + ///
    + ///
  • \c view.length is the number of entries (i.e. blocks) + /// in the block row
  • + ///
  • \c view.local_row_in_block_row(K, i) returns a nonconst pointer + /// to the values of the ith local row in the k-th block of the block-row + ///
  • + ///
  • \c view.full_row_in_block_row(i) returns a nonconst pointer + /// to the values of the ith local row of the block-row
  • + ///
  • \c view.local_block_value(K, i, j) returns a nonconst reference + /// to the value in the ith local row and jth local col + /// of the k-th block of the block-row
  • + ///
  • \c view.block(K) returns an unmanaged 2D strided Kokkos::View + /// of the values of the k-th block of the block-row
  • + ///
+ /// + /// Users should not rely on the return type of this method. They + /// should instead assign to 'auto'. + /// + KOKKOS_INLINE_FUNCTION + BsrRowView block_row(const ordinal_type i) const { + const size_type start = + graph.row_map(i); // total num blocks prior to this block-row + const auto count = static_cast( + graph.row_map(i + 1) - start); // num blocks in this row + + if (count == 0) { + return BsrRowView(nullptr, nullptr, 1, 0); + } else { + return BsrRowView(values, graph.entries, blockDim(), count, + start); + } + } + + /// \brief Return a BsrRowViewConst of block-row i of the matrix. + /// + /// If row i does not belong to the matrix, return an empty view. + /// + /// The returned object \c view implements the following interface: + ///
    + ///
  • \c view.length is the number of entries (i.e. blocks) + /// in the block row
  • + ///
  • \c view.local_row_in_block_row(K, i) returns a nonconst pointer + /// to the values of the ith local row in the k-th block of the block-row + ///
  • + ///
  • \c view.full_row_in_block_row(i) returns a nonconst pointer + /// to the values of the ith local row of the block-row
  • + ///
  • \c view.local_block_value(K, i, j) returns a nonconst reference + /// to the value in the ith local row and jth local col + /// of the k-th block of the block-row
  • + ///
  • \c view.block(K) returns an unmanaged 2D strided Kokkos::View + /// of the values of the k-th block of the block-row
  • + ///
+ /// + /// Users should not rely on the return type of this method. They + /// should instead assign to 'auto'. + /// + KOKKOS_INLINE_FUNCTION + BsrRowViewConst block_row_Const(const ordinal_type i) const { + const size_type start = + graph.row_map(i); // total num blocks prior to this block-row + const auto count = static_cast( + graph.row_map(i + 1) - start); // num blocks in this row + + if (count == 0) { + return BsrRowViewConst(nullptr, nullptr, 1, 0); + } else { + return BsrRowViewConst(values, graph.entries, blockDim(), + count, start); + } + } + + protected: + enum class valueOperation { ADD, ASSIGN }; + + /// \brief Given an array of blocks, operate on the values of corresponding + /// blocks in BsrMatrix + /// \param op + /// \param rowi [in] is a block-row index + /// \param ncol [in] is number of blocks referenced in cols[] array + /// \param cols[] [in] are block colidxs within the block-row to be op-ed + /// into ncol entries + /// \param vals[] [in] array containing 'block' of values + /// ncol*block_size*block_size entries + /// assume vals block is provided in 'LayoutRight' or 'Row + /// Major' format, that is e.g. 
2x2 block [ a b ; c d ] provided + /// as flattened 1d array as [a b c d] Assume that each block is + /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c + /// d e f g h] If so, then i in [0, ncols) for cols[] maps to + /// i*block_size*block_size in vals[] + /// \param is_sorted [in] + /// \param force_atomic [in] + KOKKOS_INLINE_FUNCTION + OrdinalType operateValues(const BsrMatrix::valueOperation op, + const OrdinalType rowi, const OrdinalType cols[], + const OrdinalType ncol, const ScalarType vals[], + const bool is_sorted = false, + const bool force_atomic = false) const { + BsrRowView row_view = this->block_row(rowi); + const ordinal_type block_size = this->blockDim(); + + ordinal_type numValid = 0; // number of valid local column indices + + for (ordinal_type i = 0; i < ncol; ++i) { + // Find offset into values for block-row rowi and colidx cols[i] + // cols[i] is the index to match + // blk_offset is the offset for block colidx from bptr[rowi] to bptr[rowi + // + 1] (not global offset) colidx_ and values_ are already offset to the + // beginning of blockrow rowi + auto blk_offset = row_view.findRelBlockOffset(cols[i], is_sorted); + if (blk_offset != Kokkos::ArithTraits::max()) { + ordinal_type offset_into_vals = + i * block_size * + block_size; // stride == 1 assumed between elements + for (ordinal_type lrow = 0; lrow < block_size; ++lrow) { + auto local_row_values = row_view.local_row_in_block( + blk_offset, lrow); // pointer to start of specified local row + // within this block + switch (op) { + case BsrMatrix::valueOperation::ADD: { + for (ordinal_type lcol = 0; lcol < block_size; ++lcol) { + if (force_atomic) { + Kokkos::atomic_add( + &(local_row_values[lcol]), + vals[offset_into_vals + lrow * block_size + lcol]); + } else { + local_row_values[lcol] += + vals[offset_into_vals + lrow * block_size + lcol]; + } + } + break; + } + case BsrMatrix::valueOperation::ASSIGN: { + for (ordinal_type lcol = 0; lcol < block_size; ++lcol) { + if 
(force_atomic) { + Kokkos::atomic_assign( + &(local_row_values[lcol]), + vals[offset_into_vals + lrow * block_size + lcol]); + } else { + local_row_values[lcol] = + vals[offset_into_vals + lrow * block_size + lcol]; + } + } + break; + } + } + } + ++numValid; + } + } // end for ncol + return numValid; + } + + private: + ordinal_type numCols_ = 0; + ordinal_type blockDim_ = 1; // TODO Assuming square blocks for now +}; + +//---------------------------------------------------------------------------- +/// \class is_bsr_matrix +/// \brief is_bsr_matrix::value is true if T is a BsrMatrix<...>, false +/// otherwise +template +struct is_bsr_matrix : public std::false_type {}; +template +struct is_bsr_matrix> : public std::true_type {}; +template +struct is_bsr_matrix> : public std::true_type {}; +//---------------------------------------------------------------------------- + +} // namespace Experimental +} // namespace KokkosSparse +#endif diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index d6ffa5f46c..3115bc9649 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -278,6 +278,23 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { using crsMat_t = CrsMatrix; using AT = Kokkos::ArithTraits; + // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads != nullptr) { + int num_threads = std::atoi(env_omp_num_threads); + if (num_threads < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: omp_num_threads = " << num_threads << std::endl; + } + } + } +#endif + if (!run_streams_test) + return; + const size_type nrows = 9; const size_type nnz = 21; diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp 
b/sparse/unit_test/Test_Sparse_sptrsv.hpp index ec929a064b..2425fb4c27 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -1064,6 +1064,23 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using crsMat_t = CrsMatrix; + // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); + if (env_omp_num_threads != nullptr) { + int num_threads = std::atoi(env_omp_num_threads); + if (num_threads < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: omp_num_threads = " << num_threads << std::endl; + } + } + } +#endif + if (!run_streams_test) + return; + scalar_t ZERO = scalar_t(0); scalar_t ONE = scalar_t(1); From 6a71179ab2b3c20a55ac45a296b647376c04c565 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 12 Jun 2023 12:46:10 -0700 Subject: [PATCH 428/442] Restore orig. KokkosSparse_BsrMatrix.hpp --- sparse/src/KokkosSparse_BsrMatrix.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index f073c93336..b077215635 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -640,11 +640,7 @@ class BsrMatrix { /// data in each row). /// \param cols [in/out] The column indices. /// \param blockDimIn [in] The block dimensions. 
- //BsrMatrix(/*[[maybe_unused]]*/ const std::string& label, const OrdinalType nrows, - // const OrdinalType ncols, /*[[maybe_unused]]*/ const size_type annz, - // const values_type& vals, const row_map_type& rows, - // const index_type& cols, const OrdinalType blockDimIn) - BsrMatrix(const std::string& label [[maybe_unused]], const OrdinalType nrows, + BsrMatrix([[maybe_unused]] const std::string& label, const OrdinalType nrows, const OrdinalType ncols, [[maybe_unused]] const size_type annz, const values_type& vals, const row_map_type& rows, const index_type& cols, const OrdinalType blockDimIn) From ba75b4b58407caaced8781bbb32b1f15465b3056 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 12 Jun 2023 12:47:02 -0700 Subject: [PATCH 429/442] Remove redundant file --- sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG | 1085 -------------------- 1 file changed, 1085 deletions(-) delete mode 100644 sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG b/sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG deleted file mode 100644 index b077215635..0000000000 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp_ORIG +++ /dev/null @@ -1,1085 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/// \file KokkosSparse_BsrMatrix.hpp -/// \brief Local sparse matrix interface -/// -/// This file provides KokkosSparse::Experimental::BsrMatrix. -/// This implements a local (no MPI) sparse matrix stored in block-by-block -/// compressed row sparse format. 
- -#ifndef KOKKOS_SPARSE_BSRMATRIX_HPP_ -#define KOKKOS_SPARSE_BSRMATRIX_HPP_ - -#include -#include -#include -#include - -#include "Kokkos_Core.hpp" -#include "Kokkos_StaticCrsGraph.hpp" -#include "Kokkos_ArithTraits.hpp" -#include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_Error.hpp" - -namespace KokkosSparse { - -namespace Experimental { - -template -struct BsrRowView { - //! The type of the values in the row. - typedef typename MatrixType::value_type value_type; - //! The type of the column indices in the row. - typedef typename MatrixType::ordinal_type ordinal_type; - //! The type for returned block of values. - typedef Kokkos::View - block_values_type; - - private: - //! Array of values in the row. - value_type* values_; - //! Array of (local) column indices in the row. - ordinal_type* colidx_; - /// \brief Stride between successive rows in a block. - /// - const ordinal_type blockDim_; - - public: - /// \brief Constructor - /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param blockDim [in] (Constant) stride between block rows - /// within a block-row in the above arrays. - /// \param count [in] Number of blocks in the desired block-row. - // - // Assumes values and colidx already offset to the correct location - KOKKOS_INLINE_FUNCTION - BsrRowView(value_type* const values, ordinal_type* const colidx, - const ordinal_type& blockDim, const ordinal_type& count) - : values_(values), colidx_(colidx), blockDim_(blockDim), length(count) {} - - /// \brief Constructor with offset into \c colidx array - /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param blockDim [in] (Constant) stride between rows in - /// within a block in the above arrays. - /// \param count [in] Number of blocks in the desired block-row - /// \param start [in] Offset into values and colidx of the desired block-row - /// start. 
- /// Note: The offset into the values array for a block-row equals - /// num_blocks_prior_to_block-row*blockDim*blockDim - /// - /// \tparam OffsetType The type of \c start (see above). Must be a - /// built-in integer type. This may differ from ordinal_type. - /// For example, the matrix may have dimensions that fit in int, - /// but a number of entries that does not fit in int. - template - KOKKOS_INLINE_FUNCTION BsrRowView( - const typename MatrixType::values_type& values, - const typename MatrixType::index_type& colidx, - const ordinal_type& blockDim, const ordinal_type& count, - const OffsetType& start, - const typename std::enable_if::value, - int>::type& = 0) - : values_(&values(start * blockDim * blockDim)), - colidx_(&colidx(start)), - blockDim_(blockDim), - length(count) {} - - /// \brief Number of entries (i.e. blocks) in the row. - /// - /// This is a public const field rather than a public const method, - /// in order to avoid possible overhead of a method call if the - /// compiler is unable to inline that method call. - /// - /// We assume that rows contain no duplicate entries (i.e., entries - /// with the same column index). Thus, a row may have up to - /// A.numCols() entries. This means that the correct type of - /// 'length' is ordinal_type. 
- /// Here, length refers to the number of blocks in a block-row - const ordinal_type length; - - /// /brief Return a pointer offset to local row i of block K of values_ array; - /// user responsible for indexing into this pointer correctly - /// \param K [in] must be the LOCAL block index within this block-row - /// \param i [in] must be the LOCAL row index offset within this block-row - /// - /// Output: pointer to values_ array at start of local row within block K - /// - /// Pointer interfaces are NOT guaranteed for backward compatibility - /// This interface is intended for performant kernels, not common usage - KOKKOS_INLINE_FUNCTION - value_type* local_row_in_block(const ordinal_type& K, - const ordinal_type& i) const { - return (values_ + (K * blockDim_ * blockDim_ + i * blockDim_)); - } - - /// \brief Return the value at a specified block K of block-row - /// with local row and col offset (i,j) - /// \param K [in] must be the LOCAL block index within this block-row - /// \param i [in] must be the LOCAL row index offset within this block-row - /// \param j [in] must be the LOCAL col index offset within this block-row - /// - /// Output: reference to value_type at the given (K, i, j) offset into values_ - KOKKOS_INLINE_FUNCTION - value_type& local_block_value(const ordinal_type& K, const ordinal_type& i, - const ordinal_type& j) const { - return values_[K * blockDim_ * blockDim_ + i * blockDim_ + j]; - } - - /// \brief Return unmanaged 2D strided View wrapping local block K from this - /// block-row - /// \param K [in] must be the LOCAL block index within this - /// block-row - KOKKOS_INLINE_FUNCTION - block_values_type block(const ordinal_type& K) const { - return block_values_type(&(values_[K * blockDim_ * blockDim_]), - Kokkos::LayoutRight(blockDim_, blockDim_)); - } - - /// \brief Return offset into colidx_ for the requested block idx - /// If none found, return Kokkos::ArithTraits::max - /// \param idx_to_match [in] local block idx within block-row - 
KOKKOS_INLINE_FUNCTION - ordinal_type findRelBlockOffset(const ordinal_type idx_to_match, - bool /*is_sorted*/ = false) const { - ordinal_type offset = Kokkos::ArithTraits::max(); - for (ordinal_type blk_offset = 0; blk_offset < length; ++blk_offset) { - ordinal_type idx = colidx_[blk_offset]; - if (idx == idx_to_match) { - offset = blk_offset; - break; - } // return relative offset - } - return offset; - } -}; - -template -struct BsrRowViewConst { - //! The type of the values in the row. - typedef const typename MatrixType::non_const_value_type value_type; - //! The type of the column indices in the row. - typedef const typename MatrixType::non_const_ordinal_type ordinal_type; - //! The type for returned block of values. - typedef Kokkos::View - block_values_type; - - private: - //! Array of values in the row. - value_type* values_; - //! Array of (local) column indices in the row. - ordinal_type* colidx_; - /// \brief Stride between successive rows in a block - const ordinal_type blockDim_; - - public: - /// \brief Constructor - /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param blockDim [in] (Constant) stride between block rows - /// within a block-row in the above arrays. - /// \param count [in] Number of entries in the row. - // - // Assumes values and colidx already offset to the correct location - KOKKOS_INLINE_FUNCTION - BsrRowViewConst(value_type* const values, ordinal_type* const colidx, - const ordinal_type& blockDim, const ordinal_type& count) - : values_(values), colidx_(colidx), blockDim_(blockDim), length(count) {} - - /// \brief Constructor with offset into \c colidx array - /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param blockDim [in] The block dimensions. - /// \param count [in] Number of entries in the row. 
- /// \param start [in] Offset into values and colidx of the desired block-row - /// start. - /// Note: The offset into the values array for a block-row equals - /// num_blocks_prior_to_block-row*blockDim*blockDim - /// - /// \tparam OffsetType The type of \c start (see above). Must be a - /// built-in integer type. This may differ from ordinal_type. - /// For example, the matrix may have dimensions that fit in int, - /// but a number of entries that does not fit in int. - template - KOKKOS_INLINE_FUNCTION BsrRowViewConst( - const typename MatrixType::values_type& values, - const typename MatrixType::index_type& colidx, - const ordinal_type& blockDim, const ordinal_type& count, - const OffsetType& start, - const typename std::enable_if::value, - int>::type& = 0) - : values_(&values(start * blockDim * blockDim)), - colidx_(&colidx(start)), - blockDim_(blockDim), - length(count) {} - - /// \brief Number of entries (i.e. blocks) in the row. - /// - /// This is a public const field rather than a public const method, - /// in order to avoid possible overhead of a method call if the - /// compiler is unable to inline that method call. - /// - /// We assume that rows contain no duplicate entries (i.e., entries - /// with the same column index). Thus, a row may have up to - /// A.numCols() entries. This means that the correct type of - /// 'length' is ordinal_type. 
- const ordinal_type length; - - /// /brief Return a pointer offset to local row i of block K of values_ array; - /// user responsible for indexing into this pointer correctly - /// \param K [in] must be the LOCAL block index within this block-row - /// \param i [in] must be the LOCAL row index offset within this block-row - /// - /// Output: pointer to values_ array at start of local row within block K - /// - /// Pointer interfaces are NOT guaranteed for backward compatibility - /// This interface is intended for performant kernels, not common usage - KOKKOS_INLINE_FUNCTION - value_type* local_row_in_block(const ordinal_type& K, - const ordinal_type& i) const { - return (values_ + (K * blockDim_ * blockDim_ + i * blockDim_)); - } - - /// \brief Return the value at a specified block K with local row and col ids - /// (i,j) \param K [in] must be the LOCAL block index within this block-row - /// \param i [in] must be the LOCAL row index offset within this block-row - /// \param j [in] must be the LOCAL col index offset within this block-row - /// - /// Output: reference to value_type at the given (K, i, j) offset into values_ - KOKKOS_INLINE_FUNCTION - value_type& local_block_value(const ordinal_type& K, const ordinal_type& i, - const ordinal_type& j) const { - return values_[K * blockDim_ * blockDim_ + i * blockDim_ + j]; - } - - /// \brief Return the block column index for a specified block K - /// - /// \param K [in] must be the LOCAL block index within this block-row - /// \return Block column index for "uncompressed" block row - KOKKOS_INLINE_FUNCTION - ordinal_type block_colidx(const ordinal_type K) const { return colidx_[K]; } - - /// \brief Return unmanaged 2D strided View wrapping local block K from this - /// block-row \param K [in] must be the LOCAL block index within this - /// block-row - KOKKOS_INLINE_FUNCTION - block_values_type block(const ordinal_type& K) const { - return block_values_type(&(values_[K * blockDim_ * blockDim_]), - 
Kokkos::LayoutRight(blockDim_, blockDim_)); - } - - /// \brief Return offset into colidx_ for the requested block idx - /// If none found, return Kokkos::ArithTraits::max - /// \param idx_to_match [in] local block idx within block-row - KOKKOS_INLINE_FUNCTION - ordinal_type findRelBlockOffset(const ordinal_type& idx_to_match, - bool /*is_sorted*/ = false) const { - typedef typename std::remove_cv::type non_const_ordinal_type; - non_const_ordinal_type offset = - Kokkos::ArithTraits::max(); - for (non_const_ordinal_type blk_offset = 0; blk_offset < length; - ++blk_offset) { - ordinal_type idx = colidx_[blk_offset]; - if (idx == idx_to_match) { - offset = blk_offset; - break; - } // return relative offset - } - return offset; - } -}; - -/// \class BsrMatrix -/// \brief Compressed sparse row implementation of a sparse matrix. -/// \tparam ScalarType The type of entries in the sparse matrix. -/// \tparam OrdinalType The type of column indices in the sparse matrix. -/// \tparam Device The Kokkos Device type. -/// \tparam MemoryTraits Traits describing how Kokkos manages and -/// accesses data. The default parameter suffices for most users. -/// -/// "Crs" stands for "compressed row sparse." This is the phrase -/// Trilinos traditionally uses to describe compressed sparse row -/// storage for sparse matrices, as described, for example, in Saad -/// (2nd ed.). -template ::size_type> -class BsrMatrix { - static_assert( - std::is_signed::value, - "BsrMatrix requires that OrdinalType is a signed integer type."); - static_assert(Kokkos::is_memory_traits_v || - std::is_void_v, - "BsrMatrix: MemoryTraits (4th template param) must be a Kokkos " - "MemoryTraits or void"); - - private: - typedef - typename Kokkos::ViewTraits::host_mirror_space host_mirror_space; - - public: - //! Type of the matrix's execution space. - typedef typename Device::execution_space execution_space; - //! Type of the matrix's memory space. - typedef typename Device::memory_space memory_space; - //! 
Type of the matrix's device type. - typedef Kokkos::Device device_type; - - //! Type of each value in the matrix. - typedef ScalarType value_type; - //! Type of each (column) index in the matrix. - typedef OrdinalType ordinal_type; - typedef MemoryTraits memory_traits; - /// \brief Type of each entry of the "row map." - /// - /// The "row map" corresponds to the \c ptr array of row offsets in - /// compressed sparse row (CSR) storage. - typedef SizeType size_type; - - //! Type of a host-memory mirror of the sparse matrix. - typedef BsrMatrix - HostMirror; - //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph - StaticCrsGraphType; - //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph - staticcrsgraph_type; - //! Type of column indices in the sparse matrix. - typedef typename staticcrsgraph_type::entries_type index_type; - //! Const version of the type of column indices in the sparse matrix. - typedef typename index_type::const_value_type const_ordinal_type; - //! Nonconst version of the type of column indices in the sparse matrix. - typedef typename index_type::non_const_value_type non_const_ordinal_type; - //! Type of the "row map" (which contains the offset for each row's data). - typedef typename staticcrsgraph_type::row_map_type row_map_type; - //! Const version of the type of row offsets in the sparse matrix. - typedef typename row_map_type::const_value_type const_size_type; - //! Nonconst version of the type of row offsets in the sparse matrix. - typedef typename row_map_type::non_const_value_type non_const_size_type; - //! Kokkos Array type of the entries (values) in the sparse matrix. - typedef Kokkos::View - values_type; - //! Const version of the type of the entries in the sparse matrix. - typedef typename values_type::const_value_type const_value_type; - //! Nonconst version of the type of the entries in the sparse matrix. 
- typedef typename values_type::non_const_value_type non_const_value_type; - - // block values are actually a 1-D view, however they are implicitly - // arranged in LayoutRight, e.g. consecutive entries in the values view - // are consecutive entries within a row inside a block - using block_layout = Kokkos::LayoutRight; - - /// \name Storage of the actual sparsity structure and values. - /// - /// BsrMatrix uses the compressed sparse row (CSR) storage format to - /// store the sparse matrix. CSR is also called "compressed row - /// storage"; hence the name, which it inherits from Tpetra and from - /// Epetra before it. - //@{ - //! The graph (sparsity structure) of the sparse matrix. - staticcrsgraph_type graph; - //! The 1-D array of values of the sparse matrix. - values_type values; - //@} - - /// \brief Launch configuration that can be used by - /// overloads/specializations of MV_multiply(). - /// - /// This is a hack and needs to be replaced by a general - /// state mechanism. - DeviceConfig dev_config; - - /// \brief Default constructor; constructs an empty sparse matrix. - /// - /// mfh: numCols and nnz should be properties of the graph, not the matrix. - /// Then BsrMatrix needs methods to get these from the graph. - BsrMatrix() : graph(), values(), dev_config(), numCols_(0), blockDim_(1) {} - - //! Copy constructor (shallow copy). - template - explicit BsrMatrix(const BsrMatrix& B) - : graph(B.graph.entries, B.graph.row_map), - values(B.values), - dev_config(B.dev_config), - numCols_(B.numCols()), - blockDim_(B.blockDim()) { - graph.row_block_offsets = B.graph.row_block_offsets; - // MD: Changed the copy constructor of graph - // as the constructor of StaticCrsGraph does not allow copy from non const - // version. - } - - /// \brief Construct with a graph that will be shared. - /// - /// \param[in] arg_label The sparse matrix's label. - /// \param[in] arg_graph The graph between the blocks. - /// \param[in] blockDimIn The block size. 
- /// - /// Allocate the values array for subsequent fill. - BsrMatrix(const std::string& arg_label, const staticcrsgraph_type& arg_graph, - const OrdinalType& blockDimIn) - : graph(arg_graph), - values(arg_label, - arg_graph.entries.extent(0) * blockDimIn * blockDimIn), - numCols_(maximum_entry(arg_graph) + 1), - blockDim_(blockDimIn) { - if (blockDim_ < 1) { - std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - - /// \brief Construct BsrMatrix from host data in COO format. - /// - /// The COO matrix must already have a block structure. - /// Each entry k of the input sparse matrix has a value stored in val[k], - /// row index in rows[k] and column index in cols[k]. - /// The COO data must be sorted by increasing row index - /// - /// This constructor is mainly useful for benchmarking or for - /// reading the sparse matrix's data from a file. - /// - /// \param label [in] The sparse matrix's label. - /// \param nrows [in] The number of rows. - /// \param ncols [in] The number of columns. - /// \param annz [in] The number of entries. - /// \param vals [in] The entries. - /// \param rows [in] The row indices. rows[k] is the row index of - /// val[k]. - /// \param cols [in] The column indices. cols[k] is the column - /// index of val[k]. - /// \param blockdim [in] The block size of the constructed BsrMatrix. - /// \param pad [in] If true, pad the sparse matrix's storage with - /// zeros in order to improve cache alignment and / or - /// vectorization. - /// - /// The \c pad argument is currently not used. 
- BsrMatrix(const std::string& label, OrdinalType nrows, OrdinalType ncols, - size_type annz, ScalarType* vals, OrdinalType* rows, - OrdinalType* cols, OrdinalType blockdim, bool pad = false) { - (void)label; - (void)pad; - blockDim_ = blockdim; - - if (blockDim_ < 1) { - std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - if (ncols % blockDim_) { - std::ostringstream os; - os << "BsrMatrix: " << ncols - << " input CrsMatrix columns is not a multiple of block size " - << blockDim_; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - if (nrows % blockDim_) { - std::ostringstream os; - os << "BsrMatrix: " << nrows - << " input CrsMatrix rows is not a multiple of block size " - << blockDim_; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - if (annz % (blockDim_ * blockDim_)) { - throw std::runtime_error( - "BsrMatrix:: annz should be a multiple of the number of entries in a " - "block"); - } - - using Coord = std::pair; // row, col - using CoordComp = std::function; // type that can order Coords - using Entry = std::pair; // (row, col), val - using Blocks = std::map, - CoordComp>; // map a block to its non-zeros, sorted - // by row, then col - - numCols_ = ncols / blockDim_; - ordinal_type numRows = nrows / blockDim_; - size_type numBlocks = annz / (blockDim_ * blockDim_); - - // device data - typename row_map_type::non_const_type row_map_device( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_device"), - numRows + 1); - index_type entries_device("entries_device", numBlocks); - Kokkos::resize(values, annz); - - // mirror views on host - auto row_map_host = Kokkos::create_mirror_view(row_map_device); - auto entries_host = Kokkos::create_mirror_view(entries_device); - auto values_host = Kokkos::create_mirror_view(values); - - auto coord_by_row_col = [](const Coord& a, const Coord& b) { - const auto& arow = std::get<0>(a); 
- const auto& brow = std::get<0>(b); - const auto& acol = std::get<1>(a); - const auto& bcol = std::get<1>(b); - if (arow < brow) { - return true; - } else if (arow > brow) { - return false; - } else { - return acol < bcol; - } - }; - - auto entry_by_row_col = [coord_by_row_col](const Entry& a, const Entry& b) { - return coord_by_row_col(std::get<0>(a), std::get<0>(b)); - }; - - // organize all blocks and their entries - Blocks blocks(coord_by_row_col); - for (size_type i = 0; i < annz; ++i) { - const ordinal_type row = rows[i]; - const ordinal_type col = cols[i]; - const ScalarType val = vals[i]; - const Coord block = Coord(row / blockDim_, col / blockDim_); - const Entry entry(Coord(row, col), val); - - // add entry to the correct block - auto it = blocks.find(block); - if (it == blocks.end()) { - std::vector entries = {entry}; - entries.reserve(blockDim_ * blockDim_); - blocks[block] = std::move(entries); // new block with entry - } else { - it->second.push_back(entry); // add entry to block - } - } - - // write block data out to BSR format - ordinal_type row = 0; // current row we're in - size_t bi = 0; // how many blocks so far - for (auto& kv : blocks) { // iterating through blocks in row/col order - const Coord& block = kv.first; // block's position - auto& entries = kv.second; // non-zeros in the block - - if (OrdinalType(entries.size()) != blockDim_ * blockDim_) { - std::stringstream ss; - ss << "BsrMatrix: block " << block.first << "," << block.second - << " had only " << entries.size() << " non-zeros, expected " - << blockDim_ * blockDim_; - KokkosKernels::Impl::throw_runtime_exception(ss.str()); - } - - // update row-map if block is in a new row - for (; row < block.first; ++row) { - row_map_host(row + 1) = bi; // `row` ends at bi - } - - // record column of block - entries_host(bi) = block.second; // block's column - - // add contiguous entries of block sorted by row/col - std::sort(entries.begin(), entries.end(), entry_by_row_col); - for (size_type ei 
= 0; ei < size_type(entries.size()); ++ei) { - values_host(bi * blockDim_ * blockDim_ + ei) = std::get<1>(entries[ei]); - } - - // next block - ++bi; - } - // complete row map if last blocks are empty - for (; row < numRows + 1; ++row) { - row_map_host(row) = bi; - } - - // move graph data to the requested device - Kokkos::deep_copy(row_map_device, row_map_host); - Kokkos::deep_copy(entries_device, entries_host); - Kokkos::deep_copy(values, values_host); - - graph = staticcrsgraph_type(entries_device, row_map_device); - } - - /// \brief Constructor that accepts a row map, column indices, and - /// values. - /// - /// The matrix will store and use the row map, indices, and values - /// directly (by view, not by deep copy). - /// - /// \param label - /// \param nrows [in] The number of rows. - /// \param ncols [in] The number of columns. - /// \param annz [in] Filler for annz. - /// \param vals [in/out] The entries. - /// \param rows [in/out] The row map (containing the offsets to the - /// data in each row). - /// \param cols [in/out] The column indices. - /// \param blockDimIn [in] The block dimensions. - BsrMatrix([[maybe_unused]] const std::string& label, const OrdinalType nrows, - const OrdinalType ncols, [[maybe_unused]] const size_type annz, - const values_type& vals, const row_map_type& rows, - const index_type& cols, const OrdinalType blockDimIn) - : graph(cols, rows), - values(vals), - numCols_(ncols), - blockDim_(blockDimIn) { - if (blockDim_ < 1) { - std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - const ordinal_type actualNumRows = - (rows.extent(0) != 0) ? 
static_cast( - rows.extent(0) - static_cast(1)) - : static_cast(0); - if (nrows != actualNumRows) { - std::ostringstream os; - os << "Input argument nrows = " << nrows - << " != the actual number of " - "rows " - << actualNumRows << " according to the 'rows' input argument."; - throw std::invalid_argument(os.str()); - } - // nnz returns graph.entries.extent(0) i.e. ptr[ nrows + 1 ] nnz entry - // input annz is nnz of values, not comparable with block ptr 'nnz' i.e. - // numBlocks - if (blockDim_ <= 0) { - std::ostringstream os; - os << "Input argument blockDim = " << blockDim_ - << " is not larger than 0."; - throw std::invalid_argument(os.str()); - } - } - - /// \brief Constructor that accepts a a static graph, and values. - /// - /// The matrix will store and use the row map, indices, and values - /// directly (by view, not by deep copy). - /// - /// \param ncols [in] The number of columns. - /// \param vals [in] The entries. - /// \param graph_ [in] The graph between the blocks. - /// \param blockDimIn [in] The block dimensions. - BsrMatrix(const std::string& /*label*/, const OrdinalType& ncols, - const values_type& vals, const staticcrsgraph_type& graph_, - const OrdinalType& blockDimIn) - : graph(graph_), values(vals), numCols_(ncols), blockDim_(blockDimIn) { - if (blockDim_ < 1) { - std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - - /// \brief Constructor that accepts a CrsMatrix and block dimension, - /// assuming the provided CrsMatrix has appropriate block structure. 
- template - BsrMatrix(const KokkosSparse::CrsMatrix& - crs_mtx, - const OrdinalType blockDimIn) { - typedef typename KokkosSparse::CrsMatrix - crs_matrix_type; - typedef typename crs_matrix_type::staticcrsgraph_type crs_graph_type; - typedef typename crs_graph_type::entries_type crs_graph_entries_type; - typedef typename crs_graph_type::row_map_type crs_graph_row_map_type; - - blockDim_ = blockDimIn; - if (blockDim_ < 1) { - std::ostringstream os; - os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - - assert( - (crs_mtx.numCols() % blockDim_ == 0) && - "BsrMatrix: input CrsMatrix columns is not a multiple of block size"); - assert((crs_mtx.numRows() % blockDim_ == 0) && - "BsrMatrix: input CrsMatrix rows is not a multiple of block size"); - - numCols_ = crs_mtx.numCols() / blockDim_; - - OrdinalType nbrows = - crs_mtx.numRows() / - blockDim_; // actual number of block rows; add 1 for ptr length - - // block_rows will accumulate the number of blocks per row - this is NOT the - // row_map with cum sum!! - std::vector block_rows(nbrows, 0); - - typename crs_graph_row_map_type::HostMirror h_crs_row_map = - Kokkos::create_mirror_view(crs_mtx.graph.row_map); - Kokkos::deep_copy(h_crs_row_map, crs_mtx.graph.row_map); - typename crs_graph_entries_type::HostMirror h_crs_entries = - Kokkos::create_mirror_view(crs_mtx.graph.entries); - Kokkos::deep_copy(h_crs_entries, crs_mtx.graph.entries); - - // determine size of block cols indices == number of blocks, - // i.e. 
nnz for the block CRS graph - OrdinalType numBlocks = 0; - for (OrdinalType i = 0; i < crs_mtx.numRows(); i += blockDim_) { - std::set col_set; - for (auto ie = h_crs_row_map(i); ie < h_crs_row_map(i + blockDim_); - ++ie) { - col_set.insert(h_crs_entries(ie) / blockDim_); - } - numBlocks += col_set.size(); // cum sum - block_rows[i / blockDim_] = col_set.size(); // frequency counts - } - - // create_staticcrsgraph takes the frequency of blocks per row - // and returns the cum sum pointer row_map with nbrows+1 size, and total - // numBlocks in the final entry - graph = Kokkos::create_staticcrsgraph("blockgraph", - block_rows); - typename row_map_type::HostMirror h_row_map = - Kokkos::create_mirror_view(graph.row_map); - Kokkos::deep_copy(h_row_map, graph.row_map); - - typename index_type::HostMirror h_entries = - Kokkos::create_mirror_view(graph.entries); - - OrdinalType ientry = 0; - for (OrdinalType ib = 0; ib < nbrows; ++ib) { - auto ir_start = ib * blockDim_; - auto ir_stop = (ib + 1) * blockDim_; - std::set col_set; - for (auto jk = h_crs_row_map(ir_start); jk < h_crs_row_map(ir_stop); - ++jk) { - col_set.insert(h_crs_entries(jk) / blockDim_); - } - for (auto col_block : col_set) { - h_entries(ientry++) = col_block; - } - } - Kokkos::deep_copy(graph.entries, h_entries); - - // Copy the numerical values - - typename values_type::HostMirror h_crs_values = - Kokkos::create_mirror_view(crs_mtx.values); - Kokkos::deep_copy(h_crs_values, crs_mtx.values); - - typename values_type::HostMirror h_values = - Kokkos::create_mirror_view(values); - if (h_values.extent(0) < size_t(numBlocks * blockDim_ * blockDim_)) { - Kokkos::resize(h_values, numBlocks * blockDim_ * blockDim_); - Kokkos::resize(values, numBlocks * blockDim_ * blockDim_); - } - Kokkos::deep_copy(h_values, 0); - - for (OrdinalType ir = 0; ir < crs_mtx.numRows(); ++ir) { - const auto iblock = ir / blockDim_; - const auto ilocal = ir % blockDim_; - for (auto jk = h_crs_row_map(ir); jk < h_crs_row_map(ir + 1); 
++jk) { - const auto jc = h_crs_entries(jk); - const auto jblock = jc / blockDim_; - const auto jlocal = jc % blockDim_; - for (auto jkb = h_row_map(iblock); jkb < h_row_map(iblock + 1); ++jkb) { - if (h_entries(jkb) == jblock) { - OrdinalType shift = jkb * blockDim_ * blockDim_; - h_values(shift + ilocal * blockDim_ + jlocal) = h_crs_values(jk); - break; - } - } - } - } - Kokkos::deep_copy(values, h_values); - } - - /// \brief Given an array of blocks, sum the values into corresponding - /// block in BsrMatrix - /// \param rowi [in] is a block-row index - /// \param cols[] [in] are block colidxs within the block-row to be summed - /// into ncol entries - /// \param ncol [in] is number of blocks referenced in cols[] array - /// \param vals[] [in] array containing 'block' of values - /// ncol*block_size*block_size entries - /// assume vals block is provided in 'LayoutRight' or 'Row Major' - /// format, that is e.g. 2x2 block [ a b ; c d ] provided as flattened - /// 1d array as [a b c d] Assume that each block is stored contiguously - /// in vals: [a b; c d] [e f; g h] -> [a b c d e f g h] If so, then i - /// in [0, ncols) for cols[] maps to i*block_size*block_size in vals[] - /// \param is_sorted [in] - /// \param force_atomic [in] - KOKKOS_INLINE_FUNCTION - OrdinalType sumIntoValues(const OrdinalType rowi, const OrdinalType cols[], - const OrdinalType ncol, const ScalarType vals[], - const bool is_sorted = false, - const bool force_atomic = false) const { - return operateValues(BsrMatrix::valueOperation::ADD, rowi, cols, ncol, vals, - is_sorted, force_atomic); - } - - /// \brief Given an array of blocks, replace the values of corresponding - /// blocks in BsrMatrix - /// \param rowi [in] is a block-row index - /// \param cols[] [in] are block colidxs within the block-row to be summed - /// into ncol entries - /// \param ncol [in] is number of blocks referenced in cols[] array - /// \param vals[] [in] array containing 'block' of values - /// 
ncol*block_size*block_size entries - /// assume vals block is provided in 'LayoutRight' or 'Row - /// Major' format, that is e.g. 2x2 block [ a b ; c d ] provided - /// as flattened 1d array as [a b c d] Assume that each block is - /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c - /// d e f g h] If so, then i in [0, ncols) for cols[] maps to - /// i*block_size*block_size in vals[] - /// \param is_sorted [in] - /// \param force_atomic [in] - KOKKOS_INLINE_FUNCTION - OrdinalType replaceValues(const OrdinalType rowi, const OrdinalType cols[], - const OrdinalType ncol, const ScalarType vals[], - const bool is_sorted = false, - const bool force_atomic = false) const { - return operateValues(BsrMatrix::valueOperation::ASSIGN, rowi, cols, ncol, - vals, is_sorted, force_atomic); - } - - //! Attempt to assign the input matrix to \c *this. - // Are the CUDA sparse handles needed to be copied here?? - template - BsrMatrix& operator=(const BsrMatrix& mtx) { - numCols_ = mtx.numCols(); - blockDim_ = mtx.blockDim(); - graph = mtx.graph; - values = mtx.values; - dev_config = mtx.dev_config; - return *this; - } - - //! The number of rows in the sparse matrix. - KOKKOS_INLINE_FUNCTION ordinal_type numRows() const { - return graph.numRows(); - } - - //! The number of columns in the sparse matrix. - KOKKOS_INLINE_FUNCTION ordinal_type numCols() const { return numCols_; } - - //! The block dimension in the sparse block matrix. - KOKKOS_INLINE_FUNCTION ordinal_type blockDim() const { return blockDim_; } - - //! The number of "point" (non-block) rows in the matrix. - // This is the dimension of the range of this matrix as a linear operator. - KOKKOS_INLINE_FUNCTION ordinal_type numPointRows() const { - return numRows() * blockDim(); - } - - //! The number of "point" (non-block) columns in the matrix. - // This is the dimension of the domain of this matrix as a linear operator. 
- KOKKOS_INLINE_FUNCTION ordinal_type numPointCols() const { - return numCols() * blockDim(); - } - - //! The number of stored entries in the sparse matrix. - KOKKOS_INLINE_FUNCTION size_type nnz() const { - return graph.entries.extent(0); - } - - friend struct BsrRowView; - - /// \brief Return a BsrRowView of block-row i of the matrix. - /// - /// If row i does not belong to the matrix, return an empty view. - /// - /// The returned object \c view implements the following interface: - ///
    - ///
  • \c view.length is the number of entries (i.e. blocks) - /// in the block row
  • - ///
  • \c view.local_row_in_block_row(K, i) returns a nonconst pointer - /// to the values of the ith local row in the k-th block of the block-row - ///
  • - ///
  • \c view.full_row_in_block_row(i) returns a nonconst pointer - /// to the values of the ith local row of the block-row
  • - ///
  • \c view.local_block_value(K, i, j) returns a nonconst reference - /// to the value in the ith local row and jth local col - /// of the k-th block of the block-row
  • - ///
  • \c view.block(K) returns an unmanaged 2D strided Kokkos::View - /// of the values of the k-th block of the block-row
  • - ///
- /// - /// Users should not rely on the return type of this method. They - /// should instead assign to 'auto'. - /// - KOKKOS_INLINE_FUNCTION - BsrRowView block_row(const ordinal_type i) const { - const size_type start = - graph.row_map(i); // total num blocks prior to this block-row - const auto count = static_cast( - graph.row_map(i + 1) - start); // num blocks in this row - - if (count == 0) { - return BsrRowView(nullptr, nullptr, 1, 0); - } else { - return BsrRowView(values, graph.entries, blockDim(), count, - start); - } - } - - /// \brief Return a BsrRowViewConst of block-row i of the matrix. - /// - /// If row i does not belong to the matrix, return an empty view. - /// - /// The returned object \c view implements the following interface: - ///
    - ///
  • \c view.length is the number of entries (i.e. blocks) - /// in the block row
  • - ///
  • \c view.local_row_in_block_row(K, i) returns a nonconst pointer - /// to the values of the ith local row in the k-th block of the block-row - ///
  • - ///
  • \c view.full_row_in_block_row(i) returns a nonconst pointer - /// to the values of the ith local row of the block-row
  • - ///
  • \c view.local_block_value(K, i, j) returns a nonconst reference - /// to the value in the ith local row and jth local col - /// of the k-th block of the block-row
  • - ///
  • \c view.block(K) returns an unmanaged 2D strided Kokkos::View - /// of the values of the k-th block of the block-row
  • - ///
- /// - /// Users should not rely on the return type of this method. They - /// should instead assign to 'auto'. - /// - KOKKOS_INLINE_FUNCTION - BsrRowViewConst block_row_Const(const ordinal_type i) const { - const size_type start = - graph.row_map(i); // total num blocks prior to this block-row - const auto count = static_cast( - graph.row_map(i + 1) - start); // num blocks in this row - - if (count == 0) { - return BsrRowViewConst(nullptr, nullptr, 1, 0); - } else { - return BsrRowViewConst(values, graph.entries, blockDim(), - count, start); - } - } - - protected: - enum class valueOperation { ADD, ASSIGN }; - - /// \brief Given an array of blocks, operate on the values of corresponding - /// blocks in BsrMatrix - /// \param op - /// \param rowi [in] is a block-row index - /// \param ncol [in] is number of blocks referenced in cols[] array - /// \param cols[] [in] are block colidxs within the block-row to be op-ed - /// into ncol entries - /// \param vals[] [in] array containing 'block' of values - /// ncol*block_size*block_size entries - /// assume vals block is provided in 'LayoutRight' or 'Row - /// Major' format, that is e.g. 
2x2 block [ a b ; c d ] provided - /// as flattened 1d array as [a b c d] Assume that each block is - /// stored contiguously in vals: [a b; c d] [e f; g h] -> [a b c - /// d e f g h] If so, then i in [0, ncols) for cols[] maps to - /// i*block_size*block_size in vals[] - /// \param is_sorted [in] - /// \param force_atomic [in] - KOKKOS_INLINE_FUNCTION - OrdinalType operateValues(const BsrMatrix::valueOperation op, - const OrdinalType rowi, const OrdinalType cols[], - const OrdinalType ncol, const ScalarType vals[], - const bool is_sorted = false, - const bool force_atomic = false) const { - BsrRowView row_view = this->block_row(rowi); - const ordinal_type block_size = this->blockDim(); - - ordinal_type numValid = 0; // number of valid local column indices - - for (ordinal_type i = 0; i < ncol; ++i) { - // Find offset into values for block-row rowi and colidx cols[i] - // cols[i] is the index to match - // blk_offset is the offset for block colidx from bptr[rowi] to bptr[rowi - // + 1] (not global offset) colidx_ and values_ are already offset to the - // beginning of blockrow rowi - auto blk_offset = row_view.findRelBlockOffset(cols[i], is_sorted); - if (blk_offset != Kokkos::ArithTraits::max()) { - ordinal_type offset_into_vals = - i * block_size * - block_size; // stride == 1 assumed between elements - for (ordinal_type lrow = 0; lrow < block_size; ++lrow) { - auto local_row_values = row_view.local_row_in_block( - blk_offset, lrow); // pointer to start of specified local row - // within this block - switch (op) { - case BsrMatrix::valueOperation::ADD: { - for (ordinal_type lcol = 0; lcol < block_size; ++lcol) { - if (force_atomic) { - Kokkos::atomic_add( - &(local_row_values[lcol]), - vals[offset_into_vals + lrow * block_size + lcol]); - } else { - local_row_values[lcol] += - vals[offset_into_vals + lrow * block_size + lcol]; - } - } - break; - } - case BsrMatrix::valueOperation::ASSIGN: { - for (ordinal_type lcol = 0; lcol < block_size; ++lcol) { - if 
(force_atomic) { - Kokkos::atomic_assign( - &(local_row_values[lcol]), - vals[offset_into_vals + lrow * block_size + lcol]); - } else { - local_row_values[lcol] = - vals[offset_into_vals + lrow * block_size + lcol]; - } - } - break; - } - } - } - ++numValid; - } - } // end for ncol - return numValid; - } - - private: - ordinal_type numCols_ = 0; - ordinal_type blockDim_ = 1; // TODO Assuming square blocks for now -}; - -//---------------------------------------------------------------------------- -/// \class is_bsr_matrix -/// \brief is_bsr_matrix::value is true if T is a BsrMatrix<...>, false -/// otherwise -template -struct is_bsr_matrix : public std::false_type {}; -template -struct is_bsr_matrix> : public std::true_type {}; -template -struct is_bsr_matrix> : public std::true_type {}; -//---------------------------------------------------------------------------- - -} // namespace Experimental -} // namespace KokkosSparse -#endif From b2581bb2de79529d75b85714f26279b11528cb1d Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Mon, 12 Jun 2023 13:54:08 -0600 Subject: [PATCH 430/442] Apply clang format --- sparse/unit_test/Test_Sparse_spiluk.hpp | 23 ++++++++++++----------- sparse/unit_test/Test_Sparse_sptrsv.hpp | 9 +++++---- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 3115bc9649..a7187d4066 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -47,9 +47,9 @@ namespace Test { template void run_test_spiluk() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; + typedef Kokkos::View RowMapType; + typedef Kokkos::View EntriesType; + typedef Kokkos::View ValuesType; typedef Kokkos::ArithTraits AT; const size_type nrows = 9; @@ -265,9 +265,9 @@ void run_test_spiluk() { template void run_test_spiluk_streams(int test_algo, int nstreams) { - using 
RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; using RowMapType_hostmirror = typename RowMapType::HostMirror; using EntriesType_hostmirror = typename EntriesType::HostMirror; using ValuesType_hostmirror = typename ValuesType::HostMirror; @@ -278,7 +278,8 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { using crsMat_t = CrsMatrix; using AT = Kokkos::ArithTraits; - // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition + // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of + // not enough resource to partition bool run_streams_test = true; #ifdef KOKKOS_ENABLE_OPENMP if (std::is_same::value) { @@ -287,13 +288,13 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { int num_threads = std::atoi(env_omp_num_threads); if (num_threads < nstreams) { run_streams_test = false; - std::cout << " Skip stream test: omp_num_threads = " << num_threads << std::endl; + std::cout << " Skip stream test: omp_num_threads = " << num_threads + << std::endl; } } } #endif - if (!run_streams_test) - return; + if (!run_streams_test) return; const size_type nrows = 9; const size_type nnz = 21; @@ -309,7 +310,7 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); + std::vector kh_ptr_v(nstreams); std::vector A_row_map_v(nstreams); std::vector A_entries_v(nstreams); std::vector A_values_v(nstreams); diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 2425fb4c27..70f7eb9ee6 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -1064,7 +1064,8 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { 
size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using crsMat_t = CrsMatrix; - // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of not enough resource to partition + // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of + // not enough resource to partition bool run_streams_test = true; #ifdef KOKKOS_ENABLE_OPENMP if (std::is_same::value) { @@ -1073,13 +1074,13 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { int num_threads = std::atoi(env_omp_num_threads); if (num_threads < nstreams) { run_streams_test = false; - std::cout << " Skip stream test: omp_num_threads = " << num_threads << std::endl; + std::cout << " Skip stream test: omp_num_threads = " << num_threads + << std::endl; } } } #endif - if (!run_streams_test) - return; + if (!run_streams_test) return; scalar_t ZERO = scalar_t(0); scalar_t ONE = scalar_t(1); From c62d07442258c6cbc182e8ed9260b6b79695774a Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 12 Jun 2023 17:10:03 -0600 Subject: [PATCH 431/442] cm_test_all_sandia: updates for blake - remove unsupported compilers - add new compilers --- scripts/cm_test_all_sandia | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 73c8d38497..98c5db89df 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -708,10 +708,9 @@ elif [ "$MACHINE" = "blake" ]; then BASE_MODULE_LIST="cmake/3.19.3,/" BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" - BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/" + BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/,/oneAPI/hpc-toolkit/" ONEAPI_WARNING_FLAGS="" - GCC72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gcc/7.2.0" GCC102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21/gcc/10.2.0" if [ "$SPOT_CHECK" = "True" ]; then @@ -719,31 +718,26 @@ elif [ "$MACHINE" = "blake" ]; then # TODO: 
Failing toolchains: #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" + COMPILERS=("clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" "intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gcc/10.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/11.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - # TODO: Failing toolchains: - #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" - "intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gcc/10.2.0 $GCC102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else - COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.2.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST 
icpx $ONEAPI_WARNING_FLAGS" + "intel/2021.4.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" + "intel/2022.1.2 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" ) From 4dbb5838ee861eeff62b04b4790c88b00630301e Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 12 Jun 2023 16:15:07 -0700 Subject: [PATCH 432/442] Check concurrency with nstream instead --- sparse/unit_test/Test_Sparse_spiluk.hpp | 15 ++++++--------- sparse/unit_test/Test_Sparse_sptrsv.hpp | 15 ++++++--------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index a7187d4066..2d6aabb347 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -278,19 +278,16 @@ void run_test_spiluk_streams(int test_algo, int nstreams) { using crsMat_t = CrsMatrix; using AT = Kokkos::ArithTraits; - // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of + // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition bool run_streams_test = true; #ifdef KOKKOS_ENABLE_OPENMP if (std::is_same::value) { - const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); - if (env_omp_num_threads != nullptr) { - int num_threads = std::atoi(env_omp_num_threads); - if (num_threads < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: omp_num_threads = " << num_threads - << std::endl; - } + int exec_concurrency = 
execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; } } #endif diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 70f7eb9ee6..fb747be6d4 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -1064,19 +1064,16 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; using crsMat_t = CrsMatrix; - // Workaround for OpenMP: skip tests if OMP_NUM_THREADS < nstreams because of + // Workaround for OpenMP: skip tests if concurrency < nstreams because of // not enough resource to partition bool run_streams_test = true; #ifdef KOKKOS_ENABLE_OPENMP if (std::is_same::value) { - const char *env_omp_num_threads = std::getenv("OMP_NUM_THREADS"); - if (env_omp_num_threads != nullptr) { - int num_threads = std::atoi(env_omp_num_threads); - if (num_threads < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: omp_num_threads = " << num_threads - << std::endl; - } + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; } } #endif From 77745756f6dfedd559f41e7aa3dc25dd728136e6 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 12 Jun 2023 18:33:55 -0700 Subject: [PATCH 433/442] Add tests for nstreams=1 --- sparse/unit_test/Test_Sparse_spiluk.hpp | 10 +++++++++- sparse/unit_test/Test_Sparse_sptrsv.hpp | 13 ++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 2d6aabb347..77cdb1ede1 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -297,7 +297,9 @@ 
void run_test_spiluk_streams(int test_algo, int nstreams) { const size_type nnz = 21; std::vector instances; - if (nstreams == 2) + if (nstreams == 1) + instances = Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); else if (nstreams == 3) instances = @@ -481,6 +483,9 @@ void test_spiluk() { template void test_spiluk_streams() { + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; + Test::run_test_spiluk_streams(0, 1); + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; Test::run_test_spiluk_streams(0, 2); @@ -490,6 +495,9 @@ void test_spiluk_streams() { std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; Test::run_test_spiluk_streams(0, 4); + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; + Test::run_test_spiluk_streams(1, 1); + std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; Test::run_test_spiluk_streams(1, 2); diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index fb747be6d4..1a4c78e08e 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -1086,7 +1086,9 @@ void run_test_sptrsv_streams(int test_algo, int nstreams) { const size_type nnz = 10; std::vector instances; - if (nstreams == 2) + if (nstreams == 1) + instances = Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); else if (nstreams == 3) instances = @@ -1312,6 +1314,9 @@ void test_sptrsv() { template void test_sptrsv_streams() { + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; + Test::run_test_sptrsv_streams(0, 1); + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; Test::run_test_sptrsv_streams(0, 2); @@ -1321,6 +1326,9 @@ void 
test_sptrsv_streams() { std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; Test::run_test_sptrsv_streams(0, 4); + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; + Test::run_test_sptrsv_streams(1, 1); + std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; Test::run_test_sptrsv_streams(1, 2); @@ -1333,6 +1341,9 @@ void test_sptrsv_streams() { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) if (std::is_same::value && std::is_same::value) { + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 1 stream" << std::endl; + Test::run_test_sptrsv_streams(2, 1); + std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; Test::run_test_sptrsv_streams(2, 2); From e624a7d3bb1a24565c08408a22f72c3054849884 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 13 Jun 2023 13:02:22 -0600 Subject: [PATCH 434/442] Update to version 4.1.00 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bbaa3f2ffe..fa666ab33e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 0) -SET(KokkosKernels_VERSION_PATCH 99) +SET(KokkosKernels_VERSION_MINOR 1) +SET(KokkosKernels_VERSION_PATCH 00) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file From 7871bd233d381ecd4ef991897217b1d1ecd33253 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 14 Jun 2023 16:56:13 -0600 Subject: [PATCH 435/442] Merge pull request #1867 from bartlettroscoe/tril-11966-bad-batched-incl-dir KokkosKernels: Don't list include for non-existant 'batched' build dir (trilinos/Trilinos#11966) (cherry picked from commit 
6fdd65894a8e62bbce58a58059df0940fde0d58d) --- batched/CMakeLists.txt | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/batched/CMakeLists.txt b/batched/CMakeLists.txt index 3f13ac5084..3103dfa8a0 100644 --- a/batched/CMakeLists.txt +++ b/batched/CMakeLists.txt @@ -17,9 +17,18 @@ IF (NOT KokkosKernels_ENABLE_COMPONENT_BLAS) LIST(APPEND SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/batched/KokkosBatched_Util.cpp) ENDIF() -# Adding unit-tests -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/batched) -KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}/batched) +IF(KokkosKernels_ENABLE_TESTS OR KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) + # Adding unit-tests + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}/batched) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING + ${CMAKE_CURRENT_SOURCE_DIR}/batched) +ENDIF() +# NOTE: Above, the build directory 'batched' is not created unless unit tests +# are actually enabled (which are actually included from the base-level +# CMakeLists.txt file). And the KokkosKernelsTargets.cmake file that gets +# generated from this CMake package in the build dir will be broken if these +# are listed in the `INTERFACE_INCLUDE_DIRECTORIES` property when the build +# `batched` is not created (see Trilinos PR #11966). 
KOKKOSKERNELS_GENERATE_ETI(Batched_Gemm_nt_nt_bll Gemm COMPONENTS batched From a0d99bf69fc2e0677cf0a741c6468cb0b800c881 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Thu, 15 Jun 2023 16:15:45 -0600 Subject: [PATCH 436/442] Merge pull request #1868 from lucbv/MKL_INT MKL: support indices properly (cherry picked from commit e8a64dd9033b37c1c80504fa4ed4bc27be1cd146) --- .../KokkosBlas2_serial_gemv_tpl_spec_decl.hpp | 2 +- perf_test/sparse/KokkosSparse_spadd.cpp | 21 +- ...osSparse_spgemm_noreuse_tpl_spec_avail.hpp | 31 +-- ...kosSparse_spgemm_noreuse_tpl_spec_decl.hpp | 60 +++--- ...osSparse_spgemm_numeric_tpl_spec_avail.hpp | 72 +++---- ...kosSparse_spgemm_numeric_tpl_spec_decl.hpp | 20 +- ...sSparse_spgemm_symbolic_tpl_spec_avail.hpp | 48 ++--- ...osSparse_spgemm_symbolic_tpl_spec_decl.hpp | 100 +++++----- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 9 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 183 ++++++++++-------- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 9 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 72 +++---- 12 files changed, 340 insertions(+), 287 deletions(-) diff --git a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index f689ba079c..6f6a7a2e9f 100644 --- a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -19,7 +19,7 @@ #include "KokkosBlas_util.hpp" #include "KokkosBatched_Vector.hpp" -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && !defined(KOKKOS_ENABLE_SYCL) #include "mkl_version.h" #if __INTEL_MKL__ >= 2018 #define __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ 1 diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index f69d24d523..e8a0b19419 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -328,14 +328,19 @@ void run_experiment(int argc, char** argv, CommonInputParams) { #ifdef 
KOKKOSKERNELS_ENABLE_TPL_MKL sparse_matrix_t Amkl, Bmkl, Cmkl; if (params.use_mkl) { - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), - (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), - A.values.data())); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), - (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), - B.values.data())); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), + (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), + A.values.data())); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), + (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), + B.values.data())); + } else { + throw std::runtime_error( + "MKL configured with long long int not supported in Kokkos Kernels"); + } } #endif diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp index 81d3273e17..ea3edb518f 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_NOREUSE_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" +#endif + namespace KokkosSparse { namespace Impl { @@ -59,18 +63,21 @@ SPGEMM_NOREUSE_AVAIL_CUSPARSE_S(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NOREUSE_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_noreuse_tpl_spec_avail< \ - KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const 
int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>> { \ - enum : bool { value = true }; \ +#define SPGEMM_NOREUSE_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_noreuse_tpl_spec_avail< \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>> { \ + enum : bool { value = true }; \ }; #define SPGEMM_NOREUSE_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp index f3d32a01fb..1067f3924f 100644 --- a/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_noreuse_tpl_spec_decl.hpp @@ -226,33 +226,39 @@ Matrix spgemm_noreuse_mkl(const MatrixConst &A, const MatrixConst &B) { return Matrix("C", m, k, c_nnz, valuesC, row_mapC, entriesC); } -#define SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_NOREUSE< \ - KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ - true, TPL_AVAIL> { \ - using Matrix = KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int>; \ - using ConstMatrix = KokkosSparse::CrsMatrix< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int>; \ - static KokkosSparse::CrsMatrix< \ - SCALAR, int, Kokkos::Device, void, int> \ - spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ - std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - Matrix C = spgemm_noreuse_mkl(A, B); \ - 
Kokkos::Profiling::popRegion(); \ - return C; \ - } \ +#define SPGEMM_NOREUSE_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ + template <> \ + struct SPGEMM_NOREUSE< \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + true, TPL_AVAIL> { \ + using Matrix = \ + KokkosSparse::CrsMatrix, void, \ + MKL_INT>; \ + using ConstMatrix = KokkosSparse::CrsMatrix< \ + const SCALAR, const MKL_INT, Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>; \ + static KokkosSparse::CrsMatrix, \ + void, MKL_INT> \ + spgemm_noreuse(const ConstMatrix &A, bool, const ConstMatrix &B, bool) { \ + std::string label = "KokkosSparse::spgemm_noreuse[TPL_MKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + Matrix C = spgemm_noreuse_mkl(A, B); \ + Kokkos::Profiling::popRegion(); \ + return C; \ + } \ }; #define SPGEMM_NOREUSE_DECL_MKL_SE(SCALAR, EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index bfba70d913..e144b53162 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_NUMERIC_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" +#endif + namespace KokkosSparse { namespace Impl { @@ -129,40 +133,40 @@ SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_numeric_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - 
Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_numeric_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_NUMERIC_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index cda083a6b5..6c87c60caf 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -553,30 +553,30 @@ void spgemm_numeric_mkl( #define SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ template <> \ struct SPGEMM_NUMERIC, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ - Kokkos::View, \ Kokkos::MemoryTraits>, \ Kokkos::View>, \ true, TPL_AVAIL> { \ using KernelHandle = 
KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ Kokkos::HostSpace>; \ using c_int_view_t = \ - Kokkos::View, \ Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ Kokkos::MemoryTraits>; \ using c_scalar_view_t = \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 80454be92b..1fcfa7132a 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -19,6 +19,10 @@ #ifndef KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPGEMM_SYMBOLIC_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -101,28 +105,28 @@ SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_symbolic_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_symbolic_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits 
>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_SYMBOLIC_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index 5db0fa18a9..e662934d00 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -594,8 +594,10 @@ void spgemm_symbolic_mkl( handle->set_c_nnz(0); return; } - MKLMatrix A(m, n, (int *)rowptrA.data(), (int *)colidxA.data(), nullptr); - MKLMatrix B(n, k, (int *)rowptrB.data(), (int *)colidxB.data(), nullptr); + MKLMatrix A(m, n, (MKL_INT *)rowptrA.data(), (MKL_INT *)colidxA.data(), + nullptr); + MKLMatrix B(n, k, (MKL_INT *)rowptrB.data(), (MKL_INT *)colidxB.data(), + nullptr); sparse_matrix_t C; matrix_descr generalDescr; generalDescr.type = SPARSE_MATRIX_TYPE_GENERAL; @@ -621,53 +623,53 @@ void spgemm_symbolic_mkl( handle->set_c_nnz(rowptrC(m)); } -#define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_SYMBOLIC< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ - const int, const int, const SCALAR, EXEC, Kokkos::HostSpace, \ - Kokkos::HostSpace>; \ - using c_int_view_t = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void spgemm_symbolic(KernelHandle *handle, \ - typename KernelHandle::nnz_lno_t m, \ - typename KernelHandle::nnz_lno_t n, \ - typename 
KernelHandle::nnz_lno_t k, \ - c_int_view_t row_mapA, c_int_view_t entriesA, \ - bool, c_int_view_t row_mapB, \ - c_int_view_t entriesB, bool, \ - int_view_t row_mapC, bool) { \ - std::string label = "KokkosSparse::spgemm_symbolic[TPL_MKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spgemm_symbolic_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, \ - entriesA, row_mapB, entriesB, row_mapC); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ + template <> \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const MKL_INT, const MKL_INT, const SCALAR, EXEC, Kokkos::HostSpace, \ + Kokkos::HostSpace>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spgemm_symbolic(KernelHandle *handle, \ + typename KernelHandle::nnz_lno_t m, \ + typename KernelHandle::nnz_lno_t n, \ + typename KernelHandle::nnz_lno_t k, \ + c_int_view_t row_mapA, c_int_view_t entriesA, \ + bool, c_int_view_t row_mapB, \ + c_int_view_t entriesB, bool, \ + int_view_t row_mapC, bool) { \ + std::string label = "KokkosSparse::spgemm_symbolic[TPL_MKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spgemm_symbolic_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, \ + entriesA, row_mapB, entriesB, row_mapC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define SPGEMM_SYMBOLIC_DECL_MKL_SE(SCALAR, EXEC) \ diff --git 
a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 6846e27748..b9c1f6c1dd 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -17,6 +17,10 @@ #ifndef KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Experimental { namespace Impl { @@ -124,8 +128,9 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 36a64228b8..c6136eab3e 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -42,14 +42,15 @@ inline matrix_descr getDescription() { } inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, - int m, int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, + MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + 
const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL( @@ -57,15 +58,15 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, } inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, - const double* Avalues, const double* x, - double* y) { + double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, + const double* x, double* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL( @@ -74,16 +75,17 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ -95,16 +97,17 @@ inline void spmv_block_impl_mkl(sparse_operation_t 
op, inline void spmv_block_impl_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); matrix_descr A_descr = getDescription(); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; @@ -115,15 +118,16 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, } inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, - float beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, - const float* Avalues, const float* x, - int colx, int ldx, float* y, int ldy) { + float beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, MKL_INT colx, MKL_INT ldx, + float* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, @@ -132,15 +136,17 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, } inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, int m, int n, int b, - const int* Arowptrs, const int* Aentries, + 
double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, const double* x, - int colx, int ldx, double* y, int ldy) { + MKL_INT colx, MKL_INT ldx, double* y, + MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, @@ -148,19 +154,17 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, ldx, beta, y, ldy)); } -inline void spm_mv_block_impl_mkl(sparse_operation_t op, - Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, - const int* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, - int ldx, Kokkos::complex* y, int ldy) { +inline void spm_mv_block_impl_mkl( + sparse_operation_t op, Kokkos::complex alpha, + Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Kokkos::complex* Avalues, const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; @@ -173,15 +177,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, inline void spm_mv_block_impl_mkl( sparse_operation_t op, 
Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, int b, const int* Arowptrs, - const int* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, int ldx, - Kokkos::complex* y, int ldy) { + Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Kokkos::complex* Avalues, const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { sparse_matrix_t A_mkl; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); matrix_descr A_descr = getDescription(); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; @@ -196,25 +200,26 @@ inline void spm_mv_block_impl_mkl( #if (__INTEL_MKL__ == 2017) -inline void spmv_block_impl_mkl(char mode, float alpha, float beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, +inline void spmv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { mkl_sbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spmv_block_impl_mkl(char mode, double alpha, double beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const double* Avalues, +inline void spmv_block_impl_mkl(char mode, double alpha, double beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { mkl_dbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - 
Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -229,8 +234,9 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, } inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -245,31 +251,31 @@ inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); } -inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const float* Avalues, - const float* x, int colx, int ldx, float* y, - int ldy) { +inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, MKL_INT colx, MKL_INT ldx, + float* y, MKL_INT ldy) { mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, int m, - int n, int b, const int* Arowptrs, - const int* Aentries, const double* Avalues, - const double* x, int colx, int ldx, double* y, - int ldy) { +inline void spm_mv_block_impl_mkl( + char mode, double alpha, double beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, + const double* x, MKL_INT colx, MKL_INT ldx, double* y, MKL_INT ldy) { mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, 
Arowptrs + 1, x, ldx, &beta, y, ldy); } inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - int b, const int* Arowptrs, - const int* Aentries, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Kokkos::complex* Avalues, - const Kokkos::complex* x, int colx, - int ldx, Kokkos::complex* y, int ldy) { + const Kokkos::complex* x, MKL_INT colx, + MKL_INT ldx, Kokkos::complex* y, + MKL_INT ldy) { const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); const MKL_Complex8* Avalues_mkl = @@ -280,11 +286,14 @@ inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); } -inline void spm_mv_block_impl_mkl( - char mode, Kokkos::complex alpha, Kokkos::complex beta, - int m, int n, int b, const int* Arowptrs, const int* Aentries, - const Kokkos::complex* Avalues, const Kokkos::complex* x, - int colx, int ldx, Kokkos::complex* y, int ldy) { +inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, + Kokkos::complex beta, MKL_INT m, + MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, + MKL_INT colx, MKL_INT ldx, + Kokkos::complex* y, MKL_INT ldy) { const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); @@ -301,16 +310,17 @@ inline void spm_mv_block_impl_mkl( #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ 
Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, int const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -358,8 +368,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, #define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV_MV_BSRMATRIX< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const**, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const**, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, \ SCALAR**, Kokkos::LayoutLeft, \ @@ -367,8 +378,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - BsrMatrix, int const>; \ + BsrMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -383,9 +394,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - int colx = static_cast(X.extent(1)); \ - int ldx = static_cast(X.stride_1()); \ - int ldy = static_cast(Y.stride_1()); \ + MKL_INT colx = static_cast(X.extent(1)); \ + MKL_INT ldx = static_cast(X.stride_1()); \ + MKL_INT ldy = static_cast(Y.stride_1()); \ spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ A.numCols(), A.blockDim(), A.graph.row_map.data(), \ A.graph.entries.data(), A.values.data(), X.data(), \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 4a92741cc5..060fef45bb 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ 
b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -17,6 +17,10 @@ #ifndef KOKKOSPARSE_SPMV_TPL_SPEC_AVAIL_HPP_ #define KOKKOSPARSE_SPMV_TPL_SPEC_AVAIL_HPP_ +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#endif + namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists @@ -214,8 +218,9 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ - const SCALAR, const int, Kokkos::Device, \ - Kokkos::MemoryTraits, const int, const SCALAR*, \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT, const SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 6cbd1fff29..ecbe45c7fd 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -522,41 +522,43 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, - int n, const int* Arowptrs, const int* Aentries, - const float* Avalues, const float* x, float* y) { +inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, + MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const float* Avalues, + const float* x, float* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, 
const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } -inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, - int n, const int* Arowptrs, const int* Aentries, - const double* Avalues, const double* x, double* y) { +inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, + MKL_INT m, MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const double* Avalues, + const double* x, double* y) { sparse_matrix_t A_mkl; matrix_descr A_descr; A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -566,8 +568,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex8 
beta_mkl{beta.real(), beta.imag()}; @@ -577,8 +579,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -588,8 +590,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), + &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; @@ -601,16 +603,17 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using AMatrix = \ - CrsMatrix, int const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -647,23 +650,23 @@ inline char mode_kk_to_mkl(char mode_kk) { "Invalid mode for MKL (should be one of N, T, H)"); } -inline void spmv_mkl(char mode, float alpha, float beta, int m, int n, - const int* 
Arowptrs, const int* Aentries, +inline void spmv_mkl(char mode, float alpha, float beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const float* Avalues, const float* x, float* y) { mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } -inline void spmv_mkl(char mode, double alpha, double beta, int m, int n, - const int* Arowptrs, const int* Aentries, +inline void spmv_mkl(char mode, double alpha, double beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const double* Avalues, const double* x, double* y) { mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); } inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -678,8 +681,8 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, } inline void spmv_mkl(char mode, Kokkos::complex alpha, - Kokkos::complex beta, int m, int n, - const int* Arowptrs, const int* Aentries, + Kokkos::complex beta, MKL_INT m, MKL_INT n, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) { @@ -697,16 +700,17 @@ inline void spmv_mkl(char mode, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV< \ - SCALAR const, int const, Kokkos::Device, \ - Kokkos::MemoryTraits, int const, SCALAR const*, \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const, SCALAR const*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, SCALAR*, \ Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ using device_type = 
Kokkos::Device; \ using AMatrix = \ - CrsMatrix, int const>; \ + CrsMatrix, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const*, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ From c1176142b3aabc4597232cb4826aadf202119960 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 16 Jun 2023 13:23:55 -0600 Subject: [PATCH 437/442] Update changelog for 4.1.00 --- CHANGELOG.md | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d582fc354f..69e8194c73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,122 @@ # Change Log +## [4.1.00](https://github.com/kokkos/kokkos-kernels/tree/4.1.00) (2023-06-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.01...4.1.00) + +### Features: +- Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) +- Stream interface for SPTRSV solve [\#1820](https://github.com/kokkos/kokkos-kernels/pull/1820) +- Blas2 and 3 on stream [\#1812](https://github.com/kokkos/kokkos-kernels/pull/1812) +- Blas1 on stream [\#1803](https://github.com/kokkos/kokkos-kernels/pull/1803) +- Norms on stream [\#1795](https://github.com/kokkos/kokkos-kernels/pull/1795) +- Add calls to KokkosBlas Gemv and Spmv for team batched kernels when m==1 [\#1770](https://github.com/kokkos/kokkos-kernels/pull/1770) +- Add BsrMatrix SpMV in rocSparse TPL, rewrite BsrMatrix SpMV unit tests [\#1769](https://github.com/kokkos/kokkos-kernels/pull/1769) +- Implementation for BLAS2 ger [\#1756](https://github.com/kokkos/kokkos-kernels/pull/1756) +- ODE: explicit integration methods [\#1754](https://github.com/kokkos/kokkos-kernels/pull/1754) +- Add calls to KokkosBlas Dot and Axpy for team batched kernels when m==1 [\#1753](https://github.com/kokkos/kokkos-kernels/pull/1753) +- Add exec instance support to sort/sort_and_merge utils 
[\#1744](https://github.com/kokkos/kokkos-kernels/pull/1744) +- Streams interface for SPILUK numeric [\#1728](https://github.com/kokkos/kokkos-kernels/pull/1728) +- sparse: Add coo2crs, crs2coo and CooMatrix [\#1686](https://github.com/kokkos/kokkos-kernels/pull/1686) + +### Enhancements: +- batched/eti: ETI host-level interfaces [\#1783](https://github.com/kokkos/kokkos-kernels/pull/1783) +- Add support for complex data types in MDF [\#1776](https://github.com/kokkos/kokkos-kernels/pull/1776) +- Refactor MKL TPL for both CPU and GPU usage [\#1779](https://github.com/kokkos/kokkos-kernels/pull/1779) +- Sort and merge improvements [\#1773](https://github.com/kokkos/kokkos-kernels/pull/1773) +- refactor blas3 tests to use benchmark library [\#1751](https://github.com/kokkos/kokkos-kernels/pull/1751) +- spgemm handle: check that A,B,C graphs never change [\#1742](https://github.com/kokkos/kokkos-kernels/pull/1742) +- add explicit tests of opt-in algorithms [\#1712](https://github.com/kokkos/kokkos-kernels/pull/1712) +- Adds team- and thread-based lower-bound and upper-bound search and predicates [\#1711](https://github.com/kokkos/kokkos-kernels/pull/1711) +- Adds KokkosKernels::Impl::Iota, a view-like where iota(i) = i + offset [\#1710](https://github.com/kokkos/kokkos-kernels/pull/1710) +- Use rocsparse_spmv_ex for rocm >= 5.4.0 [\#1701](https://github.com/kokkos/kokkos-kernels/pull/1701) +- Test mixed scalars: more fixes related to mixed scalar tests [\#1694](https://github.com/kokkos/kokkos-kernels/pull/1694) +- PERF TESTS: adding utilities and instantiation wrapper [\#1676](https://github.com/kokkos/kokkos-kernels/pull/1676) +- Added TplsVersion file and print methods [\#1693](https://github.com/kokkos/kokkos-kernels/pull/1693) +- Fix/enhance backend issues on spadd perftest [\#1672](https://github.com/kokkos/kokkos-kernels/pull/1672) +- Spgemm perf test enhancements [\#1664](https://github.com/kokkos/kokkos-kernels/pull/1664) +- Add basis skeleton for 
KokkosKernels::print_configuration [\#1665](https://github.com/kokkos/kokkos-kernels/pull/1665) +- MKL: support indices properly [\#1868](https://github.com/kokkos/kokkos-kernels/pull/1868) +- batched/dense: Add gesv DynRankView runtime checks [\#1850](https://github.com/kokkos/kokkos-kernels/pull/1850) + +### Build System: +- Do not change memory spaces instantiation defaults based on Kokkos_ENABLE_CUDA_UVM [\#1835](https://github.com/kokkos/kokkos-kernels/pull/1835) +- KokkosKernels: Remove TriBITS Kokkos subpackages (trilinos/Trilinos#11545) [\#1817](https://github.com/kokkos/kokkos-kernels/pull/1817) +- CMakeLists.txt: Add alias to match what is exported from Trilinos [\#1855](https://github.com/kokkos/kokkos-kernels/pull/1855) +- KokkosKernels: Don't list include for non-existant 'batched' build dir (trilinos/Trilinos#11966) [\#1867](https://github.com/kokkos/kokkos-kernels/pull/1867) +- Remove non-existant subdir kokkos-kernels/common/common (#11921, #11863) [\#1854](https://github.com/kokkos/kokkos-kernels/pull/1854) +- KokkosKernels: Remove non-existent common/src/[impl,tpls] include dirs (trilinos/Trilinos#11545) [\#1844](https://github.com/kokkos/kokkos-kernels/pull/1844) + +### Documentation and Testing: +- Enable sphinx werror [\#1856](https://github.com/kokkos/kokkos-kernels/pull/1856) +- Update cmake option naming in docs/comments [\#1849](https://github.com/kokkos/kokkos-kernels/pull/1849) +- docs/developer: Add Experimental namespace [\#1852](https://github.com/kokkos/kokkos-kernels/pull/1852) +- docs: Add profiling for compile times [\#1843](https://github.com/kokkos/kokkos-kernels/pull/1843) +- Ger: adding documentation stubs in apidocs [\#1822](https://github.com/kokkos/kokkos-kernels/pull/1822) +- .github/workflows: Summarize github-DOCS errors and warnings [\#1814](https://github.com/kokkos/kokkos-kernels/pull/1814) +- Blas1: docs update for PR #1803 [\#1805](https://github.com/kokkos/kokkos-kernels/pull/1805) +- apt-get update in hosted runner 
docs check [\#1797](https://github.com/kokkos/kokkos-kernels/pull/1797) +- scripts: Fix github-DOCS [\#1796](https://github.com/kokkos/kokkos-kernels/pull/1796) +- Add --enable-docs option to cm_generate_makefile [\#1785](https://github.com/kokkos/kokkos-kernels/pull/1785) +- docs: Add stubs for some sparse APIs [\#1768](https://github.com/kokkos/kokkos-kernels/pull/1768) +- .github: Update to actions/checkout@v3 [\#1767](https://github.com/kokkos/kokkos-kernels/pull/1767) +- docs: Include BatchedGemm [\#1765](https://github.com/kokkos/kokkos-kernels/pull/1765) +- .github: Automation reminder [\#1726](https://github.com/kokkos/kokkos-kernels/pull/1726) +- Allow an HTML-only docs build [\#1723](https://github.com/kokkos/kokkos-kernels/pull/1723) +- Add git information to benchmark context [\#1722](https://github.com/kokkos/kokkos-kernels/pull/1722) +- SYCL CI: Specify the full path to the compiler [\#1670](https://github.com/kokkos/kokkos-kernels/pull/1670) +- Add github DOCS ci check & disable Kokkos tests [\#1647](https://github.com/kokkos/kokkos-kernels/pull/1647) +- Add rocsparse,rocblas, to enabled TPLs in cm_test_all_sandia when --spot-check-tpls [\#1841](https://github.com/kokkos/kokkos-kernels/pull/1841) +- cm_test_all_sandia: update to add caraway queues for MI210, MI250 [\#1840](https://github.com/kokkos/kokkos-kernels/pull/1840) +- Support rocSparse in rocm 5.2.0 [\#1833](https://github.com/kokkos/kokkos-kernels/pull/1833) +- Add KokkosKernels_PullRequest_VEGA908_Tpls_ROCM520 support, only enable KokkosBlas::gesv where supported [\#1816](https://github.com/kokkos/kokkos-kernels/pull/1816) +- scripts: Include OMP settings [\#1801](https://github.com/kokkos/kokkos-kernels/pull/1801) +- Print the patch that clang-format-8 wants to apply [\#1714](https://github.com/kokkos/kokkos-kernels/pull/1714) + +### Benchmarks: +- Benchmark cleanup for par_ilut and spmv [\#1853](https://github.com/kokkos/kokkos-kernels/pull/1853) +- SpMV: adding benchmark for spmv 
[\#1821](https://github.com/kokkos/kokkos-kernels/pull/1821) +- New performance test for par_ilut, ginkgo::par_ilut, and spill [\#1799](https://github.com/kokkos/kokkos-kernels/pull/1799) +- Include OpenMP environment variables in benchmark context [\#1789](https://github.com/kokkos/kokkos-kernels/pull/1789) +- Re-enable and clean up triangle counting perf test [\#1752](https://github.com/kokkos/kokkos-kernels/pull/1752) +- Include google/benchmark lib version in benchmark output [\#1750](https://github.com/kokkos/kokkos-kernels/pull/1750) +- Refactor blas2 test for benchmark feature [\#1733](https://github.com/kokkos/kokkos-kernels/pull/1733) +- Adds a better parilut test with gmres [\#1661](https://github.com/kokkos/kokkos-kernels/pull/1661) +- Refactor blas1 test for benchmark feature [\#1636](https://github.com/kokkos/kokkos-kernels/pull/1636) + +### Cleanup: +- Drop outdated workarounds for backward compatibility with Kokkos [\#1836](https://github.com/kokkos/kokkos-kernels/pull/1836) +- Remove dead code guarded [\#1834](https://github.com/kokkos/kokkos-kernels/pull/1834) +- Remove decl ETI files [\#1824](https://github.com/kokkos/kokkos-kernels/pull/1824) +- Reorganize par_ilut performance test [\#1818](https://github.com/kokkos/kokkos-kernels/pull/1818) +- Deprecate Kokkos::Details::ArithTraits [\#1748](https://github.com/kokkos/kokkos-kernels/pull/1748) +- Drop obsolete workaround #ifdef KOKKOS_IF_ON_HOST [\#1720](https://github.com/kokkos/kokkos-kernels/pull/1720) +- Drop pre Kokkos 3.6 workaround [\#1653](https://github.com/kokkos/kokkos-kernels/pull/1653) +- View::Rank -> View::rank [\#1703](https://github.com/kokkos/kokkos-kernels/pull/1703) +- Prefer Kokkos::View::{R->r}ank [\#1679](https://github.com/kokkos/kokkos-kernels/pull/1679) +- Call concurrency(), not impl_thread_pool_size() [\#1666](https://github.com/kokkos/kokkos-kernels/pull/1666) +- Kokkos moves ALL_t out of Impl namespace [\#1658](https://github.com/kokkos/kokkos-kernels/pull/1658) +- 
Add KokkosKernels::Impl::are_integral_v helper variable template and quit using Kokkos::Impl::are_integral trait [\#1652](https://github.com/kokkos/kokkos-kernels/pull/1652) + +### Bug Fixes: +- Kokkos 4 compatibility: modifying the preprocessor logic [\#1827](https://github.com/kokkos/kokkos-kernels/pull/1827) +- blas/tpls: Fix gemm include guard typo [\#1848](https://github.com/kokkos/kokkos-kernels/pull/1848) +- spmv cusparse version check modified for cuda/11.1 [\#1828](https://github.com/kokkos/kokkos-kernels/pull/1828) +- Workaround for #1777 - cusparse spgemm test hang [\#1811](https://github.com/kokkos/kokkos-kernels/pull/1811) +- Fix 1798 [\#1800](https://github.com/kokkos/kokkos-kernels/pull/1800) +- BLAS: fixes and testing for LayoutStride [\#1794](https://github.com/kokkos/kokkos-kernels/pull/1794) +- Fix 1786: check that work array is contiguous in SVD [\#1793](https://github.com/kokkos/kokkos-kernels/pull/1793) +- Fix unused variable warnings [\#1790](https://github.com/kokkos/kokkos-kernels/pull/1790) +- Use KOKKOS_IMPL_DO_NOT_USE_PRINTF in Test_Common_UpperBound.hpp [\#1784](https://github.com/kokkos/kokkos-kernels/pull/1784) +- Batched Gesv: initializing variable to make compiler happy [\#1778](https://github.com/kokkos/kokkos-kernels/pull/1778) +- perf test utils: fix device ID parsing [\#1739](https://github.com/kokkos/kokkos-kernels/pull/1739) +- Fix OOB and improve comments in BsrMatrix COO constructor [\#1732](https://github.com/kokkos/kokkos-kernels/pull/1732) +- batched/unit_test: Disable simd dcomplex4 test in for intel > 19.05 and <= 2021. 
[\#1857](https://github.com/kokkos/kokkos-kernels/pull/1857) +- rocsparse spmv tpl: Fix rocsparse_spmv call for rocm < 5.4.0 [\#1716](https://github.com/kokkos/kokkos-kernels/pull/1716) +- compatibility with 4.0.0 [\#1709](https://github.com/kokkos/kokkos-kernels/pull/1709) +- team mult: fix type issue in max_error calculation [\#1706](https://github.com/kokkos/kokkos-kernels/pull/1706) +- cast Kokkos::Impl::integral_constant to int [\#1697](https://github.com/kokkos/kokkos-kernels/pull/1697) + + ## [4.0.01](https://github.com/kokkos/kokkos-kernels/tree/4.0.01) (2023-04-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.00...4.0.01) From 2579c4e3c45af8ba19707bd92cade17675fdcfef Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 16 Jun 2023 14:36:20 -0600 Subject: [PATCH 438/442] CHANGELOG: reorganizing the new features section --- CHANGELOG.md | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69e8194c73..f674ea4ec5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,21 +3,33 @@ ## [4.1.00](https://github.com/kokkos/kokkos-kernels/tree/4.1.00) (2023-06-16) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.0.01...4.1.00) -### Features: -- Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) -- Stream interface for SPTRSV solve [\#1820](https://github.com/kokkos/kokkos-kernels/pull/1820) -- Blas2 and 3 on stream [\#1812](https://github.com/kokkos/kokkos-kernels/pull/1812) -- Blas1 on stream [\#1803](https://github.com/kokkos/kokkos-kernels/pull/1803) -- Norms on stream [\#1795](https://github.com/kokkos/kokkos-kernels/pull/1795) -- Add calls to KokkosBlas Gemv and Spmv for team batched kernels when m==1 [\#1770](https://github.com/kokkos/kokkos-kernels/pull/1770) +### New Features + +#### BLAS updates +- Adding interface with execution space instance 
argument to support execution of BLAS on stream + - Norms on stream [\#1795](https://github.com/kokkos/kokkos-kernels/pull/1795) + - Blas1 on stream [\#1803](https://github.com/kokkos/kokkos-kernels/pull/1803) + - Blas2 and 3 on stream [\#1812](https://github.com/kokkos/kokkos-kernels/pull/1812) +- Improving BLAS level 2 support by adding native implementation and TPL for GER, HER and SYR + - Implementation for BLAS2 ger [\#1756](https://github.com/kokkos/kokkos-kernels/pull/1756) + - Implement BLAS2 syr() and her() functionalities under kokkos-kernels syr() [\#1837](https://github.com/kokkos/kokkos-kernels/pull/1837) + +#### Batched updates +- Optimizing algorithms for single input data + - Add calls to KokkosBlas Dot and Axpy for team batched kernels when m==1 [\#1753](https://github.com/kokkos/kokkos-kernels/pull/1753) + - Add calls to KokkosBlas Gemv and Spmv for team batched kernels when m==1 [\#1770](https://github.com/kokkos/kokkos-kernels/pull/1770) + +#### Sparse updates +- Adding stream support to ILUK/SPTRSV and sort/merge + - Streams interface for SPILUK numeric [\#1728](https://github.com/kokkos/kokkos-kernels/pull/1728) + - Stream interface for SPTRSV solve [\#1820](https://github.com/kokkos/kokkos-kernels/pull/1820) + - Add exec instance support to sort/sort_and_merge utils [\#1744](https://github.com/kokkos/kokkos-kernels/pull/1744) - Add BsrMatrix SpMV in rocSparse TPL, rewrite BsrMatrix SpMV unit tests [\#1769](https://github.com/kokkos/kokkos-kernels/pull/1769) -- Implementation for BLAS2 ger [\#1756](https://github.com/kokkos/kokkos-kernels/pull/1756) -- ODE: explicit integration methods [\#1754](https://github.com/kokkos/kokkos-kernels/pull/1754) -- Add calls to KokkosBlas Dot and Axpy for team batched kernels when m==1 [\#1753](https://github.com/kokkos/kokkos-kernels/pull/1753) -- Add exec instance support to sort/sort_and_merge utils [\#1744](https://github.com/kokkos/kokkos-kernels/pull/1744) -- Streams interface for SPILUK numeric 
[\#1728](https://github.com/kokkos/kokkos-kernels/pull/1728) - sparse: Add coo2crs, crs2coo and CooMatrix [\#1686](https://github.com/kokkos/kokkos-kernels/pull/1686) +#### Misc updates +- ODE: explicit integration methods [\#1754](https://github.com/kokkos/kokkos-kernels/pull/1754) + ### Enhancements: - batched/eti: ETI host-level interfaces [\#1783](https://github.com/kokkos/kokkos-kernels/pull/1783) - Add support for complex data types in MDF [\#1776](https://github.com/kokkos/kokkos-kernels/pull/1776) From a3c07dfad19bd664d55331af1c7c348563d85e81 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 16 Jun 2023 14:48:59 -0600 Subject: [PATCH 439/442] CHANGELOG: organizing enhancements section --- CHANGELOG.md | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f674ea4ec5..5024c1f319 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,29 +26,40 @@ - Add exec instance support to sort/sort_and_merge utils [\#1744](https://github.com/kokkos/kokkos-kernels/pull/1744) - Add BsrMatrix SpMV in rocSparse TPL, rewrite BsrMatrix SpMV unit tests [\#1769](https://github.com/kokkos/kokkos-kernels/pull/1769) - sparse: Add coo2crs, crs2coo and CooMatrix [\#1686](https://github.com/kokkos/kokkos-kernels/pull/1686) +- Adds team- and thread-based lower-bound and upper-bound search and predicates [\#1711](https://github.com/kokkos/kokkos-kernels/pull/1711) +- Adds KokkosKernels::Impl::Iota, a view-like where iota(i) = i + offset [\#1710](https://github.com/kokkos/kokkos-kernels/pull/1710) #### Misc updates - ODE: explicit integration methods [\#1754](https://github.com/kokkos/kokkos-kernels/pull/1754) ### Enhancements: + +#### BLAS +- refactor blas3 tests to use benchmark library [\#1751](https://github.com/kokkos/kokkos-kernels/pull/1751) + +#### Batched - batched/eti: ETI host-level interfaces [\#1783](https://github.com/kokkos/kokkos-kernels/pull/1783) +- batched/dense: Add gesv DynRankView runtime 
checks [\#1850](https://github.com/kokkos/kokkos-kernels/pull/1850) + +#### Sparse - Add support for complex data types in MDF [\#1776](https://github.com/kokkos/kokkos-kernels/pull/1776) -- Refactor MKL TPL for both CPU and GPU usage [\#1779](https://github.com/kokkos/kokkos-kernels/pull/1779) - Sort and merge improvements [\#1773](https://github.com/kokkos/kokkos-kernels/pull/1773) -- refactor blas3 tests to use benchmark library [\#1751](https://github.com/kokkos/kokkos-kernels/pull/1751) - spgemm handle: check that A,B,C graphs never change [\#1742](https://github.com/kokkos/kokkos-kernels/pull/1742) -- add explicit tests of opt-in algorithms [\#1712](https://github.com/kokkos/kokkos-kernels/pull/1712) -- Adds team- and thread-based lower-bound and upper-bound search and predicates [\#1711](https://github.com/kokkos/kokkos-kernels/pull/1711) -- Adds KokkosKernels::Impl::Iota, a view-like where iota(i) = i + offset [\#1710](https://github.com/kokkos/kokkos-kernels/pull/1710) -- Use rocsparse_spmv_ex for rocm >= 5.4.0 [\#1701](https://github.com/kokkos/kokkos-kernels/pull/1701) -- Test mixed scalars: more fixes related to mixed scalar tests [\#1694](https://github.com/kokkos/kokkos-kernels/pull/1694) -- PERF TESTS: adding utilities and instantiation wrapper [\#1676](https://github.com/kokkos/kokkos-kernels/pull/1676) -- Added TplsVersion file and print methods [\#1693](https://github.com/kokkos/kokkos-kernels/pull/1693) - Fix/enhance backend issues on spadd perftest [\#1672](https://github.com/kokkos/kokkos-kernels/pull/1672) - Spgemm perf test enhancements [\#1664](https://github.com/kokkos/kokkos-kernels/pull/1664) +- add explicit tests of opt-in algorithms in SpMV [\#1712](https://github.com/kokkos/kokkos-kernels/pull/1712) + +#### Common utilities +- Added TplsVersion file and print methods [\#1693](https://github.com/kokkos/kokkos-kernels/pull/1693) - Add basis skeleton for KokkosKernels::print_configuration 
[\#1665](https://github.com/kokkos/kokkos-kernels/pull/1665) +- Test mixed scalars: more fixes related to mixed scalar tests [\#1694](https://github.com/kokkos/kokkos-kernels/pull/1694) +- PERF TESTS: adding utilities and instantiation wrapper [\#1676](https://github.com/kokkos/kokkos-kernels/pull/1676) + +#### TPL support +- Refactor MKL TPL for both CPU and GPU usage [\#1779](https://github.com/kokkos/kokkos-kernels/pull/1779) - MKL: support indices properly [\#1868](https://github.com/kokkos/kokkos-kernels/pull/1868) -- batched/dense: Add gesv DynRankView runtime checks [\#1850](https://github.com/kokkos/kokkos-kernels/pull/1850) +- Use rocsparse_spmv_ex for rocm >= 5.4.0 [\#1701](https://github.com/kokkos/kokkos-kernels/pull/1701) + ### Build System: - Do not change memory spaces instantiation defaults based on Kokkos_ENABLE_CUDA_UVM [\#1835](https://github.com/kokkos/kokkos-kernels/pull/1835) From 9e9351bd161329107aab3ae1cc3b198623de375e Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Fri, 16 Jun 2023 15:02:43 -0600 Subject: [PATCH 440/442] CHANGELOG: small updates --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5024c1f319..8c54a36a6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -52,6 +52,7 @@ #### Common utilities - Added TplsVersion file and print methods [\#1693](https://github.com/kokkos/kokkos-kernels/pull/1693) - Add basis skeleton for KokkosKernels::print_configuration [\#1665](https://github.com/kokkos/kokkos-kernels/pull/1665) +- Add git information to benchmark context [\#1722](https://github.com/kokkos/kokkos-kernels/pull/1722) - Test mixed scalars: more fixes related to mixed scalar tests [\#1694](https://github.com/kokkos/kokkos-kernels/pull/1694) - PERF TESTS: adding utilities and instantiation wrapper [\#1676](https://github.com/kokkos/kokkos-kernels/pull/1676) @@ -85,7 +86,6 @@ - docs: Include BatchedGemm [\#1765](https://github.com/kokkos/kokkos-kernels/pull/1765) - 
.github: Automation reminder [\#1726](https://github.com/kokkos/kokkos-kernels/pull/1726) - Allow an HTML-only docs build [\#1723](https://github.com/kokkos/kokkos-kernels/pull/1723) -- Add git information to benchmark context [\#1722](https://github.com/kokkos/kokkos-kernels/pull/1722) - SYCL CI: Specify the full path to the compiler [\#1670](https://github.com/kokkos/kokkos-kernels/pull/1670) - Add github DOCS ci check & disable Kokkos tests [\#1647](https://github.com/kokkos/kokkos-kernels/pull/1647) - Add rocsparse,rocblas, to enabled TPLs in cm_test_all_sandia when --spot-check-tpls [\#1841](https://github.com/kokkos/kokkos-kernels/pull/1841) From 1592d9ed93f74da1d1510fb7369139e2cad6e65e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 16 Jun 2023 16:37:36 -0600 Subject: [PATCH 441/442] Merge pull request #1874 from ndellingwood/fix-compatibility-kokkos-4.0 Backward-compatible fix with kokkos@4.0 (cherry picked from commit 6bbbe6a274690274551e9029d962cf189ceaf035) --- batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 7a93309e65..f413ba612c 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -94,9 +94,11 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, case BaseKokkosBatchedAlgos::KK_SERIAL: case BaseHeuristicAlgos::SQUARE: case BaseTplAlgos::ARMPL: +#if KOKKOS_VERSION > 40099 assert(A.rank_dynamic() == 3 && "AViewType must have rank 3."); assert(B.rank_dynamic() == 3 && "BViewType must have rank 3."); assert(C.rank_dynamic() == 3 && "CViewType must have rank 3."); +#endif break; default: std::ostringstream os; From b6a2db921bcf583a116f495a4e35d3bd4849e9a1 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 20 Jun 2023 16:18:29 -0600 Subject: 
[PATCH 442/442] Update master_history.txt --- master_history.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/master_history.txt b/master_history.txt index 6c9f253c07..e7ed75b7f0 100644 --- a/master_history.txt +++ b/master_history.txt @@ -21,3 +21,4 @@ tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa tag: 3.7.01 date: 12/01/2022 master: 04821ac3 release: 6cb632b6 tag: 4.0.00 date: 02/23/2023 master: b4014bf2 release: a10dff20 tag: 4.0.01 date: 04/26/2023 master: b9c1bab7 release: 8809e41c +tag: 4.1.00 date: 06/20/2023 master: 1331baf1 release: 14ad220a