Adapt to size_type change in the Accessor
Thomas Grützmacher committed Feb 1, 2022
1 parent 1db0470 commit f3b2631
Showing 10 changed files with 73 additions and 68 deletions.
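The commit has a single theme: replacing hard-coded `std::size_t` indices with the index type exported by the benchmark's `matrix_info` (and, inside the kernels, by the Ginkgo Accessor). A minimal sketch of the aliasing pattern being adopted; the concrete definition of `size_type` is an assumption here, inferred from the `std::int64_t` casts in `trsv_kernels.cuh` below:

```cpp
#include <array>
#include <cstdint>

// Hypothetical reconstruction of the relevant part of matrix_info; the
// alias is assumed to be a signed 64-bit type, matching the
// static_cast<std::int64_t> calls that appear in trsv_kernels.cuh.
struct matrix_info {
    using size_type = std::int64_t;
    std::array<size_type, 2> size;
    size_type stride{size[1]};  // assumed default: row-major stride
};

// Call sites pull in the alias once instead of spelling out a type:
using size_type = matrix_info::size_type;

int main()
{
    matrix_info x_info{{100, 1}};       // stride defaults to size[1]
    size_type max_size{x_info.size[0]};
    return max_size == 100 ? 0 : 1;
}
```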
49 changes: 25 additions & 24 deletions cuda/dot_benchmark.cu
@@ -21,13 +21,14 @@ int main(int argc, char **argv)
{
using ar_type = double;
using st_type = float;
+ using size_type = matrix_info::size_type;

- constexpr std::size_t min_size{1'000'000};
- constexpr std::size_t default_max_size{535 * 1000 * 1000};
+ constexpr size_type min_size{1'000'000};
+ constexpr size_type default_max_size{535 * 1000 * 1000};
constexpr char DELIM{';'};

bool detailed_error{false};
- std::size_t max_size{default_max_size};
+ size_type max_size{default_max_size};

const std::string use_error_string("--error");
const std::string set_size_string("--size");
@@ -84,7 +85,7 @@ int main(int argc, char **argv)
return static_cast<ar_type>(st_data.get_result());
};

- constexpr std::size_t benchmark_reference{0};
+ constexpr size_type benchmark_reference{0};
using benchmark_info_t =
std::tuple<std::string, std::function<void(matrix_info, matrix_info)>,
std::function<ar_type()>>;
@@ -139,7 +140,7 @@ int main(int argc, char **argv)
st_data.gpu_y(), st_data.gpu_res());
},
st_get_result}};
- const std::size_t benchmark_num{benchmark_info.size()};
+ const size_type benchmark_num{static_cast<size_type>(benchmark_info.size())};


std::cout << "Vector Size";
@@ -163,32 +164,32 @@
};

// Number of elements of a vector at the start of the benchmark
- const std::size_t start = std::min(max_size, min_size);
+ const size_type start = std::min(max_size, min_size);
// Increase in number of elements between consecutive benchmark runs
- constexpr std::size_t row_incr = 2'000'000;
+ constexpr size_type row_incr = 2'000'000;
// Number of benchmark runs (ignoring randomization)
- const std::size_t steps =
+ const size_type steps =
(max_size < start) ? 0 : (max_size - start) / row_incr;
// Number of benchmark restarts with a different randomization for vectors
// Only used for a detailed error run
- constexpr std::size_t max_randomize_num{10};
+ constexpr size_type max_randomize_num{10};

- std::vector<std::size_t> benchmark_vec_size((steps + 1));
+ std::vector<size_type> benchmark_vec_size((steps + 1));
std::vector<double> benchmark_time((steps + 1) * benchmark_num);
// std::vector<ar_type> benchmark_error((steps + 1) * benchmark_num);
// stores the result for all different benchmark runs to compute the error
const auto actual_randomize_num = detailed_error ? max_randomize_num : 1;
std::vector<ar_type> raw_result(actual_randomize_num * (steps + 1) *
benchmark_num);
const auto get_raw_idx = [benchmark_num, actual_randomize_num](
- std::size_t rnd, std::size_t step,
- std::size_t bi) {
+ size_type rnd, size_type step,
+ size_type bi) {
return step * actual_randomize_num * benchmark_num +
bi * actual_randomize_num + rnd;
};

// Run all benchmarks and collect the raw data here
- for (std::size_t randomize = 0; randomize < actual_randomize_num;
+ for (size_type randomize = 0; randomize < actual_randomize_num;
++randomize) {
if (randomize != 0) {
write_random({{max_size, 1}}, vector_dist, rengine,
@@ -198,14 +199,14 @@
ar_data.copy_cpu_to_gpu();
st_data.convert_from(ar_data);
}
- for (std::size_t vec_size = start, i = 0; vec_size <= max_size;
+ for (size_type vec_size = start, i = 0; vec_size <= max_size;
vec_size += row_incr, ++i) {
benchmark_vec_size.at(i) = vec_size;
const matrix_info x_info{{vec_size, 1}};
const matrix_info y_info{{vec_size, 1}};

- for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
- const std::size_t idx = i * benchmark_num + bi;
+ for (size_type bi = 0; bi < benchmark_num; ++bi) {
+ const size_type idx = i * benchmark_num + bi;
auto curr_lambda = [&]() {
std::get<1>(benchmark_info[bi])(x_info, y_info);
};
@@ -218,26 +219,26 @@
}

// Print the evaluated results
- for (std::size_t i = 0; i <= steps; ++i) {
+ for (size_type i = 0; i <= steps; ++i) {
if (!detailed_error) {
std::cout << benchmark_vec_size[i];
- for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+ for (size_type bi = 0; bi < benchmark_num; ++bi) {
std::cout << DELIM << benchmark_time[i * benchmark_num + bi];
}
const auto result_ref =
raw_result[get_raw_idx(0, i, benchmark_reference)];
- for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+ for (size_type bi = 0; bi < benchmark_num; ++bi) {
std::cout << DELIM
<< get_error(raw_result[i * benchmark_num + bi],
result_ref);
}
std::cout << '\n';
} else {
std::cout << benchmark_vec_size[i];
- for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+ for (size_type bi = 0; bi < benchmark_num; ++bi) {
// sort and compute the median
std::array<ar_type, max_randomize_num> local_error;
- for (std::size_t rnd = 0; rnd < actual_randomize_num; ++rnd) {
+ for (size_type rnd = 0; rnd < actual_randomize_num; ++rnd) {
const auto result_ref =
raw_result[get_raw_idx(rnd, i, benchmark_reference)];
local_error[rnd] = get_error(
@@ -270,11 +271,11 @@ int main(int argc, char **argv)
std::cout << DELIM << "Result " << std::get<0>(info);
}
std::cout << '\n';
- for (std::size_t i = 0; i <= steps; ++i) {
- for (std::size_t randomize = 0; randomize < actual_randomize_num;
+ for (size_type i = 0; i <= steps; ++i) {
+ for (size_type randomize = 0; randomize < actual_randomize_num;
++randomize) {
std::cout << randomize << DELIM << benchmark_vec_size[i];
- for (std::size_t bi = 0; bi < benchmark_num; ++bi) {
+ for (size_type bi = 0; bi < benchmark_num; ++bi) {
std::cout << DELIM << raw_result[get_raw_idx(randomize, i, bi)];
}
std::cout << '\n';
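As a side note on the loop nest above: `raw_result` keeps all randomizations of one (step, benchmark) pair adjacent. A standalone sketch of the same index computation, with made-up counts:

```cpp
#include <cstdint>
#include <iostream>

using size_type = std::int64_t;  // assumed alias for matrix_info::size_type

int main()
{
    // Hypothetical counts: 2 randomizations, 3 benchmarks.
    constexpr size_type actual_randomize_num{2};
    constexpr size_type benchmark_num{3};
    // Same layout as get_raw_idx in dot_benchmark.cu:
    // step-major, then benchmark, then randomization.
    auto get_raw_idx = [=](size_type rnd, size_type step, size_type bi) {
        return step * actual_randomize_num * benchmark_num +
               bi * actual_randomize_num + rnd;
    };
    // Entries of one (step, benchmark) pair land next to each other:
    std::cout << get_raw_idx(0, 1, 2) << ' '   // prints 10
              << get_raw_idx(1, 1, 2) << '\n'; // prints 11
}
```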
4 changes: 2 additions & 2 deletions cuda/dot_kernels.cuh
@@ -232,8 +232,8 @@ void acc_dot(myBlasHandle *handle, const matrix_info x_info, const StType *x,

// Accessor Setup
constexpr std::size_t dimensionality{2};
- std::array<std::size_t, dimensionality - 1> x_stride{x_info.stride};
- std::array<std::size_t, dimensionality - 1> y_stride{y_info.stride};
+ std::array<gko::acc::size_type, dimensionality - 1> x_stride{x_info.stride};
+ std::array<gko::acc::size_type, dimensionality - 1> y_stride{y_info.stride};

using accessor =
gko::acc::reduced_row_major<dimensionality, ArType, StType>;
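These stride arrays feed the Accessor; a sketch of the full setup under the new types. The header paths and the `range` constructor signature are assumptions based on the standalone Ginkgo Accessor, not code from this repository:

```cpp
#include <array>
#include <cstddef>

#include <accessor/range.hpp>              // assumed header path
#include <accessor/reduced_row_major.hpp>  // assumed header path

template <typename ArType, typename StType>
auto make_vector_range(StType *x, gko::acc::size_type length,
                       gko::acc::size_type stride)
{
    constexpr std::size_t dimensionality{2};
    // The stride array now carries the accessor's own index type
    // instead of std::size_t:
    std::array<gko::acc::size_type, dimensionality - 1> x_stride{stride};
    using accessor =
        gko::acc::reduced_row_major<dimensionality, ArType, StType>;
    // Sizes are expressed in gko::acc::size_type as well
    // (constructor signature assumed):
    return gko::acc::range<accessor>(
        std::array<gko::acc::size_type, dimensionality>{length, 1}, x,
        x_stride);
}
```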
2 changes: 1 addition & 1 deletion cuda/dot_memory.cuh
@@ -34,7 +34,7 @@ public:
* @param engine random engine used to generate the values
*/
template <typename VectDist, typename RndEngine>
- DotMemory(std::size_t size, VectDist &&vect_dist, RndEngine &&engine)
+ DotMemory(matrix_info::size_type size, VectDist &&vect_dist, RndEngine &&engine)
: x_info_{{size, 1}},
y_info_{{size, 1}},
cpu_x_(gen_mtx<ValueType>(x_info_, vect_dist, engine)),
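A hypothetical call site for the changed constructor (the template parameter and generator types are assumptions); the point is that the size now flows into `x_info_`/`y_info_` as `matrix_info::size_type` end to end:

```cpp
#include <random>

#include "dot_memory.cuh"  // repository header providing DotMemory

void example()
{
    using size_type = matrix_info::size_type;
    constexpr size_type vector_size{1'000'000};
    std::default_random_engine engine(42);
    std::uniform_real_distribution<double> vector_dist(-1.0, 1.0);
    // No std::size_t round trip between caller and the matrix_info members:
    DotMemory<double> data(vector_size, vector_dist, engine);
}
```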
15 changes: 8 additions & 7 deletions cuda/gemv_benchmark.cu
@@ -20,15 +20,16 @@ int main(int argc, char **argv)
{
using ar_type = double;
using st_type = float;
+ using size_type = matrix_info::size_type;

constexpr ar_type ar_alpha{1.0};
constexpr ar_type ar_beta{1.0};
constexpr st_type st_alpha{static_cast<st_type>(ar_alpha)};
constexpr st_type st_beta{static_cast<st_type>(ar_beta)};

- constexpr std::size_t default_max_size{24500};
- constexpr std::size_t min_size{100};
- std::size_t max_size{default_max_size};
+ constexpr size_type default_max_size{24500};
+ constexpr size_type min_size{100};
+ size_type max_size{default_max_size};

bool measure_error{false};

@@ -122,7 +123,7 @@ int main(int argc, char **argv)
return error / res_ref_norm;
};

- constexpr std::size_t benchmark_reference{0};
+ constexpr size_type benchmark_reference{0};
using benchmark_info_t =
std::tuple<std::string,
std::function<void(matrix_info, matrix_info, matrix_info)>,
@@ -190,7 +191,7 @@ int main(int argc, char **argv)
},
st_compute_error},
};
- const std::size_t benchmark_num{benchmark_info.size()};
+ const size_type benchmark_num{static_cast<size_type>(benchmark_info.size())};

std::cout << "Num rows";
for (const auto &info : benchmark_info) {
@@ -209,7 +210,7 @@ int main(int argc, char **argv)
std::vector<ar_type> local_res(benchmark_num);
constexpr auto start = min_size;
constexpr auto row_incr = start;
- for (std::size_t num_rows = start; num_rows <= max_size;
+ for (size_type num_rows = start; num_rows <= max_size;
num_rows += row_incr) {
const matrix_info m_info{{num_rows, num_rows}, max_size};
const matrix_info x_info{{num_rows, 1}};
@@ -229,7 +230,7 @@ int main(int argc, char **argv)
ar_data.gpu_res_memory().copy_from(ar_cpu_res_init);
}
}
- for (std::size_t i = 0; i < benchmark_num; ++i) {
+ for (size_type i = 0; i < benchmark_num; ++i) {
auto local_func = [&]() {
std::get<1>(benchmark_info[i])(m_info, x_info, res_info);
};
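Both benchmark drivers now cast `benchmark_info.size()` explicitly. A short standalone illustration of why the cast is needed, assuming a signed `size_type` as above:

```cpp
#include <cstdint>
#include <vector>

using size_type = std::int64_t;  // assumed alias for matrix_info::size_type

int main()
{
    const std::vector<int> benchmark_info{1, 2, 3};
    // .size() returns the unsigned std::size_t; with a signed size_type,
    // braced initialization rejects the narrowing unsigned-to-signed
    // conversion, so the cast has to be spelled out:
    const size_type benchmark_num{
        static_cast<size_type>(benchmark_info.size())};
    size_type sum{};
    for (size_type bi = 0; bi < benchmark_num; ++bi) {
        sum += bi;  // loop index and bound now share one signed type
    }
    return sum == 3 ? 0 : 1;
}
```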
6 changes: 3 additions & 3 deletions cuda/gemv_kernels.cuh
@@ -176,9 +176,9 @@ void acc_gemv(const matrix_info m_info, ArType alpha, const StType *mtx,

// Accessor Setup
constexpr std::size_t dimensionality{2};
- std::array<std::size_t, dimensionality - 1> m_stride{m_info.stride};
- std::array<std::size_t, dimensionality - 1> x_stride{x_info.stride};
- std::array<std::size_t, dimensionality - 1> res_stride{res_info.stride};
+ std::array<gko::acc::size_type, dimensionality - 1> m_stride{m_info.stride};
+ std::array<gko::acc::size_type, dimensionality - 1> x_stride{x_info.stride};
+ std::array<gko::acc::size_type, dimensionality - 1> res_stride{res_info.stride};

using accessor =
gko::acc::reduced_row_major<dimensionality, ArType, StType>;
2 changes: 1 addition & 1 deletion cuda/gemv_memory.cuh
@@ -37,7 +37,7 @@ public:
* @param engine random engine used to generate the values
*/
template <typename MtxDist, typename VectDist, typename RndEngine>
- GemvMemory(std::size_t max_size, MtxDist &&mtx_dist,
+ GemvMemory(matrix_info::size_type max_size, MtxDist &&mtx_dist,
VectDist &&vect_dist, RndEngine &&engine)
: m_info_{{max_size, max_size}},
x_info_{{max_size, 1}},
11 changes: 6 additions & 5 deletions cuda/trsv_benchmark.cu
@@ -21,14 +21,15 @@ int main(int argc, char **argv)
{
using ar_type = double;
using st_type = float;
+ using size_type = matrix_info::size_type;

constexpr tmtx_t t_matrix_type = tmtx_t::upper;
constexpr dmtx_t d_matrix_type = dmtx_t::unit;

- constexpr std::size_t default_max_size{24 * 1000};
- constexpr std::size_t min_size{100};
+ constexpr size_type default_max_size{24 * 1000};
+ constexpr size_type min_size{100};

- std::size_t max_size{default_max_size};
+ auto max_size{default_max_size};
bool measure_error{false};

const std::string use_error_string("--error");
@@ -196,8 +197,8 @@

std::vector<ar_type> local_res(benchmark_num);

- const std::size_t start = std::min(max_size, min_size);
- const std::size_t row_incr = start;
+ const auto start = std::min(max_size, min_size);
+ const auto row_incr = start;

for (auto num_rows = start; num_rows <= max_size; num_rows += row_incr) {
const matrix_info m_info{{num_rows, num_rows}, max_size};
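In this driver the change leans on type deduction instead of repeating the alias: `max_size`, `start`, and `row_incr` all pick up `size_type` from `default_max_size`. A compressed standalone sketch of that pattern (alias assumed as before):

```cpp
#include <algorithm>
#include <cstdint>

using size_type = std::int64_t;  // assumed alias for matrix_info::size_type

int main()
{
    constexpr size_type default_max_size{24 * 1000};
    constexpr size_type min_size{100};

    auto max_size{default_max_size};  // deduced as size_type
    // std::min requires both arguments to have the same type, which the
    // shared alias now guarantees:
    const auto start = std::min(max_size, min_size);
    const auto row_incr = start;
    for (auto num_rows = start; num_rows <= max_size; num_rows += row_incr) {
        // benchmark body elided
    }
}
```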
16 changes: 8 additions & 8 deletions cuda/trsv_kernels.cuh
@@ -86,7 +86,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void lower_trsv(

// stores the triangular system in column major
__shared__ ValueType triang[swarp_size * triang_stride];
- __shared__ std::uint32_t shared_row_block_idx;
+ __shared__ std::int32_t shared_row_block_idx;
__shared__ ValueType x_correction[swarp_size];

const auto group = cg::this_thread_block();
@@ -280,7 +280,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void upper_trsv(

// stores the triangular system in column major
__shared__ ValueType triang[swarp_size * triang_stride];
- __shared__ std::uint32_t shared_row_block_idx;
+ __shared__ std::int32_t shared_row_block_idx;
__shared__ ValueType x_correction[swarp_size];

const auto group = cg::this_thread_block();
@@ -461,7 +461,7 @@ void trsv(const matrix_info m_info, tmtx_t ttype, dmtx_t dtype,
constexpr std::int32_t swarps_per_block{4};
const dim3 block_solve(subwarp_size, swarps_per_block, 1);
const dim3 grid_solve(
- ceildiv(m_info.size[0], static_cast<std::size_t>(subwarp_size)), 1, 1);
+ ceildiv(m_info.size[0], static_cast<std::int64_t>(subwarp_size)), 1, 1);

kernel::trsv_init<<<1, 1>>>(trsv_helper);
if (dtype == dmtx_t::unit) {
@@ -544,7 +544,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void acc_lower_trsv(

// stores the triangular system in column major
__shared__ ar_type triang[swarp_size * triang_stride];
- __shared__ std::uint32_t shared_row_block_idx;
+ __shared__ std::int32_t shared_row_block_idx;
__shared__ ar_type x_correction[swarp_size];

const auto group = cg::this_thread_block();
@@ -742,7 +742,7 @@ __global__ __launch_bounds__(swarps_per_block *swarp_size) void acc_upper_trsv(

// stores the triangular system in column major
__shared__ ar_type triang[swarp_size * triang_stride];
- __shared__ std::uint32_t shared_row_block_idx;
+ __shared__ std::int32_t shared_row_block_idx;
__shared__ ar_type x_correction[swarp_size];

const auto group = cg::this_thread_block();
@@ -922,8 +922,8 @@ void acc_trsv(const matrix_info m_info, tmtx_t ttype, dmtx_t dtype,
{
// Accessor Setup
constexpr std::size_t dimensionality{2};
- std::array<std::size_t, dimensionality - 1> m_stride{m_info.stride};
- std::array<std::size_t, dimensionality - 1> x_stride{x_info.stride};
+ std::array<gko::acc::size_type, dimensionality - 1> m_stride{m_info.stride};
+ std::array<gko::acc::size_type, dimensionality - 1> x_stride{x_info.stride};

using accessor =
gko::acc::reduced_row_major<dimensionality, ArType, StType>;
@@ -936,7 +936,7 @@
constexpr std::int32_t swarps_per_block{4};
const dim3 block_solve(subwarp_size, swarps_per_block, 1);
const dim3 grid_solve(
- ceildiv(m_info.size[0], static_cast<std::size_t>(subwarp_size)), 1, 1);
+ ceildiv(m_info.size[0], static_cast<std::int64_t>(subwarp_size)), 1, 1);

kernel::trsv_init<<<1, 1>>>(trsv_helper);
if (dtype == dmtx_t::unit) {
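Among the changes in this file, the `ceildiv` cast is the subtlest: with `m_info.size[0]` now signed, the divisor is cast to `std::int64_t` so both arguments share one type. A standalone sketch of the arithmetic, with an assumed `ceildiv` definition:

```cpp
#include <cstdint>

// Assumed shape of the helper used above: round-up integer division.
template <typename T>
constexpr T ceildiv(T num, T den)
{
    return (num + den - 1) / den;
}

int main()
{
    const std::int64_t num_rows{24'000};  // m_info.size[0], now signed
    constexpr std::int32_t subwarp_size{32};
    // Casting the divisor keeps both operands std::int64_t, matching the
    // signed size_type instead of the previous std::size_t cast:
    const auto blocks =
        ceildiv(num_rows, static_cast<std::int64_t>(subwarp_size));
    return blocks == 750 ? 0 : 1;
}
```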
4 changes: 2 additions & 2 deletions cuda/trsv_memory.cuh
@@ -108,7 +108,7 @@ public:
* initializing)
*/
template <typename MtxGen, typename VectGen>
- TrsvMemory(std::size_t max_size, MtxGen &&cpu_mtx_gen,
+ TrsvMemory(matrix_info::size_type max_size, MtxGen &&cpu_mtx_gen,
VectGen &&cpu_vect_gen)
: m_info_{{max_size, max_size}},
x_info_{{max_size, 1}},
@@ -139,7 +139,7 @@ public:
const auto pivot_size = std::max(m_info_.size[0], m_info_.size[1]);
Memory<int> cpu_pivot(Memory<int>::Device::cpu, pivot_size);
Memory<int> gpu_pivot(Memory<int>::Device::gpu, pivot_size);
- for (std::size_t i = 0; i < pivot_size; ++i) {
+ for (matrix_info::size_type i = 0; i < pivot_size; ++i) {
cpu_pivot.data()[i] = i;
}
gpu_pivot = cpu_pivot;
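Finally, the pivot loop's index type now matches `pivot_size`, so the comparison no longer mixes signedness; the store into the `int` pivot array still narrows, which is value-safe at these benchmark sizes. A brief standalone sketch:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using size_type = std::int64_t;  // assumed alias for matrix_info::size_type

int main()
{
    const size_type pivot_size{1024};
    std::vector<int> cpu_pivot(static_cast<std::size_t>(pivot_size));
    // Index type matches pivot_size, avoiding a signed/unsigned comparison;
    // the element assignment narrows size_type -> int, which stays in range
    // for the matrix dimensions these benchmarks use.
    for (size_type i = 0; i < pivot_size; ++i) {
        cpu_pivot[static_cast<std::size_t>(i)] = static_cast<int>(i);
    }
    return cpu_pivot.back() == 1023 ? 0 : 1;
}
```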