Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a Domain decomposition Matrix format #1719

Draft
wants to merge 18 commits into
base: dd_base
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/cuda_hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set(CUDA_HIP_SOURCES
base/device_matrix_data_kernels.cpp
base/index_set_kernels.cpp
components/prefix_sum_kernels.cpp
distributed/dd_matrix_kernels.cpp
distributed/index_map_kernels.cpp
distributed/matrix_kernels.cpp
distributed/partition_helpers_kernels.cpp
Expand Down
136 changes: 136 additions & 0 deletions common/cuda_hip/distributed/dd_matrix_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/distributed/dd_matrix_kernels.hpp"

#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/transform_reduce.h>
#include <thrust/unique.h>

#include <ginkgo/core/base/exception_helpers.hpp>

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/atomic.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace distributed_dd_matrix {


/**
 * Collects the global row and column indices of all input entries whose row
 * (respectively column) is not owned by this rank.
 *
 * @param exec  the executor the kernel runs on
 * @param input  the (global) matrix entries in COO-like device storage
 * @param row_partition  global-to-part mapping for row indices
 * @param col_partition  global-to-part mapping for column indices
 * @param local_part  the part id of the calling rank
 * @param non_local_row_idxs  output: global row indices of entries whose row
 *                            is owned by another part (resized here)
 * @param non_local_col_idxs  output: global column indices of entries whose
 *                            column is owned by another part (resized here)
 */
template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void filter_non_owning_idxs(
    std::shared_ptr<const DefaultExecutor> exec,
    const device_matrix_data<ValueType, GlobalIndexType>& input,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        row_partition,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        col_partition,
    comm_index_type local_part, array<GlobalIndexType>& non_local_row_idxs,
    array<GlobalIndexType>& non_local_col_idxs)
{
    auto row_part_ids = row_partition->get_part_ids();
    auto col_part_ids = col_partition->get_part_ids();
    const auto* row_range_bounds = row_partition->get_range_bounds();
    const auto* col_range_bounds = col_partition->get_range_bounds();
    const auto num_row_ranges = row_partition->get_num_ranges();
    const auto num_col_ranges = col_partition->get_num_ranges();
    const auto num_input_elements = input.get_num_stored_elements();

    auto policy = thrust_policy(exec);

    // precompute the row and column range id of each input element
    auto input_row_idxs = input.get_const_row_idxs();
    auto input_col_idxs = input.get_const_col_idxs();
    array<size_type> row_range_ids{exec, num_input_elements};
    thrust::upper_bound(policy, row_range_bounds + 1,
                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
                        input_row_idxs + num_input_elements,
                        row_range_ids.get_data());
    array<size_type> col_range_ids{exec, num_input_elements};
    thrust::upper_bound(policy, col_range_bounds + 1,
                        col_range_bounds + num_col_ranges + 1, input_col_idxs,
                        input_col_idxs + num_input_elements,
                        col_range_ids.get_data());

    // count the non-local row and column indices in a single reduction;
    // tuple element 0 is accumulated from the row part ids, element 1 from
    // the column part ids
    auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple(
        row_range_ids.get_const_data(), col_range_ids.get_const_data()));
    auto num_elements_pair = thrust::transform_reduce(
        policy, range_ids_it, range_ids_it + num_input_elements,
        [local_part, row_part_ids, col_part_ids] __host__ __device__(
            const thrust::tuple<size_type, size_type>& tuple) {
            auto row_part = row_part_ids[thrust::get<0>(tuple)];
            auto col_part = col_part_ids[thrust::get<1>(tuple)];
            bool is_local_row = row_part == local_part;
            bool is_local_col = col_part == local_part;
            return thrust::make_tuple(
                is_local_row ? size_type{0} : size_type{1},
                is_local_col ? size_type{0} : size_type{1});
        },
        thrust::make_tuple(size_type{}, size_type{}),
        [] __host__ __device__(const thrust::tuple<size_type, size_type>& a,
                               const thrust::tuple<size_type, size_type>& b) {
            return thrust::make_tuple(thrust::get<0>(a) + thrust::get<0>(b),
                                      thrust::get<1>(a) + thrust::get<1>(b));
        });
    // FIX: the two counts were previously swapped, sizing each output array
    // with the other one's count
    auto n_non_local_row_idxs = thrust::get<0>(num_elements_pair);
    auto n_non_local_col_idxs = thrust::get<1>(num_elements_pair);

    // gather the global indices of all entries owned by other parts; the
    // zip iterator serves as stencil so the predicates can look up part ids
    non_local_col_idxs.resize_and_reset(n_non_local_col_idxs);
    non_local_row_idxs.resize_and_reset(n_non_local_row_idxs);
    thrust::copy_if(policy, input_col_idxs, input_col_idxs + num_input_elements,
                    range_ids_it, non_local_col_idxs.get_data(),
                    [local_part, col_part_ids] __host__ __device__(
                        const thrust::tuple<size_type, size_type>& tuple) {
                        auto col_part = col_part_ids[thrust::get<1>(tuple)];
                        return col_part != local_part;
                    });
    thrust::copy_if(policy, input_row_idxs, input_row_idxs + num_input_elements,
                    range_ids_it, non_local_row_idxs.get_data(),
                    [local_part, row_part_ids] __host__ __device__(
                        const thrust::tuple<size_type, size_type>& tuple) {
                        auto row_part = row_part_ids[thrust::get<0>(tuple)];
                        return row_part != local_part;
                    });
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_FILTER_NON_OWNING_IDXS);


} // namespace distributed_dd_matrix
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
84 changes: 84 additions & 0 deletions common/cuda_hip/distributed/index_map_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,90 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);


/**
 * Maps indices that are local to this rank back to their global indices.
 *
 * Depending on the index space `is`, each input id is interpreted as an index
 * into the locally owned indices (`local`), into the received remote indices
 * (`non_local`), or into the concatenation of both (`combined`).
 * Out-of-bounds input ids are mapped to invalid_index<GlobalIndexType>().
 */
template <typename LocalIndexType, typename GlobalIndexType>
void map_to_global(
    std::shared_ptr<const DefaultExecutor> exec,
    device_partition<const LocalIndexType, const GlobalIndexType> partition,
    device_segmented_array<const GlobalIndexType> remote_global_idxs,
    experimental::distributed::comm_index_type rank,
    const array<LocalIndexType>& local_ids,
    experimental::distributed::index_space is,
    array<GlobalIndexType>& global_ids)
{
    auto range_bounds = partition.offsets_begin;
    auto starting_indices = partition.starting_indices_begin;
    const auto& ranges_by_part = partition.ranges_by_part;
    auto local_ids_it = local_ids.get_const_data();
    auto input_size = local_ids.get_size();

    auto policy = thrust_policy(exec);

    // one output global id per input local id
    global_ids.resize_and_reset(local_ids.get_size());
    auto global_ids_it = global_ids.get_data();

    // maps an index into the locally owned index space to its global id
    auto map_local = [rank, ranges_by_part, range_bounds, starting_indices,
                      partition] __device__(auto lid) {
        auto local_size =
            static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

        // ids outside [0, local_size) have no global counterpart
        if (lid < 0 || lid >= local_size) {
            return invalid_index<GlobalIndexType>();
        }

        auto local_ranges = ranges_by_part.get_segment(rank);
        auto local_ranges_size =
            static_cast<int64>(local_ranges.end - local_ranges.begin);

        // locate the range of this part that contains lid, using the
        // project's binary_search helper over the ranges' starting indices
        auto it = binary_search(int64(0), local_ranges_size, [=](const auto i) {
            return starting_indices[local_ranges.begin[i]] >= lid;
        });
        // clamp when the search ran off the end, so the last range is used
        auto local_range_id =
            it != local_ranges_size ? it : max(int64(0), it - 1);
        auto range_id = local_ranges.begin[local_range_id];

        // offset inside the range + global begin of the range
        return static_cast<GlobalIndexType>(lid - starting_indices[range_id]) +
               range_bounds[range_id];
    };
    // maps an index into the received remote indices to its global id
    auto map_non_local = [remote_global_idxs] __device__(auto lid) {
        auto remote_size = static_cast<LocalIndexType>(
            remote_global_idxs.flat_end - remote_global_idxs.flat_begin);

        if (lid < 0 || lid >= remote_size) {
            return invalid_index<GlobalIndexType>();
        }

        // remote global ids are stored explicitly: direct lookup
        return remote_global_idxs.flat_begin[lid];
    };
    // combined space: [0, local_size) behaves like `local`, everything
    // beyond is shifted into the non-local space
    auto map_combined = [map_local, map_non_local, partition,
                         rank] __device__(auto lid) {
        auto local_size =
            static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

        if (lid < local_size) {
            return map_local(lid);
        } else {
            return map_non_local(lid - local_size);
        }
    };

    if (is == experimental::distributed::index_space::local) {
        thrust::transform(policy, local_ids_it, local_ids_it + input_size,
                          global_ids_it, map_local);
    }
    if (is == experimental::distributed::index_space::non_local) {
        thrust::transform(policy, local_ids_it, local_ids_it + input_size,
                          global_ids_it, map_non_local);
    }
    if (is == experimental::distributed::index_space::combined) {
        thrust::transform(policy, local_ids_it, local_ids_it + input_size,
                          global_ids_it, map_combined);
    }
}

GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL);


} // namespace index_map
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
Expand Down
114 changes: 114 additions & 0 deletions common/cuda_hip/distributed/matrix_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/atomic.hpp"
#include "common/unified/base/kernel_launch.hpp"
#include "core/components/fill_array_kernels.hpp"
#include "core/components/format_conversion_kernels.hpp"
#include "core/components/prefix_sum_kernels.hpp"


namespace gko {
Expand Down Expand Up @@ -49,6 +53,116 @@ struct input_type {
};


/**
 * Counts, per target part, how many input entries have a row that is not
 * owned by `local_part`, and computes the scatter information needed to pack
 * those entries into send buffers (consumed by fill_send_buffers).
 *
 * On output:
 * - send_count[p] is the number of entries to send to part p (0 for the
 *   local part),
 * - original_positions holds, grouped by owning part, each entry's index in
 *   the input (-1 for locally owned entries),
 * - send_positions holds the prefix sum over the "has to be sent" indicator,
 *   i.e. each entry's offset in the send buffer.
 */
template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void count_non_owning_entries(
    std::shared_ptr<const DefaultExecutor> exec,
    const device_matrix_data<ValueType, GlobalIndexType>& input,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        row_partition,
    comm_index_type local_part, array<comm_index_type>& send_count,
    array<GlobalIndexType>& send_positions,
    array<GlobalIndexType>& original_positions)
{
    auto row_part_ids = row_partition->get_part_ids();
    const auto* row_range_bounds = row_partition->get_range_bounds();
    // NOTE(review): unused in this kernel — consider removing
    const auto* row_range_starting_indices =
        row_partition->get_range_starting_indices();
    const auto num_row_ranges = row_partition->get_num_ranges();
    const auto num_input_elements = input.get_num_stored_elements();

    auto policy = thrust_policy(exec);

    // precompute the row and column range id of each input element
    auto input_row_idxs = input.get_const_row_idxs();
    array<size_type> row_range_ids{exec, num_input_elements};
    thrust::upper_bound(policy, row_range_bounds + 1,
                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
                        input_row_idxs + num_input_elements,
                        row_range_ids.get_data());

    // translate range ids into owning part ids, and flag locally owned
    // entries with -1 (everything else keeps its input position)
    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids,
                      auto part_ids_per_entry, auto orig_positions) {
            part_ids_per_entry[i] = part_ids[range_ids[i]];
            orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i;
        },
        num_input_elements, local_part, row_part_ids, row_range_ids.get_data(),
        row_part_ids_per_entry.get_data(), original_positions.get_data());

    // group the original positions by owning part; the stable sort keeps the
    // relative input order within each part
    thrust::stable_sort_by_key(
        policy, row_part_ids_per_entry.get_data(),
        row_part_ids_per_entry.get_data() + num_input_elements,
        original_positions.get_data());
    // indicator: 1 for entries that have to be sent away, 0 otherwise
    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) {
            s_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
        },
        num_input_elements, original_positions.get_const_data(),
        send_positions.get_data());

    // prefix sum turns the indicator into offsets into the send buffer
    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
                                       num_input_elements);
    size_type num_parts = row_partition->get_num_parts();
    array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
    components::fill_array(exec, row_part_ptrs.get_data(), num_parts + 1,
                           zero<comm_index_type>());

    // compress the sorted part ids into ptrs: entries belonging to part p
    // occupy [row_part_ptrs[p], row_part_ptrs[p + 1])
    components::convert_idxs_to_ptrs(
        exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
        num_parts, row_part_ptrs.get_data());

    // per-part send counts; a rank never sends to itself
    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
            count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
        },
        num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);


/**
 * Scatters every non-owned input entry into the send buffers.
 *
 * Entries whose position in `original_positions` is negative are skipped
 * (those are owned locally); every other entry is copied from its original
 * input position to the send-buffer offset given by `send_positions`.
 *
 * @param exec  the executor the kernel runs on
 * @param input  the (global) matrix entries in COO-like device storage
 * @param row_partition  unused here, kept for a uniform kernel interface
 * @param local_part  unused here, kept for a uniform kernel interface
 * @param send_positions  destination offset per input entry
 * @param original_positions  source index per entry, or a negative value for
 *                            locally owned entries
 * @param send_row_idxs  output: row indices of the entries to send
 * @param send_col_idxs  output: column indices of the entries to send
 * @param send_values  output: values of the entries to send
 */
template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void fill_send_buffers(
    std::shared_ptr<const DefaultExecutor> exec,
    const device_matrix_data<ValueType, GlobalIndexType>& input,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        row_partition,
    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
    const array<GlobalIndexType>& original_positions,
    array<GlobalIndexType>& send_row_idxs,
    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
{
    const auto num_entries = input.get_num_stored_elements();

    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto src_rows, auto src_cols, auto src_vals,
                      auto src_pos, auto dst_pos, auto dst_rows, auto dst_cols,
                      auto dst_vals) {
            const auto src = src_pos[i];
            // negative source positions mark locally owned entries
            if (src >= 0) {
                const auto dst = dst_pos[i];
                dst_rows[dst] = src_rows[src];
                dst_cols[dst] = src_cols[src];
                dst_vals[dst] = src_vals[src];
            }
        },
        num_entries, input.get_const_row_idxs(), input.get_const_col_idxs(),
        input.get_const_values(), original_positions.get_const_data(),
        send_positions.get_const_data(), send_row_idxs.get_data(),
        send_col_idxs.get_data(), send_values.get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_FILL_SEND_BUFFERS);


template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void separate_local_nonlocal(
std::shared_ptr<const DefaultExecutor> exec,
Expand Down
Loading
Loading