Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a Domain decomposition Matrix format #1719

Draft
wants to merge 18 commits into
base: dd_base
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common/cuda_hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ set(CUDA_HIP_SOURCES
base/device_matrix_data_kernels.cpp
base/index_set_kernels.cpp
components/prefix_sum_kernels.cpp
distributed/dd_matrix_kernels.cpp
distributed/index_map_kernels.cpp
distributed/matrix_kernels.cpp
distributed/partition_helpers_kernels.cpp
Expand Down
136 changes: 136 additions & 0 deletions common/cuda_hip/distributed/dd_matrix_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "core/distributed/dd_matrix_kernels.hpp"

#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/transform_reduce.h>
#include <thrust/unique.h>

#include <ginkgo/core/base/exception_helpers.hpp>

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/atomic.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace distributed_dd_matrix {


/**
 * Collects the global row and column indices of all input entries whose row
 * (respectively column) is not owned by this rank.
 *
 * @param exec  the executor the kernel runs on
 * @param input  the (global) matrix entries in COO-like device storage
 * @param row_partition  global-to-part mapping for row indices
 * @param col_partition  global-to-part mapping for column indices
 * @param local_part  the part id of the calling rank
 * @param non_local_row_idxs  output: global row indices of entries whose row
 *                            is owned by another part (resized here)
 * @param non_local_col_idxs  output: global column indices of entries whose
 *                            column is owned by another part (resized here)
 */
template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void filter_non_owning_idxs(
    std::shared_ptr<const DefaultExecutor> exec,
    const device_matrix_data<ValueType, GlobalIndexType>& input,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        row_partition,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        col_partition,
    comm_index_type local_part, array<GlobalIndexType>& non_local_row_idxs,
    array<GlobalIndexType>& non_local_col_idxs)
{
    auto row_part_ids = row_partition->get_part_ids();
    auto col_part_ids = col_partition->get_part_ids();
    const auto* row_range_bounds = row_partition->get_range_bounds();
    const auto* col_range_bounds = col_partition->get_range_bounds();
    const auto num_row_ranges = row_partition->get_num_ranges();
    const auto num_col_ranges = col_partition->get_num_ranges();
    const auto num_input_elements = input.get_num_stored_elements();

    auto policy = thrust_policy(exec);

    // precompute the row and column range id of each input element
    auto input_row_idxs = input.get_const_row_idxs();
    auto input_col_idxs = input.get_const_col_idxs();
    array<size_type> row_range_ids{exec, num_input_elements};
    thrust::upper_bound(policy, row_range_bounds + 1,
                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
                        input_row_idxs + num_input_elements,
                        row_range_ids.get_data());
    array<size_type> col_range_ids{exec, num_input_elements};
    thrust::upper_bound(policy, col_range_bounds + 1,
                        col_range_bounds + num_col_ranges + 1, input_col_idxs,
                        input_col_idxs + num_input_elements,
                        col_range_ids.get_data());

    // count the non-local row and column indices in a single reduction;
    // tuple element 0 is accumulated from the row part ids, element 1 from
    // the column part ids
    auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple(
        row_range_ids.get_const_data(), col_range_ids.get_const_data()));
    auto num_elements_pair = thrust::transform_reduce(
        policy, range_ids_it, range_ids_it + num_input_elements,
        [local_part, row_part_ids, col_part_ids] __host__ __device__(
            const thrust::tuple<size_type, size_type>& tuple) {
            auto row_part = row_part_ids[thrust::get<0>(tuple)];
            auto col_part = col_part_ids[thrust::get<1>(tuple)];
            bool is_local_row = row_part == local_part;
            bool is_local_col = col_part == local_part;
            return thrust::make_tuple(
                is_local_row ? size_type{0} : size_type{1},
                is_local_col ? size_type{0} : size_type{1});
        },
        thrust::make_tuple(size_type{}, size_type{}),
        [] __host__ __device__(const thrust::tuple<size_type, size_type>& a,
                               const thrust::tuple<size_type, size_type>& b) {
            return thrust::make_tuple(thrust::get<0>(a) + thrust::get<0>(b),
                                      thrust::get<1>(a) + thrust::get<1>(b));
        });
    // FIX: the two counts were previously swapped, sizing each output array
    // with the other one's count
    auto n_non_local_row_idxs = thrust::get<0>(num_elements_pair);
    auto n_non_local_col_idxs = thrust::get<1>(num_elements_pair);

    // gather the global indices of all entries owned by other parts; the
    // zip iterator serves as stencil so the predicates can look up part ids
    non_local_col_idxs.resize_and_reset(n_non_local_col_idxs);
    non_local_row_idxs.resize_and_reset(n_non_local_row_idxs);
    thrust::copy_if(policy, input_col_idxs, input_col_idxs + num_input_elements,
                    range_ids_it, non_local_col_idxs.get_data(),
                    [local_part, col_part_ids] __host__ __device__(
                        const thrust::tuple<size_type, size_type>& tuple) {
                        auto col_part = col_part_ids[thrust::get<1>(tuple)];
                        return col_part != local_part;
                    });
    thrust::copy_if(policy, input_row_idxs, input_row_idxs + num_input_elements,
                    range_ids_it, non_local_row_idxs.get_data(),
                    [local_part, row_part_ids] __host__ __device__(
                        const thrust::tuple<size_type, size_type>& tuple) {
                        auto row_part = row_part_ids[thrust::get<0>(tuple)];
                        return row_part != local_part;
                    });
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_FILTER_NON_OWNING_IDXS);


} // namespace distributed_dd_matrix
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
84 changes: 84 additions & 0 deletions common/cuda_hip/distributed/index_map_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,90 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
GKO_DECLARE_INDEX_MAP_MAP_TO_LOCAL);


/**
 * Maps indices that are local to this rank back to their global indices.
 *
 * Depending on the index space `is`, each input id is interpreted as an index
 * into the locally owned indices (`local`), into the received remote indices
 * (`non_local`), or into the concatenation of both (`combined`).
 * Out-of-bounds input ids are mapped to invalid_index<GlobalIndexType>().
 */
template <typename LocalIndexType, typename GlobalIndexType>
void map_to_global(
    std::shared_ptr<const DefaultExecutor> exec,
    device_partition<const LocalIndexType, const GlobalIndexType> partition,
    device_segmented_array<const GlobalIndexType> remote_global_idxs,
    experimental::distributed::comm_index_type rank,
    const array<LocalIndexType>& local_ids,
    experimental::distributed::index_space is,
    array<GlobalIndexType>& global_ids)
{
    auto range_bounds = partition.offsets_begin;
    auto starting_indices = partition.starting_indices_begin;
    const auto& ranges_by_part = partition.ranges_by_part;
    auto local_ids_it = local_ids.get_const_data();
    auto input_size = local_ids.get_size();

    auto policy = thrust_policy(exec);

    // one output global id per input local id
    global_ids.resize_and_reset(local_ids.get_size());
    auto global_ids_it = global_ids.get_data();

    // maps an index into the locally owned index space to its global id
    auto map_local = [rank, ranges_by_part, range_bounds, starting_indices,
                      partition] __device__(auto lid) {
        auto local_size =
            static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

        // ids outside [0, local_size) have no global counterpart
        if (lid < 0 || lid >= local_size) {
            return invalid_index<GlobalIndexType>();
        }

        auto local_ranges = ranges_by_part.get_segment(rank);
        auto local_ranges_size =
            static_cast<int64>(local_ranges.end - local_ranges.begin);

        // locate the range of this part that contains lid, using the
        // project's binary_search helper over the ranges' starting indices
        auto it = binary_search(int64(0), local_ranges_size, [=](const auto i) {
            return starting_indices[local_ranges.begin[i]] >= lid;
        });
        // clamp when the search ran off the end, so the last range is used
        auto local_range_id =
            it != local_ranges_size ? it : max(int64(0), it - 1);
        auto range_id = local_ranges.begin[local_range_id];

        // offset inside the range + global begin of the range
        return static_cast<GlobalIndexType>(lid - starting_indices[range_id]) +
               range_bounds[range_id];
    };
    // maps an index into the received remote indices to its global id
    auto map_non_local = [remote_global_idxs] __device__(auto lid) {
        auto remote_size = static_cast<LocalIndexType>(
            remote_global_idxs.flat_end - remote_global_idxs.flat_begin);

        if (lid < 0 || lid >= remote_size) {
            return invalid_index<GlobalIndexType>();
        }

        // remote global ids are stored explicitly: direct lookup
        return remote_global_idxs.flat_begin[lid];
    };
    // combined space: [0, local_size) behaves like `local`, everything
    // beyond is shifted into the non-local space
    auto map_combined = [map_local, map_non_local, partition,
                         rank] __device__(auto lid) {
        auto local_size =
            static_cast<LocalIndexType>(partition.part_sizes_begin[rank]);

        if (lid < local_size) {
            return map_local(lid);
        } else {
            return map_non_local(lid - local_size);
        }
    };

    if (is == experimental::distributed::index_space::local) {
        thrust::transform(policy, local_ids_it, local_ids_it + input_size,
                          global_ids_it, map_local);
    }
    if (is == experimental::distributed::index_space::non_local) {
        thrust::transform(policy, local_ids_it, local_ids_it + input_size,
                          global_ids_it, map_non_local);
    }
    if (is == experimental::distributed::index_space::combined) {
        thrust::transform(policy, local_ids_it, local_ids_it + input_size,
                          global_ids_it, map_combined);
    }
}

GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_INDEX_MAP_MAP_TO_GLOBAL);


} // namespace index_map
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
Expand Down
114 changes: 114 additions & 0 deletions common/cuda_hip/distributed/matrix_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/components/atomic.hpp"
#include "common/unified/base/kernel_launch.hpp"
#include "core/components/fill_array_kernels.hpp"
#include "core/components/format_conversion_kernels.hpp"
#include "core/components/prefix_sum_kernels.hpp"


namespace gko {
Expand Down Expand Up @@ -49,6 +53,116 @@ struct input_type {
};


/**
 * Counts, per target part, how many input entries have a row that is not
 * owned by `local_part`, and computes the scatter information needed to pack
 * those entries into send buffers (consumed by fill_send_buffers).
 *
 * On output:
 * - send_count[p] is the number of entries to send to part p (0 for the
 *   local part),
 * - original_positions holds, grouped by owning part, each entry's index in
 *   the input (-1 for locally owned entries),
 * - send_positions holds the prefix sum over the "has to be sent" indicator,
 *   i.e. each entry's offset in the send buffer.
 */
template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void count_non_owning_entries(
    std::shared_ptr<const DefaultExecutor> exec,
    const device_matrix_data<ValueType, GlobalIndexType>& input,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        row_partition,
    comm_index_type local_part, array<comm_index_type>& send_count,
    array<GlobalIndexType>& send_positions,
    array<GlobalIndexType>& original_positions)
{
    auto row_part_ids = row_partition->get_part_ids();
    const auto* row_range_bounds = row_partition->get_range_bounds();
    // NOTE(review): unused in this kernel — consider removing
    const auto* row_range_starting_indices =
        row_partition->get_range_starting_indices();
    const auto num_row_ranges = row_partition->get_num_ranges();
    const auto num_input_elements = input.get_num_stored_elements();

    auto policy = thrust_policy(exec);

    // precompute the row and column range id of each input element
    auto input_row_idxs = input.get_const_row_idxs();
    array<size_type> row_range_ids{exec, num_input_elements};
    thrust::upper_bound(policy, row_range_bounds + 1,
                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
                        input_row_idxs + num_input_elements,
                        row_range_ids.get_data());

    // translate range ids into owning part ids, and flag locally owned
    // entries with -1 (everything else keeps its input position)
    array<comm_index_type> row_part_ids_per_entry{exec, num_input_elements};
    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto part_id, auto part_ids, auto range_ids,
                      auto part_ids_per_entry, auto orig_positions) {
            part_ids_per_entry[i] = part_ids[range_ids[i]];
            orig_positions[i] = part_ids_per_entry[i] == part_id ? -1 : i;
        },
        num_input_elements, local_part, row_part_ids, row_range_ids.get_data(),
        row_part_ids_per_entry.get_data(), original_positions.get_data());

    // group the original positions by owning part; the stable sort keeps the
    // relative input order within each part
    thrust::stable_sort_by_key(
        policy, row_part_ids_per_entry.get_data(),
        row_part_ids_per_entry.get_data() + num_input_elements,
        original_positions.get_data());
    // indicator: 1 for entries that have to be sent away, 0 otherwise
    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto orig_positions, auto s_positions) {
            s_positions[i] = orig_positions[i] >= 0 ? 1 : 0;
        },
        num_input_elements, original_positions.get_const_data(),
        send_positions.get_data());

    // prefix sum turns the indicator into offsets into the send buffer
    components::prefix_sum_nonnegative(exec, send_positions.get_data(),
                                       num_input_elements);
    size_type num_parts = row_partition->get_num_parts();
    array<comm_index_type> row_part_ptrs{exec, num_parts + 1};
    components::fill_array(exec, row_part_ptrs.get_data(), num_parts + 1,
                           zero<comm_index_type>());

    // compress the sorted part ids into ptrs: entries belonging to part p
    // occupy [row_part_ptrs[p], row_part_ptrs[p + 1])
    components::convert_idxs_to_ptrs(
        exec, row_part_ids_per_entry.get_const_data(), num_input_elements,
        num_parts, row_part_ptrs.get_data());

    // per-part send counts; a rank never sends to itself
    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto part_id, auto part_ptrs, auto count) {
            count[i] = i == part_id ? 0 : part_ptrs[i + 1] - part_ptrs[i];
        },
        num_parts, local_part, row_part_ptrs.get_data(), send_count.get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_COUNT_NON_OWNING_ENTRIES);


/**
 * Scatters every non-owned input entry into the send buffers.
 *
 * Entries whose position in `original_positions` is negative are skipped
 * (those are owned locally); every other entry is copied from its original
 * input position to the send-buffer offset given by `send_positions`.
 *
 * @param exec  the executor the kernel runs on
 * @param input  the (global) matrix entries in COO-like device storage
 * @param row_partition  unused here, kept for a uniform kernel interface
 * @param local_part  unused here, kept for a uniform kernel interface
 * @param send_positions  destination offset per input entry
 * @param original_positions  source index per entry, or a negative value for
 *                            locally owned entries
 * @param send_row_idxs  output: row indices of the entries to send
 * @param send_col_idxs  output: column indices of the entries to send
 * @param send_values  output: values of the entries to send
 */
template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void fill_send_buffers(
    std::shared_ptr<const DefaultExecutor> exec,
    const device_matrix_data<ValueType, GlobalIndexType>& input,
    const experimental::distributed::Partition<LocalIndexType, GlobalIndexType>*
        row_partition,
    comm_index_type local_part, const array<GlobalIndexType>& send_positions,
    const array<GlobalIndexType>& original_positions,
    array<GlobalIndexType>& send_row_idxs,
    array<GlobalIndexType>& send_col_idxs, array<ValueType>& send_values)
{
    const auto num_entries = input.get_num_stored_elements();

    run_kernel(
        exec,
        [] GKO_KERNEL(auto i, auto src_rows, auto src_cols, auto src_vals,
                      auto src_pos, auto dst_pos, auto dst_rows, auto dst_cols,
                      auto dst_vals) {
            const auto src = src_pos[i];
            // negative source positions mark locally owned entries
            if (src >= 0) {
                const auto dst = dst_pos[i];
                dst_rows[dst] = src_rows[src];
                dst_cols[dst] = src_cols[src];
                dst_vals[dst] = src_vals[src];
            }
        },
        num_entries, input.get_const_row_idxs(), input.get_const_col_idxs(),
        input.get_const_values(), original_positions.get_const_data(),
        send_positions.get_const_data(), send_row_idxs.get_data(),
        send_col_idxs.get_data(), send_values.get_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
    GKO_DECLARE_FILL_SEND_BUFFERS);


template <typename ValueType, typename LocalIndexType, typename GlobalIndexType>
void separate_local_nonlocal(
std::shared_ptr<const DefaultExecutor> exec,
Expand Down
Loading
Loading