From 83a897155e40ab52c4876bfcf9425105be748688 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@gmail.com>
Date: Sat, 7 Dec 2024 12:09:43 -0600
Subject: [PATCH] Adds fabric handle and memory protection flags to
 cuda_async_memory_resource (#1743)

This PR adds a new `fabric` handle type in `allocation_handle_type`. It also adds an optional `access_flags` to set the memory access desired when exporting (`prot_none`, or `prot_read_write`). Pools that are not meant to be shareable should omit these flags.

Please note that I can't add a unit test that exports or imports these fabric handles, because it would require system setup that doesn't look to be portable.

Authors:
  - Alessandro Bellina (https://github.com/abellina)

Approvers:
  - Rong Ou (https://github.com/rongou)
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/rmm/pull/1743
---
 .../mr/device/cuda_async_memory_resource.hpp  | 36 +++++++++++++++---
 tests/mr/device/cuda_async_mr_tests.cpp       | 38 +++++++++++++++++++
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp
index b1fc0b112..dbb324a1b 100644
--- a/include/rmm/mr/device/cuda_async_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -53,14 +53,31 @@ class cuda_async_memory_resource final : public device_memory_resource {
    * memory pools (CUDA 11.2) did not support these flags, so we need a placeholder that can be
    * used consistently in the constructor of `cuda_async_memory_resource` with all versions of
    * CUDA >= 11.2. See the `cudaMemAllocationHandleType` docs at
-   * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
+   * https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html and ensure the enum
+   * values are kept in sync with the CUDA documentation.
    */
   enum class allocation_handle_type {
     none                  = 0x0,  ///< Does not allow any export mechanism.
     posix_file_descriptor = 0x1,  ///< Allows a file descriptor to be used for exporting. Permitted
                                   ///< only on POSIX systems.
     win32     = 0x2,              ///< Allows a Win32 NT handle to be used for exporting. (HANDLE)
-    win32_kmt = 0x4  ///< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE)
+    win32_kmt = 0x4,  ///< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE)
+    fabric    = 0x8   ///< Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t)
+  };
+
+  /**
+   * @brief Flags for specifying the memory pool accessibility from other devices.
+   *
+   * @note These values are exact copies from `cudaMemAccessFlags`. See the `cudaMemAccessFlags`
+   * docs at https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html and ensure the
+   * enum values are kept in sync with the CUDA documentation. The default, `none`, marks the pool's
+   * memory as private to the device in which it was created. `read_write` should only be used if
+   * memory sharing among devices is required. Note that there is a `cudaMemAccessFlagsProtRead`
+   * documented, but memory pools don't support read-only access, so it has been omitted.
+   */
+  enum class access_flags {
+    none       = 0,  ///< Default, make pool not accessible.
+    read_write = 3   ///< Make pool read-write accessible.
   };
 
   /**
@@ -77,13 +94,16 @@ class cuda_async_memory_resource final : public device_memory_resource {
    * @param release_threshold Optional release threshold size in bytes of the pool. If no value is
    * provided, the release threshold is set to the total amount of memory on the current device.
    * @param export_handle_type Optional `cudaMemAllocationHandleType` that allocations from this
-   * resource should support interprocess communication (IPC). Default is
-   * `cudaMemHandleTypeNone` for no IPC support.
+   * resource should support interprocess communication (IPC). Default is `cudaMemHandleTypeNone`
+   * for no IPC support.
+   * @param access_flag Optional `cudaMemAccessFlags` that controls pool memory accessibility
+   * from other devices. Default is `cudaMemAccessFlagsProtNone` for no accessibility.
    */
   // NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
   cuda_async_memory_resource(std::optional<std::size_t> initial_pool_size             = {},
                              std::optional<std::size_t> release_threshold             = {},
-                             std::optional<allocation_handle_type> export_handle_type = {})
+                             std::optional<allocation_handle_type> export_handle_type = {},
+                             std::optional<access_flags> access_flag                  = {})
   {
     // Check if cudaMallocAsync Memory pool supported
     RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),
@@ -115,6 +135,12 @@ class cuda_async_memory_resource final : public device_memory_resource {
         cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));
     }
 
+    if (access_flag) {
+      cudaMemAccessDesc desc = {.location = pool_props.location,
+                                .flags    = static_cast<cudaMemAccessFlags>(*access_flag)};
+      RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle(), &desc, 1));
+    }
+
     auto const [free, total] = rmm::available_device_memory();
 
     // Need an l-value to take address to pass to cudaMemPoolSetAttribute
diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp
index a39188548..167adf1e1 100644
--- a/tests/mr/device/cuda_async_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_mr_tests.cpp
@@ -66,5 +66,43 @@ TEST_F(AsyncMRTest, DifferentPoolsUnequal)
   EXPECT_FALSE(mr1.is_equal(mr2));
 }
 
+class AsyncMRFabricTest : public AsyncMRTest {
+  void SetUp() override
+  {
+    AsyncMRTest::SetUp();
+
+    auto handle_type = static_cast<cudaMemAllocationHandleType>(
+      rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric);
+    if (!rmm::detail::runtime_async_alloc::is_export_handle_type_supported(handle_type)) {
+      GTEST_SKIP() << "Fabric handles are not supported in this environment. Skipping test.";
+    }
+  }
+};
+
+TEST_F(AsyncMRFabricTest, FabricHandlesSupport)
+{
+  const auto pool_init_size{100};
+  const auto pool_release_threshold{1000};
+  cuda_async_mr mr{pool_init_size,
+                   pool_release_threshold,
+                   rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric};
+  void* ptr = mr.allocate(pool_init_size);
+  mr.deallocate(ptr, pool_init_size);
+  RMM_CUDA_TRY(cudaDeviceSynchronize());
+}
+
+TEST_F(AsyncMRFabricTest, FabricHandlesSupportReadWriteShareable)
+{
+  const auto pool_init_size{100};
+  const auto pool_release_threshold{1000};
+  cuda_async_mr mr{pool_init_size,
+                   pool_release_threshold,
+                   rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric,
+                   rmm::mr::cuda_async_memory_resource::access_flags::read_write};
+  void* ptr = mr.allocate(pool_init_size);
+  mr.deallocate(ptr, pool_init_size);
+  RMM_CUDA_TRY(cudaDeviceSynchronize());
+}
+
 }  // namespace
 }  // namespace rmm::test