Adds fabric handle and memory protection flags to cuda_async_memory_resource (#1743)

This PR adds a new `fabric` handle type to `allocation_handle_type`. It also adds an optional `access_flags` parameter that sets the memory access desired when exporting (`none` or `read_write`). Pools that are not meant to be shareable should omit these flags.

Please note that I can't add a unit test that actually exports or imports these fabric handles, because it would require a system setup that does not appear to be portable.
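
As an illustration (not part of the PR itself), here is a minimal sketch of constructing the resource with the new parameters, mirroring the tests added below; the pool sizes are arbitrary, and construction is expected to fail on systems without fabric support:

```cpp
#include <rmm/mr/device/cuda_async_memory_resource.hpp>

int main()
{
  using async_mr = rmm::mr::cuda_async_memory_resource;

  // Pool whose allocations can be exported through fabric handles and whose
  // memory is marked read-write accessible, as needed when sharing it.
  async_mr mr{/*initial_pool_size=*/1u << 20,
              /*release_threshold=*/1u << 26,
              async_mr::allocation_handle_type::fabric,
              async_mr::access_flags::read_write};

  void* ptr = mr.allocate(1024);
  mr.deallocate(ptr, 1024);
  return 0;
}
```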

Authors:
  - Alessandro Bellina (https://github.com/abellina)

Approvers:
  - Rong Ou (https://github.com/rongou)
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: #1743
abellina authored Dec 7, 2024
1 parent fc9c138 commit 83a8971
Showing 2 changed files with 69 additions and 5 deletions.
36 changes: 31 additions & 5 deletions include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -53,14 +53,31 @@ class cuda_async_memory_resource final : public device_memory_resource {
* memory pools (CUDA 11.2) did not support these flags, so we need a placeholder that can be
* used consistently in the constructor of `cuda_async_memory_resource` with all versions of
* CUDA >= 11.2. See the `cudaMemAllocationHandleType` docs at
* https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html and ensure the enum
* values are kept in sync with the CUDA documentation.
*/
enum class allocation_handle_type {
none = 0x0, ///< Does not allow any export mechanism.
posix_file_descriptor = 0x1, ///< Allows a file descriptor to be used for exporting. Permitted
///< only on POSIX systems.
win32 = 0x2, ///< Allows a Win32 NT handle to be used for exporting. (HANDLE)
win32_kmt = 0x4, ///< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE)
fabric = 0x8 ///< Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t)
};

/**
* @brief Flags for specifying the memory pool accessibility from other devices.
*
* @note These values are exact copies from `cudaMemAccessFlags`. See the `cudaMemAccessFlags`
* docs at https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html and ensure the
* enum values are kept in sync with the CUDA documentation. The default, `none`, marks the pool's
* memory as private to the device in which it was created. `read_write` should only be used if
* memory sharing among devices is required. Note that there is a `cudaMemAccessFlagsProtRead`
* documented, but memory pools don't support read-only access, so it has been omitted.
*/
enum class access_flags {
none = 0, ///< Default, make pool not accessible.
read_write = 3 ///< Make pool read-write accessible.
};

/**
@@ -77,13 +94,16 @@ class cuda_async_memory_resource final : public device_memory_resource {
* @param release_threshold Optional release threshold size in bytes of the pool. If no value is
* provided, the release threshold is set to the total amount of memory on the current device.
* @param export_handle_type Optional `cudaMemAllocationHandleType` that allocations from this
* resource should support interprocess communication (IPC). Default is `cudaMemHandleTypeNone`
* for no IPC support.
* @param access_flag Optional `cudaMemAccessFlags` that controls pool memory accessibility
* from other devices. Default is `cudaMemAccessFlagsProtNone` for no accessibility.
*/
// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
cuda_async_memory_resource(std::optional<std::size_t> initial_pool_size = {},
std::optional<std::size_t> release_threshold = {},
std::optional<allocation_handle_type> export_handle_type = {},
std::optional<access_flags> access_flag = {})
{
// Check if cudaMallocAsync Memory pool supported
RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),
@@ -115,6 +135,12 @@
cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));
}

if (access_flag) {
cudaMemAccessDesc desc = {.location = pool_props.location,
.flags = static_cast<cudaMemAccessFlags>(*access_flag)};
RMM_CUDA_TRY(cudaMemPoolSetAccess(pool_handle(), &desc, 1));
}

auto const [free, total] = rmm::available_device_memory();

// Need an l-value to take address to pass to cudaMemPoolSetAttribute
38 changes: 38 additions & 0 deletions tests/mr/device/cuda_async_mr_tests.cpp
@@ -66,5 +66,43 @@ TEST_F(AsyncMRTest, DifferentPoolsUnequal)
EXPECT_FALSE(mr1.is_equal(mr2));
}

class AsyncMRFabricTest : public AsyncMRTest {
void SetUp() override
{
AsyncMRTest::SetUp();

auto handle_type = static_cast<cudaMemAllocationHandleType>(
rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric);
if (!rmm::detail::runtime_async_alloc::is_export_handle_type_supported(handle_type)) {
GTEST_SKIP() << "Fabric handles are not supported in this environment. Skipping test.";
}
}
};

TEST_F(AsyncMRFabricTest, FabricHandlesSupport)
{
const auto pool_init_size{100};
const auto pool_release_threshold{1000};
cuda_async_mr mr{pool_init_size,
pool_release_threshold,
rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric};
void* ptr = mr.allocate(pool_init_size);
mr.deallocate(ptr, pool_init_size);
RMM_CUDA_TRY(cudaDeviceSynchronize());
}

TEST_F(AsyncMRFabricTest, FabricHandlesSupportReadWriteShareable)
{
const auto pool_init_size{100};
const auto pool_release_threshold{1000};
cuda_async_mr mr{pool_init_size,
pool_release_threshold,
rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric,
rmm::mr::cuda_async_memory_resource::access_flags::read_write};
void* ptr = mr.allocate(pool_init_size);
mr.deallocate(ptr, pool_init_size);
RMM_CUDA_TRY(cudaDeviceSynchronize());
}

} // namespace
} // namespace rmm::test
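
The export/import path that the description above says cannot be unit-tested would, on a fabric-capable system, look roughly like the sketch below. This is not part of the change: the resource `mr`, the `pool_handle()` accessor, and the out-of-band transfer between processes are assumptions here, and the calls are the CUDA runtime's generic memory-pool export/import APIs rather than anything added by this PR.

```cpp
// Exporting process (sketch only): share the pool and one allocation from it.
cudaMemFabricHandle_t pool_export{};
RMM_CUDA_TRY(cudaMemPoolExportToShareableHandle(
  &pool_export, mr.pool_handle(), cudaMemHandleTypeFabric, 0));

void* ptr = mr.allocate(1024);
cudaMemPoolPtrExportData ptr_export{};
RMM_CUDA_TRY(cudaMemPoolExportPointer(&ptr_export, ptr));
// ... send pool_export and ptr_export to the importing process out of band ...

// Importing process (sketch only): map the pool, then the pointer.
cudaMemPool_t imported_pool{};
RMM_CUDA_TRY(cudaMemPoolImportFromShareableHandle(
  &imported_pool, &pool_export, cudaMemHandleTypeFabric, 0));

void* imported_ptr{};
RMM_CUDA_TRY(cudaMemPoolImportPointer(&imported_ptr, imported_pool, &ptr_export));
// Reads and writes through imported_ptr are possible because the pool was
// created with access_flags::read_write.
RMM_CUDA_TRY(cudaFree(imported_ptr));  // releases the imported mapping
```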
