From 929a595e639b61b583cc584b1c291f9559cef673 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 19 Nov 2024 09:51:53 -0500 Subject: [PATCH] Make `cudaMallocAsync` logic non-optional as we require CUDA 11.2+ (#1667) We can remove the optimizations around `CUDA_STATIC_RUNTIME` and instead see if the function is already in the process space so that RMM doesn't need to have any build context to run properly Fixes #1679 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1667 --- CMakeLists.txt | 1 - .../multi_stream_allocations_bench.cu | 6 - .../random_allocations/random_allocations.cpp | 6 - include/rmm/detail/dynamic_load_runtime.hpp | 191 ------------------ include/rmm/detail/runtime_async_alloc.hpp | 79 ++++++++ .../mr/device/cuda_async_memory_resource.hpp | 54 ++--- .../cuda_async_view_memory_resource.hpp | 32 +-- python/rmm/rmm/tests/test_rmm.py | 34 +--- tests/CMakeLists.txt | 17 +- tests/mr/device/cuda_async_mr_tests.cpp | 17 +- tests/mr/device/cuda_async_view_mr_tests.cpp | 13 +- .../mr/device/mr_ref_multithreaded_tests.cpp | 16 +- tests/mr/device/mr_ref_test.hpp | 2 +- tests/mr/device/mr_ref_tests.cpp | 4 - tests/mr/device/thrust_allocator_tests.cu | 16 +- tests/prefetch_tests.cpp | 2 +- 16 files changed, 131 insertions(+), 359 deletions(-) delete mode 100644 include/rmm/detail/dynamic_load_runtime.hpp create mode 100644 include/rmm/detail/runtime_async_alloc.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 44d7fbb79..07bd368ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,7 +89,6 @@ target_include_directories(rmm INTERFACE "$Apply(benchmark_range); return; } -#endif if (name == "pool") { BENCHMARK_CAPTURE(BM_MultiStreamAllocations, pool_mr, &make_pool) // @@ -248,9 +244,7 @@ int main(int argc, char** argv) resource_names.emplace_back(args["resource"].as()); } else { resource_names.emplace_back("cuda"); -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT resource_names.emplace_back("cuda_async"); -#endif resource_names.emplace_back("pool"); resource_names.emplace_back("arena"); resource_names.emplace_back("binning"); diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 57116743b..2971f7e40 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -316,9 +316,7 @@ int main(int argc, char** argv) std::map const funcs({{"arena", &make_arena}, {"binning", &make_binning}, {"cuda", &make_cuda}, -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT {"cuda_async", &make_cuda_async}, -#endif {"pool", &make_pool}}); auto resource = args["resource"].as(); @@ -340,11 +338,7 @@ int main(int argc, char** argv) std::string mr_name = args["resource"].as(); declare_benchmark(mr_name); } else { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT std::vector mrs{"pool", "binning", "arena", "cuda_async", "cuda"}; -#else - std::vector mrs{"pool", "binning", "arena", "cuda"}; -#endif std::for_each( std::cbegin(mrs), std::cend(mrs), [](auto const& mr) { declare_benchmark(mr); }); } diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp deleted file mode 100644 index 214228752..000000000 --- a/include/rmm/detail/dynamic_load_runtime.hpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -#include - -#include -#include - -namespace RMM_NAMESPACE { -namespace detail { - -/** - * @brief `dynamic_load_runtime` loads the cuda runtime library at runtime - * - * By loading the cudart library at runtime we can use functions that - * are added in newer minor versions of the cuda runtime. - */ -struct dynamic_load_runtime { - static void* get_cuda_runtime_handle() - { - auto close_cudart = [](void* handle) { ::dlclose(handle); }; - auto open_cudart = []() { - ::dlerror(); - const int major = CUDART_VERSION / 1000; - - // In CUDA 12 the SONAME is correctly defined as libcudart.12, but for - // CUDA<=11 it includes an extra 0 minor version e.g. libcudart.11.0. We - // also allow finding the linker name. - const std::string libname_ver_cuda_11 = "libcudart.so." + std::to_string(major) + ".0"; - const std::string libname_ver_cuda_12 = "libcudart.so." + std::to_string(major); - const std::string libname = "libcudart.so"; - - void* ptr = nullptr; - for (auto&& name : {libname_ver_cuda_12, libname_ver_cuda_11, libname}) { - ptr = dlopen(name.c_str(), RTLD_LAZY); - if (ptr != nullptr) break; - } - - if (ptr != nullptr) { return ptr; } - - RMM_FAIL("Unable to dlopen cudart"); - }; - static std::unique_ptr cudart_handle{open_cudart(), close_cudart}; - return cudart_handle.get(); - } - - template - using function_sig = std::add_pointer_t; - - template - static std::optional function(const char* func_name) - { - auto* runtime = get_cuda_runtime_handle(); - auto* handle = ::dlsym(runtime, func_name); - if (!handle) { return std::nullopt; } - auto* function_ptr = reinterpret_cast(handle); - return std::optional(function_ptr); - } -}; - -#if defined(RMM_STATIC_CUDART) -// clang-format off -#define RMM_CUDART_API_WRAPPER(name, signature) \ - template \ - static cudaError_t name(Args... args) \ - { \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Waddress\"") \ - static_assert(static_cast(::name), \ - "Failed to find #name function with arguments #signature"); \ - _Pragma("GCC diagnostic pop") \ - return ::name(args...); \ - } -// clang-format on -#else -#define RMM_CUDART_API_WRAPPER(name, signature) \ - template \ - static cudaError_t name(Args... args) \ - { \ - static const auto func = dynamic_load_runtime::function(#name); \ - if (func) { return (*func)(args...); } \ - RMM_FAIL("Failed to find #name function in libcudart.so"); \ - } -#endif - -#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync -/** - * @brief Bind to the stream-ordered memory allocator functions - * at runtime. - * - * This allows RMM users to compile/link against CUDA 11.2+ and run with - * < CUDA 11.2 runtime as these functions are found at call time. - */ -struct async_alloc { - static bool is_supported() - { -#if defined(RMM_STATIC_CUDART) - static bool runtime_supports_pool = (CUDART_VERSION >= 11020); -#else - static bool runtime_supports_pool = - dynamic_load_runtime::function>( - "cudaFreeAsync") - .has_value(); -#endif - - static auto driver_supports_pool{[] { - int cuda_pool_supported{}; - auto result = cudaDeviceGetAttribute(&cuda_pool_supported, - cudaDevAttrMemoryPoolsSupported, - rmm::get_current_cuda_device().value()); - return result == cudaSuccess and cuda_pool_supported == 1; - }()}; - return runtime_supports_pool and driver_supports_pool; - } - - /** - * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present - * CUDA driver/runtime version. - * - * @note This query was introduced in CUDA 11.3 so on CUDA 11.2 this function will only return - * true for `cudaMemHandleTypeNone`. - * - * @param handle_type An IPC export handle type to check for support. - * @return true if supported - * @return false if unsupported - */ - static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type) - { - int supported_handle_types_bitmask{}; -#if CUDART_VERSION >= 11030 // 11.3 introduced cudaDevAttrMemoryPoolSupportedHandleTypes - if (cudaMemHandleTypeNone != handle_type) { - auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask, - cudaDevAttrMemoryPoolSupportedHandleTypes, - rmm::get_current_cuda_device().value()); - - // Don't throw on cudaErrorInvalidValue - auto const unsupported_runtime = (result == cudaErrorInvalidValue); - if (unsupported_runtime) return false; - // throw any other error that may have occurred - RMM_CUDA_TRY(result); - } - -#endif - return (supported_handle_types_bitmask & handle_type) == handle_type; - } - - template - using cudart_sig = dynamic_load_runtime::function_sig; - - using cudaMemPoolCreate_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMemPoolCreate, cudaMemPoolCreate_sig); - - using cudaMemPoolSetAttribute_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMemPoolSetAttribute, cudaMemPoolSetAttribute_sig); - - using cudaMemPoolDestroy_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMemPoolDestroy, cudaMemPoolDestroy_sig); - - using cudaMallocFromPoolAsync_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync_sig); - - using cudaFreeAsync_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaFreeAsync, cudaFreeAsync_sig); - - using cudaDeviceGetDefaultMemPool_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaDeviceGetDefaultMemPool, cudaDeviceGetDefaultMemPool_sig); -}; -#endif - -#undef RMM_CUDART_API_WRAPPER -} // namespace detail -} // namespace RMM_NAMESPACE diff --git a/include/rmm/detail/runtime_async_alloc.hpp b/include/rmm/detail/runtime_async_alloc.hpp new file mode 100644 index 000000000..6ddb2228b --- /dev/null +++ b/include/rmm/detail/runtime_async_alloc.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +#include +#include + +namespace RMM_NAMESPACE { +namespace detail { + +/** + * @brief Determine at runtime if the CUDA driver supports the stream-ordered + * memory allocator functions. + * + * This allows RMM users to compile/link against CUDA 11.2+ and run with + * older drivers. + */ + +struct runtime_async_alloc { + static bool is_supported() + { + static auto driver_supports_pool{[] { + int cuda_pool_supported{}; + auto result = cudaDeviceGetAttribute(&cuda_pool_supported, + cudaDevAttrMemoryPoolsSupported, + rmm::get_current_cuda_device().value()); + return result == cudaSuccess and cuda_pool_supported == 1; + }()}; + return driver_supports_pool; + } + + /** + * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present + * CUDA driver/runtime version. + * + * @param handle_type An IPC export handle type to check for support. + * @return true if supported + * @return false if unsupported + */ + static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type) + { + int supported_handle_types_bitmask{}; + if (cudaMemHandleTypeNone != handle_type) { + auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask, + cudaDevAttrMemoryPoolSupportedHandleTypes, + rmm::get_current_cuda_device().value()); + + // Don't throw on cudaErrorInvalidValue + auto const unsupported_runtime = (result == cudaErrorInvalidValue); + if (unsupported_runtime) return false; + // throw any other error that may have occurred + RMM_CUDA_TRY(result); + } + return (supported_handle_types_bitmask & handle_type) == handle_type; + } +}; + +} // namespace detail +} // namespace RMM_NAMESPACE diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp index 52fd2fe4e..b1fc0b112 100644 --- a/include/rmm/mr/device/cuda_async_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp @@ -17,9 +17,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -31,12 +31,6 @@ #include #include -#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync -#ifndef RMM_DISABLE_CUDA_MALLOC_ASYNC -#define RMM_CUDA_MALLOC_ASYNC_SUPPORT -#endif -#endif - namespace RMM_NAMESPACE { namespace mr { /** @@ -91,9 +85,8 @@ class cuda_async_memory_resource final : public device_memory_resource { std::optional release_threshold = {}, std::optional export_handle_type = {}) { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT // Check if cudaMallocAsync Memory pool supported - RMM_EXPECTS(rmm::detail::async_alloc::is_supported(), + RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(), "cudaMallocAsync not supported with this CUDA driver/runtime version"); // Construct explicit pool @@ -101,12 +94,13 @@ class cuda_async_memory_resource final : public device_memory_resource { pool_props.allocType = cudaMemAllocationTypePinned; pool_props.handleTypes = static_cast( export_handle_type.value_or(allocation_handle_type::none)); - RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes), - "Requested IPC memory handle type not supported"); + RMM_EXPECTS( + rmm::detail::runtime_async_alloc::is_export_handle_type_supported(pool_props.handleTypes), + "Requested IPC memory handle type not supported"); pool_props.location.type = cudaMemLocationTypeDevice; pool_props.location.id = rmm::get_current_cuda_device().value(); cudaMemPool_t cuda_pool_handle{}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props)); + RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props)); pool_ = cuda_async_view_memory_resource{cuda_pool_handle}; // CUDA drivers before 11.5 have known incompatibilities with the async allocator. @@ -117,41 +111,34 @@ class cuda_async_memory_resource final : public device_memory_resource { constexpr auto min_async_version{11050}; if (driver_version < min_async_version) { int disabled{0}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute( - pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled)); + RMM_CUDA_TRY( + cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled)); } auto const [free, total] = rmm::available_device_memory(); // Need an l-value to take address to pass to cudaMemPoolSetAttribute uint64_t threshold = release_threshold.value_or(total); - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute( - pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold)); + RMM_CUDA_TRY( + cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold)); // Allocate and immediately deallocate the initial_pool_size to prime the pool with the // specified size auto const pool_size = initial_pool_size.value_or(free / 2); auto* ptr = do_allocate(pool_size, cuda_stream_default); do_deallocate(ptr, pool_size, cuda_stream_default); -#else - RMM_FAIL( - "cudaMallocAsync not supported by the version of the CUDA Toolkit used for this build"); -#endif } -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Returns the underlying native handle to the CUDA pool * + * @return cudaMemPool_t Handle to the underlying CUDA pool */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } -#endif ~cuda_async_memory_resource() override { -#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) - RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle())); -#endif + RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle())); } cuda_async_memory_resource(cuda_async_memory_resource const&) = delete; cuda_async_memory_resource(cuda_async_memory_resource&&) = delete; @@ -159,9 +146,7 @@ class cuda_async_memory_resource final : public device_memory_resource { cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete; private: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT cuda_async_view_memory_resource pool_{}; -#endif /** * @brief Allocates memory of size at least \p bytes. @@ -175,12 +160,7 @@ class cuda_async_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { void* ptr{nullptr}; -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT ptr = pool_.allocate(bytes, stream); -#else - (void)bytes; - (void)stream; -#endif return ptr; } @@ -194,13 +174,7 @@ class cuda_async_memory_resource final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT pool_.deallocate(ptr, bytes, stream); -#else - (void)ptr; - (void)bytes; - (void)stream; -#endif } /** @@ -213,11 +187,7 @@ class cuda_async_memory_resource final : public device_memory_resource { [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { auto const* async_mr = dynamic_cast(&other); -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle()); -#else - return async_mr != nullptr; -#endif } }; diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 3e1900e72..180c412ee 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -28,10 +27,6 @@ #include #include -#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync -#define RMM_CUDA_MALLOC_ASYNC_SUPPORT -#endif - namespace RMM_NAMESPACE { namespace mr { /** @@ -46,13 +41,12 @@ namespace mr { */ class cuda_async_view_memory_resource final : public device_memory_resource { public: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Constructs a cuda_async_view_memory_resource which uses an existing CUDA memory pool. * The provided pool is not owned by cuda_async_view_memory_resource and must remain valid * during the lifetime of the memory resource. * - * @throws rmm::runtime_error if the CUDA version does not support `cudaMallocAsync` + * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync` * * @param valid_pool_handle Handle to a CUDA memory pool which will be used to * serve allocation requests. @@ -71,15 +65,13 @@ class cuda_async_view_memory_resource final : public device_memory_resource { RMM_EXPECTS(result == cudaSuccess && cuda_pool_supported, "cudaMallocAsync not supported with this CUDA driver/runtime version"); } -#endif -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Returns the underlying native handle to the CUDA pool * + * @return cudaMemPool_t Handle to the underlying CUDA pool */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; } -#endif cuda_async_view_memory_resource() = default; cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = @@ -92,9 +84,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { default; ///< @default_move_assignment{cuda_async_view_memory_resource} private: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT cudaMemPool_t cuda_pool_handle_{}; -#endif /** * @brief Allocates memory of size at least \p bytes. @@ -108,15 +98,9 @@ class cuda_async_view_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { void* ptr{nullptr}; -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT if (bytes > 0) { - RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync( - &ptr, bytes, pool_handle(), stream.value())); + RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value())); } -#else - (void)bytes; - (void)stream; -#endif return ptr; } @@ -132,15 +116,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { [[maybe_unused]] std::size_t bytes, rmm::cuda_stream_view stream) override { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT - if (ptr != nullptr) { - RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value())); - } -#else - (void)ptr; - (void)bytes; - (void)stream; -#endif + if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); } } /** diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index b52ea0179..d7d692287 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -32,12 +32,6 @@ cuda.set_memory_manager(RMMNumbaManager) -_driver_version = rmm._cuda.gpu.driverGetVersion() -_runtime_version = rmm._cuda.gpu.runtimeGetVersion() -_CUDAMALLOC_ASYNC_SUPPORTED = (_driver_version >= 11020) and ( - _runtime_version >= 11020 -) - _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( cudart.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess, rmm._cuda.gpu.getDevice(), @@ -657,10 +651,6 @@ def test_mr_upstream_lifetime(): del pool_mr -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) @@ -671,15 +661,11 @@ def test_cuda_async_memory_resource(dtype, nelem, alloc): array_tester(dtype, nelem, alloc) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) def test_cuda_async_memory_resource_ipc(): # TODO: We don't have a great way to check if IPC is supported in Python, # without using the C++ function - # rmm::detail::async_alloc::is_export_handle_type_supported. We can't - # accurately test driver and runtime versions for this via Python because + # rmm::detail::runtime_async_alloc::is_export_handle_type_supported. + # We can't accurately test this via Python because # cuda-python always has the IPC handle enum defined (which normally # requires a CUDA 11.3 runtime) and the cuda-compat package in Docker # containers prevents us from assuming that the driver we see actually @@ -702,10 +688,6 @@ def test_cuda_async_memory_resource_ipc(): assert rmm.mr.get_current_device_resource_type() is type(mr) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("nelems", _nelems) def test_cuda_async_memory_resource_stream(nelems): # test that using CudaAsyncMemoryResource @@ -719,10 +701,6 @@ def test_cuda_async_memory_resource_stream(nelems): np.testing.assert_equal(expected, result) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) def test_cuda_async_memory_resource_threshold(nelem, alloc): @@ -739,13 +717,7 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): "mr", [ rmm.mr.CudaMemoryResource, - pytest.param( - rmm.mr.CudaAsyncMemoryResource, - marks=pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", - ), - ), + pytest.param(rmm.mr.CudaAsyncMemoryResource), ], ) def test_limiting_resource_adaptor(mr): diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a482c8cc1..476028af0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -84,7 +84,7 @@ endfunction() function(ConfigureTest TEST_NAME) set(options) - set(one_value GPUS PERCENT) + set(one_value CUDART GPUS PERCENT) set(multi_value) cmake_parse_arguments(_RMM_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _RMM_TEST_GPUS AND NOT DEFINED _RMM_TEST_PERCENT) @@ -98,13 +98,23 @@ function(ConfigureTest TEST_NAME) set(_RMM_TEST_PERCENT 100) endif() + if(_RMM_TEST_CUDART STREQUAL SHARED) + set(cudart_link_libs $ CUDA::cudart) + elseif(_RMM_TEST_CUDART STREQUAL STATIC) + set(cudart_link_libs $ CUDA::cudart_static) + else() + set(cudart_link_libs rmm) + endif() + # Test with legacy default stream. ConfigureTestInternal(${TEST_NAME} ${_RMM_TEST_UNPARSED_ARGUMENTS}) + target_link_libraries(${TEST_NAME} ${cudart_link_libs}) # Test with per-thread default stream. string(REGEX REPLACE "_TEST$" "_PTDS_TEST" PTDS_TEST_NAME "${TEST_NAME}") ConfigureTestInternal("${PTDS_TEST_NAME}" ${_RMM_TEST_UNPARSED_ARGUMENTS}) target_compile_definitions("${PTDS_TEST_NAME}" PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + target_link_libraries(${PTDS_TEST_NAME} ${cudart_link_libs}) foreach(name ${TEST_NAME} ${PTDS_TEST_NAME} ${NS_TEST_NAME}) rapids_test_add( @@ -130,7 +140,10 @@ ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp GPUS 1 PERCENT 100) # cuda_async mr tests -ConfigureTest(CUDA_ASYNC_MR_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60) +ConfigureTest(CUDA_ASYNC_MR_STATIC_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60 + CUDART STATIC) +ConfigureTest(CUDA_ASYNC_MR_SHARED_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60 + CUDART SHARED) # thrust allocator tests ConfigureTest(THRUST_ALLOCATOR_TEST mr/device/thrust_allocator_tests.cu GPUS 1 PERCENT 60) diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp index 90c7b0ff9..a39188548 100644 --- a/tests/mr/device/cuda_async_mr_tests.cpp +++ b/tests/mr/device/cuda_async_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,24 +31,13 @@ class AsyncMRTest : public ::testing::Test { protected: void SetUp() override { - if (!rmm::detail::async_alloc::is_supported()) { + if (!rmm::detail::runtime_async_alloc::is_supported()) { GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA " << "driver/runtime version"; } } }; -TEST_F(AsyncMRTest, ThrowIfNotSupported) -{ - auto construct_mr = []() { cuda_async_mr mr; }; -#ifndef RMM_CUDA_MALLOC_ASYNC_SUPPORT - EXPECT_THROW(construct_mr(), rmm::logic_error); -#else - EXPECT_NO_THROW(construct_mr()); -#endif -} - -#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) TEST_F(AsyncMRTest, ExplicitInitialPoolSize) { const auto pool_init_size{100}; @@ -77,7 +66,5 @@ TEST_F(AsyncMRTest, DifferentPoolsUnequal) EXPECT_FALSE(mr1.is_equal(mr2)); } -#endif - } // namespace } // namespace rmm::test diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index fe82431a9..f3a02cbf0 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,13 +29,10 @@ using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource; static_assert(cuda::mr::resource_with); static_assert(cuda::mr::async_resource_with); -#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) - TEST(PoolTest, UsePool) { cudaMemPool_t memPool{}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool( - &memPool, rmm::get_current_cuda_device().value())); + RMM_CUDA_TRY(cudaDeviceGetDefaultMemPool(&memPool, rmm::get_current_cuda_device().value())); const auto pool_init_size{100}; cuda_async_view_mr mr{memPool}; @@ -53,7 +50,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool) cudaMemPool_t memPool{}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&memPool, &poolProps)); + RMM_CUDA_TRY(cudaMemPoolCreate(&memPool, &poolProps)); { const auto pool_init_size{100}; @@ -64,7 +61,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool) } auto destroy_valid_pool = [&]() { - auto result = rmm::detail::async_alloc::cudaMemPoolDestroy(memPool); + auto result = cudaMemPoolDestroy(memPool); RMM_EXPECTS(result == cudaSuccess, "Pool wrapper did destroy pool"); }; @@ -81,7 +78,5 @@ TEST(PoolTest, ThrowIfNullptrPool) EXPECT_THROW(construct_mr(), rmm::logic_error); } -#endif - } // namespace } // namespace rmm::test diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp index 944ba1807..9e7c8c2e8 100644 --- a/tests/mr/device/mr_ref_multithreaded_tests.cpp +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -36,17 +36,11 @@ namespace { struct mr_ref_test_mt : public mr_ref_test {}; -INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests, - mr_ref_test_mt, - ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT - "CUDA_Async", -#endif - "Managed", - "Pool", - "Arena", - "Binning"), - [](auto const& info) { return info.param; }); +INSTANTIATE_TEST_CASE_P( + MultiThreadResourceTests, + mr_ref_test_mt, + ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"), + [](auto const& info) { return info.param; }); template void spawn_n(std::size_t num_threads, Task task, Arguments&&... args) diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp index 6e63b3838..2af0eff44 100644 --- a/tests/mr/device/mr_ref_test.hpp +++ b/tests/mr/device/mr_ref_test.hpp @@ -347,7 +347,7 @@ inline auto make_host_pinned() { return std::make_shared(); } return std::shared_ptr{nullptr}; diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp index 55e91d765..41af050a0 100644 --- a/tests/mr/device/mr_ref_tests.cpp +++ b/tests/mr/device/mr_ref_tests.cpp @@ -30,9 +30,7 @@ namespace { INSTANTIATE_TEST_SUITE_P(ResourceTests, mr_ref_test, ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT "CUDA_Async", -#endif "Managed", "System", "Pool", @@ -46,9 +44,7 @@ INSTANTIATE_TEST_SUITE_P(ResourceTests, INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests, mr_ref_allocation_test, ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT "CUDA_Async", -#endif "Managed", "System" "Pool", diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu index 84f599957..46447aa09 100644 --- a/tests/mr/device/thrust_allocator_tests.cu +++ b/tests/mr/device/thrust_allocator_tests.cu @@ -69,17 +69,11 @@ TEST_P(allocator_test, multi_device) }()); } -INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests, - allocator_test, - ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT - "CUDA_Async", -#endif - "Managed", - "Pool", - "Arena", - "Binning"), - [](auto const& info) { return info.param; }); +INSTANTIATE_TEST_CASE_P( + ThrustAllocatorTests, + allocator_test, + ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"), + [](auto const& info) { return info.param; }); } // namespace } // namespace rmm::test diff --git a/tests/prefetch_tests.cpp b/tests/prefetch_tests.cpp index 6c7bb2dd3..4a2c41a2b 100644 --- a/tests/prefetch_tests.cpp +++ b/tests/prefetch_tests.cpp @@ -53,8 +53,8 @@ struct PrefetchTest : public ::testing::Test { // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8048f6ea5ad77917444567656c140c5a // specifically for when cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation is // used. - constexpr size_t prefetch_data_size = 4; if constexpr (std::is_same_v) { + constexpr size_t prefetch_data_size = 4; int prefetch_location{0}; RMM_CUDA_TRY( cudaMemRangeGetAttribute(&prefetch_location,