From 929a595e639b61b583cc584b1c291f9559cef673 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 19 Nov 2024 09:51:53 -0500
Subject: [PATCH] Make `cudaMallocAsync` logic non-optional as we require CUDA
 11.2+ (#1667)

We can remove the optimizations around `CUDA_STATIC_RUNTIME` and instead check whether the function is already in the process space, so that RMM doesn't need any build context to run properly.

Fixes #1679

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/rmm/pull/1667
---
 CMakeLists.txt                                |   1 -
 .../multi_stream_allocations_bench.cu         |   6 -
 .../random_allocations/random_allocations.cpp |   6 -
 include/rmm/detail/dynamic_load_runtime.hpp   | 191 ------------------
 include/rmm/detail/runtime_async_alloc.hpp    |  79 ++++++++
 .../mr/device/cuda_async_memory_resource.hpp  |  54 ++---
 .../cuda_async_view_memory_resource.hpp       |  32 +--
 python/rmm/rmm/tests/test_rmm.py              |  34 +---
 tests/CMakeLists.txt                          |  17 +-
 tests/mr/device/cuda_async_mr_tests.cpp       |  17 +-
 tests/mr/device/cuda_async_view_mr_tests.cpp  |  13 +-
 .../mr/device/mr_ref_multithreaded_tests.cpp  |  16 +-
 tests/mr/device/mr_ref_test.hpp               |   2 +-
 tests/mr/device/mr_ref_tests.cpp              |   4 -
 tests/mr/device/thrust_allocator_tests.cu     |  16 +-
 tests/prefetch_tests.cpp                      |   2 +-
 16 files changed, 131 insertions(+), 359 deletions(-)
 delete mode 100644 include/rmm/detail/dynamic_load_runtime.hpp
 create mode 100644 include/rmm/detail/runtime_async_alloc.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44d7fbb79..07bd368ee 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,7 +89,6 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOUR
 if(CUDA_STATIC_RUNTIME)
   message(STATUS "RMM: Enabling static linking of cudart")
   target_link_libraries(rmm INTERFACE CUDA::cudart_static)
-  target_compile_definitions(rmm INTERFACE RMM_STATIC_CUDART)
 else()
   target_link_libraries(rmm INTERFACE CUDA::cudart)
 endif()
diff --git a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
index 86e761c80..b5edbb536 100644
--- a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
+++ b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
@@ -133,9 +133,7 @@ static void benchmark_range(benchmark::internal::Benchmark* bench)
 MRFactoryFunc get_mr_factory(std::string const& resource_name)
 {
   if (resource_name == "cuda") { return &make_cuda; }
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   if (resource_name == "cuda_async") { return &make_cuda_async; }
-#endif
   if (resource_name == "pool") { return &make_pool; }
   if (resource_name == "arena") { return &make_arena; }
   if (resource_name == "binning") { return &make_binning; }
@@ -153,13 +151,11 @@ void declare_benchmark(std::string const& name)
     return;
   }
 
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   if (name == "cuda_async") {
     BENCHMARK_CAPTURE(BM_MultiStreamAllocations, cuda_async, &make_cuda_async)  //
       ->Apply(benchmark_range);
     return;
   }
-#endif
 
   if (name == "pool") {
     BENCHMARK_CAPTURE(BM_MultiStreamAllocations, pool_mr, &make_pool)  //
@@ -248,9 +244,7 @@ int main(int argc, char** argv)
         resource_names.emplace_back(args["resource"].as<std::string>());
       } else {
         resource_names.emplace_back("cuda");
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
         resource_names.emplace_back("cuda_async");
-#endif
         resource_names.emplace_back("pool");
         resource_names.emplace_back("arena");
         resource_names.emplace_back("binning");
diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp
index 57116743b..2971f7e40 100644
--- a/benchmarks/random_allocations/random_allocations.cpp
+++ b/benchmarks/random_allocations/random_allocations.cpp
@@ -316,9 +316,7 @@ int main(int argc, char** argv)
       std::map<std::string, MRFactoryFunc> const funcs({{"arena", &make_arena},
                                                         {"binning", &make_binning},
                                                         {"cuda", &make_cuda},
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
                                                         {"cuda_async", &make_cuda_async},
-#endif
                                                         {"pool", &make_pool}});
       auto resource = args["resource"].as<std::string>();
 
@@ -340,11 +338,7 @@ int main(int argc, char** argv)
         std::string mr_name = args["resource"].as<std::string>();
         declare_benchmark(mr_name);
       } else {
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
         std::vector<std::string> mrs{"pool", "binning", "arena", "cuda_async", "cuda"};
-#else
-        std::vector<std::string> mrs{"pool", "binning", "arena", "cuda"};
-#endif
         std::for_each(
           std::cbegin(mrs), std::cend(mrs), [](auto const& mr) { declare_benchmark(mr); });
       }
diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp
deleted file mode 100644
index 214228752..000000000
--- a/include/rmm/detail/dynamic_load_runtime.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <rmm/cuda_device.hpp>
-#include <rmm/detail/export.hpp>
-
-#include <cuda_runtime_api.h>
-
-#include <dlfcn.h>
-
-#include <memory>
-#include <optional>
-
-namespace RMM_NAMESPACE {
-namespace detail {
-
-/**
- * @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
- *
- * By loading the cudart library at runtime we can use functions that
- * are added in newer minor versions of the cuda runtime.
- */
-struct dynamic_load_runtime {
-  static void* get_cuda_runtime_handle()
-  {
-    auto close_cudart = [](void* handle) { ::dlclose(handle); };
-    auto open_cudart  = []() {
-      ::dlerror();
-      const int major = CUDART_VERSION / 1000;
-
-      // In CUDA 12 the SONAME is correctly defined as libcudart.12, but for
-      // CUDA<=11 it includes an extra 0 minor version e.g. libcudart.11.0. We
-      // also allow finding the linker name.
-      const std::string libname_ver_cuda_11 = "libcudart.so." + std::to_string(major) + ".0";
-      const std::string libname_ver_cuda_12 = "libcudart.so." + std::to_string(major);
-      const std::string libname             = "libcudart.so";
-
-      void* ptr = nullptr;
-      for (auto&& name : {libname_ver_cuda_12, libname_ver_cuda_11, libname}) {
-        ptr = dlopen(name.c_str(), RTLD_LAZY);
-        if (ptr != nullptr) break;
-      }
-
-      if (ptr != nullptr) { return ptr; }
-
-      RMM_FAIL("Unable to dlopen cudart");
-    };
-    static std::unique_ptr<void, decltype(close_cudart)> cudart_handle{open_cudart(), close_cudart};
-    return cudart_handle.get();
-  }
-
-  template <typename... Args>
-  using function_sig = std::add_pointer_t<cudaError_t(Args...)>;
-
-  template <typename signature>
-  static std::optional<signature> function(const char* func_name)
-  {
-    auto* runtime = get_cuda_runtime_handle();
-    auto* handle  = ::dlsym(runtime, func_name);
-    if (!handle) { return std::nullopt; }
-    auto* function_ptr = reinterpret_cast<signature>(handle);
-    return std::optional<signature>(function_ptr);
-  }
-};
-
-#if defined(RMM_STATIC_CUDART)
-// clang-format off
-#define RMM_CUDART_API_WRAPPER(name, signature)                               \
-  template <typename... Args>                                                 \
-  static cudaError_t name(Args... args)                                       \
-  {                                                                           \
-    _Pragma("GCC diagnostic push")                                            \
-    _Pragma("GCC diagnostic ignored \"-Waddress\"")                           \
-    static_assert(static_cast<signature>(::name),                             \
-                  "Failed to find #name function with arguments #signature"); \
-    _Pragma("GCC diagnostic pop")                                             \
-    return ::name(args...);                                                   \
-  }
-// clang-format on
-#else
-#define RMM_CUDART_API_WRAPPER(name, signature)                                \
-  template <typename... Args>                                                  \
-  static cudaError_t name(Args... args)                                        \
-  {                                                                            \
-    static const auto func = dynamic_load_runtime::function<signature>(#name); \
-    if (func) { return (*func)(args...); }                                     \
-    RMM_FAIL("Failed to find #name function in libcudart.so");                 \
-  }
-#endif
-
-#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
-/**
- * @brief Bind to the stream-ordered memory allocator functions
- * at runtime.
- *
- * This allows RMM users to compile/link against CUDA 11.2+ and run with
- * < CUDA 11.2 runtime as these functions are found at call time.
- */
-struct async_alloc {
-  static bool is_supported()
-  {
-#if defined(RMM_STATIC_CUDART)
-    static bool runtime_supports_pool = (CUDART_VERSION >= 11020);
-#else
-    static bool runtime_supports_pool =
-      dynamic_load_runtime::function<dynamic_load_runtime::function_sig<void*, cudaStream_t>>(
-        "cudaFreeAsync")
-        .has_value();
-#endif
-
-    static auto driver_supports_pool{[] {
-      int cuda_pool_supported{};
-      auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
-                                           cudaDevAttrMemoryPoolsSupported,
-                                           rmm::get_current_cuda_device().value());
-      return result == cudaSuccess and cuda_pool_supported == 1;
-    }()};
-    return runtime_supports_pool and driver_supports_pool;
-  }
-
-  /**
-   * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present
-   * CUDA driver/runtime version.
-   *
-   * @note This query was introduced in CUDA 11.3 so on CUDA 11.2 this function will only return
-   * true for `cudaMemHandleTypeNone`.
-   *
-   * @param handle_type An IPC export handle type to check for support.
-   * @return true if supported
-   * @return false if unsupported
-   */
-  static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type)
-  {
-    int supported_handle_types_bitmask{};
-#if CUDART_VERSION >= 11030  // 11.3 introduced cudaDevAttrMemoryPoolSupportedHandleTypes
-    if (cudaMemHandleTypeNone != handle_type) {
-      auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
-                                                 cudaDevAttrMemoryPoolSupportedHandleTypes,
-                                                 rmm::get_current_cuda_device().value());
-
-      // Don't throw on cudaErrorInvalidValue
-      auto const unsupported_runtime = (result == cudaErrorInvalidValue);
-      if (unsupported_runtime) return false;
-      // throw any other error that may have occurred
-      RMM_CUDA_TRY(result);
-    }
-
-#endif
-    return (supported_handle_types_bitmask & handle_type) == handle_type;
-  }
-
-  template <typename... Args>
-  using cudart_sig = dynamic_load_runtime::function_sig<Args...>;
-
-  using cudaMemPoolCreate_sig = cudart_sig<cudaMemPool_t*, const cudaMemPoolProps*>;
-  RMM_CUDART_API_WRAPPER(cudaMemPoolCreate, cudaMemPoolCreate_sig);
-
-  using cudaMemPoolSetAttribute_sig = cudart_sig<cudaMemPool_t, cudaMemPoolAttr, void*>;
-  RMM_CUDART_API_WRAPPER(cudaMemPoolSetAttribute, cudaMemPoolSetAttribute_sig);
-
-  using cudaMemPoolDestroy_sig = cudart_sig<cudaMemPool_t>;
-  RMM_CUDART_API_WRAPPER(cudaMemPoolDestroy, cudaMemPoolDestroy_sig);
-
-  using cudaMallocFromPoolAsync_sig = cudart_sig<void**, size_t, cudaMemPool_t, cudaStream_t>;
-  RMM_CUDART_API_WRAPPER(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync_sig);
-
-  using cudaFreeAsync_sig = cudart_sig<void*, cudaStream_t>;
-  RMM_CUDART_API_WRAPPER(cudaFreeAsync, cudaFreeAsync_sig);
-
-  using cudaDeviceGetDefaultMemPool_sig = cudart_sig<cudaMemPool_t*, int>;
-  RMM_CUDART_API_WRAPPER(cudaDeviceGetDefaultMemPool, cudaDeviceGetDefaultMemPool_sig);
-};
-#endif
-
-#undef RMM_CUDART_API_WRAPPER
-}  // namespace detail
-}  // namespace RMM_NAMESPACE
diff --git a/include/rmm/detail/runtime_async_alloc.hpp b/include/rmm/detail/runtime_async_alloc.hpp
new file mode 100644
index 000000000..6ddb2228b
--- /dev/null
+++ b/include/rmm/detail/runtime_async_alloc.hpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/detail/export.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <dlfcn.h>
+
+#include <memory>
+#include <optional>
+
+namespace RMM_NAMESPACE {
+namespace detail {
+
+/**
+ * @brief Determine at runtime if the CUDA driver supports the stream-ordered memory allocator
+ * functions, so RMM can be compiled/linked against CUDA 11.2+ and still run with older drivers.
+ *
+ * Both checks query attributes of the current CUDA device. `is_supported()` evaluates its query
+ * once, on first call, and returns that cached result on every later call.
+ */
+
+struct runtime_async_alloc {
+  static bool is_supported()
+  {
+    static auto driver_supports_pool{[] {
+      int cuda_pool_supported{};
+      auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
+                                           cudaDevAttrMemoryPoolsSupported,
+                                           rmm::get_current_cuda_device().value());
+      return result == cudaSuccess and cuda_pool_supported == 1;
+    }()};
+    return driver_supports_pool;
+  }
+
+  /**
+   * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present
+   * CUDA driver/runtime version. `cudaMemHandleTypeNone` skips the device query.
+   *
+   * @param handle_type An IPC export handle type to check for support.
+   * @return true if every bit of `handle_type` is set in the device's supported-handle bitmask
+   * @return false otherwise, or if the attribute query fails with `cudaErrorInvalidValue`
+   */
+  static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type)
+  {
+    int supported_handle_types_bitmask{};
+    if (cudaMemHandleTypeNone != handle_type) {
+      auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
+                                                 cudaDevAttrMemoryPoolSupportedHandleTypes,
+                                                 rmm::get_current_cuda_device().value());
+
+      // Don't throw on cudaErrorInvalidValue: report the handle type as unsupported instead
+      auto const unsupported_runtime = (result == cudaErrorInvalidValue);
+      if (unsupported_runtime) return false;
+      // Throw (via RMM_CUDA_TRY) on any other error that may have occurred
+      RMM_CUDA_TRY(result);
+    }
+    return (supported_handle_types_bitmask & handle_type) == handle_type;
+  }
+};
+
+}  // namespace detail
+}  // namespace RMM_NAMESPACE
diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp
index 52fd2fe4e..b1fc0b112 100644
--- a/include/rmm/mr/device/cuda_async_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -17,9 +17,9 @@
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/detail/dynamic_load_runtime.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/detail/export.hpp>
+#include <rmm/detail/runtime_async_alloc.hpp>
 #include <rmm/detail/thrust_namespace.h>
 #include <rmm/mr/device/cuda_async_view_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
@@ -31,12 +31,6 @@
 #include <limits>
 #include <optional>
 
-#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
-#ifndef RMM_DISABLE_CUDA_MALLOC_ASYNC
-#define RMM_CUDA_MALLOC_ASYNC_SUPPORT
-#endif
-#endif
-
 namespace RMM_NAMESPACE {
 namespace mr {
 /**
@@ -91,9 +85,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
                              std::optional<std::size_t> release_threshold             = {},
                              std::optional<allocation_handle_type> export_handle_type = {})
   {
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     // Check if cudaMallocAsync Memory pool supported
-    RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),
+    RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),
                 "cudaMallocAsync not supported with this CUDA driver/runtime version");
 
     // Construct explicit pool
@@ -101,12 +94,13 @@ class cuda_async_memory_resource final : public device_memory_resource {
     pool_props.allocType   = cudaMemAllocationTypePinned;
     pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(
       export_handle_type.value_or(allocation_handle_type::none));
-    RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
-                "Requested IPC memory handle type not supported");
+    RMM_EXPECTS(
+      rmm::detail::runtime_async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
+      "Requested IPC memory handle type not supported");
     pool_props.location.type = cudaMemLocationTypeDevice;
     pool_props.location.id   = rmm::get_current_cuda_device().value();
     cudaMemPool_t cuda_pool_handle{};
-    RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
+    RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
     pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
 
     // CUDA drivers before 11.5 have known incompatibilities with the async allocator.
@@ -117,41 +111,34 @@ class cuda_async_memory_resource final : public device_memory_resource {
     constexpr auto min_async_version{11050};
     if (driver_version < min_async_version) {
       int disabled{0};
-      RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
-        pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));
+      RMM_CUDA_TRY(
+        cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled));
     }
 
     auto const [free, total] = rmm::available_device_memory();
 
     // Need an l-value to take address to pass to cudaMemPoolSetAttribute
     uint64_t threshold = release_threshold.value_or(total);
-    RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute(
-      pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));
+    RMM_CUDA_TRY(
+      cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold));
 
     // Allocate and immediately deallocate the initial_pool_size to prime the pool with the
     // specified size
     auto const pool_size = initial_pool_size.value_or(free / 2);
     auto* ptr            = do_allocate(pool_size, cuda_stream_default);
     do_deallocate(ptr, pool_size, cuda_stream_default);
-#else
-    RMM_FAIL(
-      "cudaMallocAsync not supported by the version of the CUDA Toolkit used for this build");
-#endif
   }
 
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   /**
    * @brief Returns the underlying native handle to the CUDA pool
    *
+   * @return cudaMemPool_t Handle to the underlying CUDA pool
    */
   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); }
-#endif
 
   ~cuda_async_memory_resource() override
   {
-#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
-    RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle()));
-#endif
+    RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle()));
   }
   cuda_async_memory_resource(cuda_async_memory_resource const&)            = delete;
   cuda_async_memory_resource(cuda_async_memory_resource&&)                 = delete;
@@ -159,9 +146,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
   cuda_async_memory_resource& operator=(cuda_async_memory_resource&&)      = delete;
 
  private:
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   cuda_async_view_memory_resource pool_{};
-#endif
 
   /**
    * @brief Allocates memory of size at least \p bytes.
@@ -175,12 +160,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
   void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
   {
     void* ptr{nullptr};
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     ptr = pool_.allocate(bytes, stream);
-#else
-    (void)bytes;
-    (void)stream;
-#endif
     return ptr;
   }
 
@@ -194,13 +174,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
    */
   void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override
   {
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     pool_.deallocate(ptr, bytes, stream);
-#else
-    (void)ptr;
-    (void)bytes;
-    (void)stream;
-#endif
   }
 
   /**
@@ -213,11 +187,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
   [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
   {
     auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other);
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle());
-#else
-    return async_mr != nullptr;
-#endif
   }
 };
 
diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index 3e1900e72..180c412ee 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -17,7 +17,6 @@
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/detail/dynamic_load_runtime.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/detail/export.hpp>
 #include <rmm/detail/thrust_namespace.h>
@@ -28,10 +27,6 @@
 #include <cstddef>
 #include <limits>
 
-#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
-#define RMM_CUDA_MALLOC_ASYNC_SUPPORT
-#endif
-
 namespace RMM_NAMESPACE {
 namespace mr {
 /**
@@ -46,13 +41,12 @@ namespace mr {
  */
 class cuda_async_view_memory_resource final : public device_memory_resource {
  public:
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   /**
    * @brief Constructs a cuda_async_view_memory_resource which uses an existing CUDA memory pool.
    * The provided pool is not owned by cuda_async_view_memory_resource and must remain valid
    * during the lifetime of the memory resource.
    *
-   * @throws rmm::runtime_error if the CUDA version does not support `cudaMallocAsync`
+   * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync`
    *
    * @param valid_pool_handle Handle to a CUDA memory pool which will be used to
    * serve allocation requests.
@@ -71,15 +65,13 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
     RMM_EXPECTS(result == cudaSuccess && cuda_pool_supported,
                 "cudaMallocAsync not supported with this CUDA driver/runtime version");
   }
-#endif
 
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   /**
    * @brief Returns the underlying native handle to the CUDA pool
    *
+   * @return cudaMemPool_t Handle to the underlying CUDA pool
    */
   [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; }
-#endif
 
   cuda_async_view_memory_resource() = default;
   cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) =
@@ -92,9 +84,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
     default;  ///< @default_move_assignment{cuda_async_view_memory_resource}
 
  private:
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   cudaMemPool_t cuda_pool_handle_{};
-#endif
 
   /**
    * @brief Allocates memory of size at least \p bytes.
@@ -108,15 +98,9 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
   void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
   {
     void* ptr{nullptr};
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     if (bytes > 0) {
-      RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync(
-        &ptr, bytes, pool_handle(), stream.value()));
+      RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
     }
-#else
-    (void)bytes;
-    (void)stream;
-#endif
     return ptr;
   }
 
@@ -132,15 +116,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
                      [[maybe_unused]] std::size_t bytes,
                      rmm::cuda_stream_view stream) override
   {
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-    if (ptr != nullptr) {
-      RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value()));
-    }
-#else
-    (void)ptr;
-    (void)bytes;
-    (void)stream;
-#endif
+    if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); }
   }
 
   /**
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index b52ea0179..d7d692287 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -32,12 +32,6 @@
 
 cuda.set_memory_manager(RMMNumbaManager)
 
-_driver_version = rmm._cuda.gpu.driverGetVersion()
-_runtime_version = rmm._cuda.gpu.runtimeGetVersion()
-_CUDAMALLOC_ASYNC_SUPPORTED = (_driver_version >= 11020) and (
-    _runtime_version >= 11020
-)
-
 _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(
     cudart.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess,
     rmm._cuda.gpu.getDevice(),
@@ -657,10 +651,6 @@ def test_mr_upstream_lifetime():
     del pool_mr
 
 
-@pytest.mark.skipif(
-    not _CUDAMALLOC_ASYNC_SUPPORTED,
-    reason="cudaMallocAsync not supported",
-)
 @pytest.mark.parametrize("dtype", _dtypes)
 @pytest.mark.parametrize("nelem", _nelems)
 @pytest.mark.parametrize("alloc", _allocs)
@@ -671,15 +661,11 @@ def test_cuda_async_memory_resource(dtype, nelem, alloc):
     array_tester(dtype, nelem, alloc)
 
 
-@pytest.mark.skipif(
-    not _CUDAMALLOC_ASYNC_SUPPORTED,
-    reason="cudaMallocAsync not supported",
-)
 def test_cuda_async_memory_resource_ipc():
     # TODO: We don't have a great way to check if IPC is supported in Python,
     # without using the C++ function
-    # rmm::detail::async_alloc::is_export_handle_type_supported. We can't
-    # accurately test driver and runtime versions for this via Python because
+    # rmm::detail::runtime_async_alloc::is_export_handle_type_supported.
+    # We can't accurately test this via Python because
     # cuda-python always has the IPC handle enum defined (which normally
     # requires a CUDA 11.3 runtime) and the cuda-compat package in Docker
     # containers prevents us from assuming that the driver we see actually
@@ -702,10 +688,6 @@ def test_cuda_async_memory_resource_ipc():
         assert rmm.mr.get_current_device_resource_type() is type(mr)
 
 
-@pytest.mark.skipif(
-    not _CUDAMALLOC_ASYNC_SUPPORTED,
-    reason="cudaMallocAsync not supported",
-)
 @pytest.mark.parametrize("nelems", _nelems)
 def test_cuda_async_memory_resource_stream(nelems):
     # test that using CudaAsyncMemoryResource
@@ -719,10 +701,6 @@ def test_cuda_async_memory_resource_stream(nelems):
     np.testing.assert_equal(expected, result)
 
 
-@pytest.mark.skipif(
-    not _CUDAMALLOC_ASYNC_SUPPORTED,
-    reason="cudaMallocAsync not supported",
-)
 @pytest.mark.parametrize("nelem", _nelems)
 @pytest.mark.parametrize("alloc", _allocs)
 def test_cuda_async_memory_resource_threshold(nelem, alloc):
@@ -739,13 +717,7 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc):
     "mr",
     [
         rmm.mr.CudaMemoryResource,
-        pytest.param(
-            rmm.mr.CudaAsyncMemoryResource,
-            marks=pytest.mark.skipif(
-                not _CUDAMALLOC_ASYNC_SUPPORTED,
-                reason="cudaMallocAsync not supported",
-            ),
-        ),
+        pytest.param(rmm.mr.CudaAsyncMemoryResource),
     ],
 )
 def test_limiting_resource_adaptor(mr):
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a482c8cc1..476028af0 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -84,7 +84,7 @@ endfunction()
 function(ConfigureTest TEST_NAME)
 
   set(options)
-  set(one_value GPUS PERCENT)
+  set(one_value CUDART GPUS PERCENT)
   set(multi_value)
   cmake_parse_arguments(_RMM_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN})
   if(NOT DEFINED _RMM_TEST_GPUS AND NOT DEFINED _RMM_TEST_PERCENT)
@@ -98,13 +98,23 @@ function(ConfigureTest TEST_NAME)
     set(_RMM_TEST_PERCENT 100)
   endif()
 
+  if(_RMM_TEST_CUDART STREQUAL SHARED)
+    set(cudart_link_libs $<COMPILE_ONLY:rmm> CUDA::cudart)
+  elseif(_RMM_TEST_CUDART STREQUAL STATIC)
+    set(cudart_link_libs $<COMPILE_ONLY:rmm> CUDA::cudart_static)
+  else()
+    set(cudart_link_libs rmm)
+  endif()
+
   # Test with legacy default stream.
   ConfigureTestInternal(${TEST_NAME} ${_RMM_TEST_UNPARSED_ARGUMENTS})
+  target_link_libraries(${TEST_NAME} ${cudart_link_libs})
 
   # Test with per-thread default stream.
   string(REGEX REPLACE "_TEST$" "_PTDS_TEST" PTDS_TEST_NAME "${TEST_NAME}")
   ConfigureTestInternal("${PTDS_TEST_NAME}" ${_RMM_TEST_UNPARSED_ARGUMENTS})
   target_compile_definitions("${PTDS_TEST_NAME}" PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)
+  target_link_libraries(${PTDS_TEST_NAME} ${cudart_link_libs})
 
   foreach(name ${TEST_NAME} ${PTDS_TEST_NAME} ${NS_TEST_NAME})
     rapids_test_add(
@@ -130,7 +140,10 @@ ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp)
 ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp GPUS 1 PERCENT 100)
 
 # cuda_async mr tests
-ConfigureTest(CUDA_ASYNC_MR_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60)
+ConfigureTest(CUDA_ASYNC_MR_STATIC_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60
+              CUDART STATIC)
+ConfigureTest(CUDA_ASYNC_MR_SHARED_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60
+              CUDART SHARED)
 
 # thrust allocator tests
 ConfigureTest(THRUST_ALLOCATOR_TEST mr/device/thrust_allocator_tests.cu GPUS 1 PERCENT 60)
diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp
index 90c7b0ff9..a39188548 100644
--- a/tests/mr/device/cuda_async_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,24 +31,13 @@ class AsyncMRTest : public ::testing::Test {
  protected:
   void SetUp() override
   {
-    if (!rmm::detail::async_alloc::is_supported()) {
+    if (!rmm::detail::runtime_async_alloc::is_supported()) {
       GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA "
                    << "driver/runtime version";
     }
   }
 };
 
-TEST_F(AsyncMRTest, ThrowIfNotSupported)
-{
-  auto construct_mr = []() { cuda_async_mr mr; };
-#ifndef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-  EXPECT_THROW(construct_mr(), rmm::logic_error);
-#else
-  EXPECT_NO_THROW(construct_mr());
-#endif
-}
-
-#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
 TEST_F(AsyncMRTest, ExplicitInitialPoolSize)
 {
   const auto pool_init_size{100};
@@ -77,7 +66,5 @@ TEST_F(AsyncMRTest, DifferentPoolsUnequal)
   EXPECT_FALSE(mr1.is_equal(mr2));
 }
 
-#endif
-
 }  // namespace
 }  // namespace rmm::test
diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp
index fe82431a9..f3a02cbf0 100644
--- a/tests/mr/device/cuda_async_view_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_view_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,13 +29,10 @@ using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource;
 static_assert(cuda::mr::resource_with<cuda_async_view_mr, cuda::mr::device_accessible>);
 static_assert(cuda::mr::async_resource_with<cuda_async_view_mr, cuda::mr::device_accessible>);
 
-#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
-
 TEST(PoolTest, UsePool)
 {
   cudaMemPool_t memPool{};
-  RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
-    &memPool, rmm::get_current_cuda_device().value()));
+  RMM_CUDA_TRY(cudaDeviceGetDefaultMemPool(&memPool, rmm::get_current_cuda_device().value()));
 
   const auto pool_init_size{100};
   cuda_async_view_mr mr{memPool};
@@ -53,7 +50,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
 
   cudaMemPool_t memPool{};
 
-  RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&memPool, &poolProps));
+  RMM_CUDA_TRY(cudaMemPoolCreate(&memPool, &poolProps));
 
   {
     const auto pool_init_size{100};
@@ -64,7 +61,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
   }
 
   auto destroy_valid_pool = [&]() {
-    auto result = rmm::detail::async_alloc::cudaMemPoolDestroy(memPool);
+    auto result = cudaMemPoolDestroy(memPool);
     RMM_EXPECTS(result == cudaSuccess, "Pool wrapper did destroy pool");
   };
 
@@ -81,7 +78,5 @@ TEST(PoolTest, ThrowIfNullptrPool)
   EXPECT_THROW(construct_mr(), rmm::logic_error);
 }
 
-#endif
-
 }  // namespace
 }  // namespace rmm::test
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index 944ba1807..9e7c8c2e8 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -36,17 +36,11 @@ namespace {
 
 struct mr_ref_test_mt : public mr_ref_test {};
 
-INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests,
-                        mr_ref_test_mt,
-                        ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-                                          "CUDA_Async",
-#endif
-                                          "Managed",
-                                          "Pool",
-                                          "Arena",
-                                          "Binning"),
-                        [](auto const& info) { return info.param; });
+INSTANTIATE_TEST_CASE_P(
+  MultiThreadResourceTests,
+  mr_ref_test_mt,
+  ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"),
+  [](auto const& info) { return info.param; });
 
 template <typename Task, typename... Arguments>
 void spawn_n(std::size_t num_threads, Task task, Arguments&&... args)
diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp
index 6e63b3838..2af0eff44 100644
--- a/tests/mr/device/mr_ref_test.hpp
+++ b/tests/mr/device/mr_ref_test.hpp
@@ -347,7 +347,7 @@ inline auto make_host_pinned() { return std::make_shared<rmm::mr::pinned_host_me
 
 inline auto make_cuda_async()
 {
-  if (rmm::detail::async_alloc::is_supported()) {
+  if (rmm::detail::runtime_async_alloc::is_supported()) {
     return std::make_shared<rmm::mr::cuda_async_memory_resource>();
   }
   return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp
index 55e91d765..41af050a0 100644
--- a/tests/mr/device/mr_ref_tests.cpp
+++ b/tests/mr/device/mr_ref_tests.cpp
@@ -30,9 +30,7 @@ namespace {
 INSTANTIATE_TEST_SUITE_P(ResourceTests,
                          mr_ref_test,
                          ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
                                            "CUDA_Async",
-#endif
                                            "Managed",
                                            "System",
                                            "Pool",
@@ -46,9 +44,7 @@ INSTANTIATE_TEST_SUITE_P(ResourceTests,
 INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests,
                          mr_ref_allocation_test,
                          ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
                                            "CUDA_Async",
-#endif
                                            "Managed",
-                                           "System"
+                                           "System",
                                            "Pool",
diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu
index 84f599957..46447aa09 100644
--- a/tests/mr/device/thrust_allocator_tests.cu
+++ b/tests/mr/device/thrust_allocator_tests.cu
@@ -69,17 +69,11 @@ TEST_P(allocator_test, multi_device)
   }());
 }
 
-INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests,
-                        allocator_test,
-                        ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-                                          "CUDA_Async",
-#endif
-                                          "Managed",
-                                          "Pool",
-                                          "Arena",
-                                          "Binning"),
-                        [](auto const& info) { return info.param; });
+INSTANTIATE_TEST_CASE_P(
+  ThrustAllocatorTests,
+  allocator_test,
+  ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"),
+  [](auto const& info) { return info.param; });
 
 }  // namespace
 }  // namespace rmm::test
diff --git a/tests/prefetch_tests.cpp b/tests/prefetch_tests.cpp
index 6c7bb2dd3..4a2c41a2b 100644
--- a/tests/prefetch_tests.cpp
+++ b/tests/prefetch_tests.cpp
@@ -53,8 +53,8 @@ struct PrefetchTest : public ::testing::Test {
     // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8048f6ea5ad77917444567656c140c5a
     // specifically for when cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation is
     // used.
-    constexpr size_t prefetch_data_size = 4;
     if constexpr (std::is_same_v<MemoryResourceType, rmm::mr::managed_memory_resource>) {
+      constexpr size_t prefetch_data_size = 4;
       int prefetch_location{0};
       RMM_CUDA_TRY(
         cudaMemRangeGetAttribute(&prefetch_location,