From f4ec4631a66de1bfa199120321abe95782cf32c6 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Thu, 19 Sep 2024 11:43:47 -0400 Subject: [PATCH 01/29] DOC v24.12 Updates [skip ci] --- .../cuda11.8-conda/devcontainer.json | 6 ++--- .devcontainer/cuda11.8-pip/devcontainer.json | 6 ++--- .../cuda12.5-conda/devcontainer.json | 6 ++--- .devcontainer/cuda12.5-pip/devcontainer.json | 6 ++--- .github/workflows/build.yaml | 16 +++++++------- .github/workflows/pr.yaml | 22 +++++++++---------- .github/workflows/test.yaml | 6 ++--- VERSION | 2 +- dependencies.yaml | 6 ++--- python/rmm/pyproject.toml | 2 +- 10 files changed, 39 insertions(+), 39 deletions(-) diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 3bfef6706..549ffa67b 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 5bfc30823..d6dd7b6ce 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index 925557b22..17e8d5cd0 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ 
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 2f9e1c493..54964d880 100644 --- a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9b7efecde..6fa11225e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -79,7 +79,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -100,7 +100,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1160b93e9..afc9f7487 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -23,40 +23,40 @@ jobs: - wheel-tests - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -66,7 +66,7 @@ jobs: wheel-build-cpp: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -74,20 +74,20 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_python.sh wheel-tests: needs: wheel-build-python secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: pull-request script: ci/test_wheel.sh devcontainer: secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' cuda: '["12.5"]' diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 13838d888..34a0f746d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/VERSION b/VERSION index 7c7ba0443..af28c42b5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -24.10.00 +24.12.00 diff --git a/dependencies.yaml b/dependencies.yaml index eff3560e7..5b5cded62 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -163,15 +163,15 @@ dependencies: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==24.10.*,>=0.0.0a0 + - librmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==24.10.*,>=0.0.0a0 + - librmm-cu11==24.12.*,>=0.0.0a0 - matrix: null packages: - - librmm==24.10.*,>=0.0.0a0 + - librmm==24.12.*,>=0.0.0a0 checks: common: - output_types: [conda, requirements] diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index 7577ad961..b148cdba7 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -130,7 +130,7 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "cuda-python>=11.7.1,<12.0a0", "cython>=3.0.0", - "librmm==24.10.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From b51447393c523cc929608d84850c70a3eae08af3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 27 Sep 2024 11:56:06 -0500 Subject: [PATCH 02/29] exclude 'gcovr' from list of development pip packages (#1688) This project currently lists `gcovr` (https://pypi.org/project/gcovr/) as a pip dependency for development. I strongly suspect that that was unintentional... it doesn't look like it has any reliance on getting that package via `pip` (just conda, in the C++ test jobs and for local C++ development). This proposes removing `gcovr` from the list of pip dependencies, so it won't get installed in the DLFW images or other places where `rapids-make-pip-env` from https://github.com/rapidsai/devcontainers is called. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1688 --- dependencies.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index b9a1e1b36..9f1ed9c40 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -232,13 +232,11 @@ dependencies: packages: develop: common: - - output_types: [conda, requirements] - packages: - - gcovr>=5.0 - output_types: conda packages: - clang==16.0.6 - clang-tools==16.0.6 + - gcovr>=5.0 docs: common: - output_types: conda From 9e410c0591f38aa6c0a17c4e2c2edc4f6bfed058 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 30 Sep 2024 18:28:44 +0100 Subject: [PATCH 03/29] Use `rmm::percent_of_free_device_memory` in arena test (#1689) Rather than hand-coding a fraction of the device memory, use the utility routine. - Closes #1674 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Rong Ou (https://github.com/rongou) URL: https://github.com/rapidsai/rmm/pull/1689 --- tests/mr/device/arena_mr_tests.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index bdc0f2438..95cc9c9c1 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -491,9 +491,7 @@ TEST_F(ArenaTest, SizeSmallerThanSuperblockSize) // NOLINT TEST_F(ArenaTest, AllocateNinetyPercent) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) - auto const free = rmm::available_device_memory().first; - auto const ninety_percent = rmm::align_up( - static_cast<std::size_t>(static_cast<double>(free) * 0.9), rmm::CUDA_ALLOCATION_ALIGNMENT); + auto const ninety_percent = rmm::percent_of_free_device_memory(90); arena_mr mr(rmm::mr::get_current_device_resource_ref(), ninety_percent); }()); } From 6489bb7df63a3784b4a94067e3a8fa8917523ab7 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 2 Oct 2024 23:14:10 -0400 Subject: [PATCH 04/29] [Improvement] Reorganize Cython to separate C++ bindings and make Cython classes public (#1676) Closes #1280 Authors: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1676 --- .gitignore | 11 +- python/rmm/CMakeLists.txt | 3 +- python/rmm/docs/guide.md | 6 +- python/rmm/rmm/__init__.py | 20 +- python/rmm/rmm/_cuda/stream.pxd | 4 +- python/rmm/rmm/_cuda/stream.pyx | 4 +- python/rmm/rmm/_lib/__init__.py | 4 +- python/rmm/rmm/_lib/cuda_stream.pxd | 27 +- python/rmm/rmm/_lib/cuda_stream_pool.pxd | 14 +- python/rmm/rmm/_lib/cuda_stream_view.pxd | 26 +- python/rmm/rmm/_lib/device_buffer.pxd | 115 +-------- python/rmm/rmm/_lib/device_uvector.pxd | 28 +-- python/rmm/rmm/_lib/helper.pxd | 3 +- python/rmm/rmm/_lib/logger.pxd | 24 ++ python/rmm/rmm/_lib/memory_resource.pxd | 138 ++++------- python/rmm/rmm/_lib/per_device_resource.pxd | 42 ++-- python/rmm/rmm/allocators/cupy.py | 6 +- python/rmm/rmm/allocators/numba.py | 6 +- python/rmm/rmm/allocators/torch.py | 8 +- .../rmm/rmm/{_lib => librmm}/CMakeLists.txt | 3 +- .../{_lib/__init__.pxd => librmm/__init__.py} | 0 python/rmm/rmm/librmm/_logger.pxd | 66 +++++ .../rmm/{_lib/lib.pxd => librmm/_logger.pyx} | 9 +- .../rmm/{_lib => librmm}/_torch_allocator.cpp | 0 python/rmm/rmm/librmm/cuda_stream.pxd |
28 +++ python/rmm/rmm/librmm/cuda_stream_pool.pxd | 23 ++ python/rmm/rmm/librmm/cuda_stream_view.pxd | 32 +++ python/rmm/rmm/librmm/device_buffer.pxd | 58 +++++ python/rmm/rmm/librmm/device_uvector.pxd | 39 +++ python/rmm/rmm/librmm/memory_resource.pxd | 230 ++++++++++++++++++ python/rmm/rmm/librmm/per_device_resource.pxd | 36 +++ python/rmm/rmm/mr.py | 2 +- python/rmm/rmm/pylibrmm/CMakeLists.txt | 27 ++ python/rmm/rmm/pylibrmm/__init__.py | 15 ++ python/rmm/rmm/pylibrmm/cuda_stream.pxd | 27 ++ .../rmm/{_lib => pylibrmm}/cuda_stream.pyx | 4 +- python/rmm/rmm/pylibrmm/device_buffer.pxd | 71 ++++++ .../rmm/{_lib => pylibrmm}/device_buffer.pyx | 19 +- .../rmm/{_lib/lib.pyx => pylibrmm/helper.pxd} | 5 +- python/rmm/rmm/{_lib => pylibrmm}/helper.pyx | 0 python/rmm/rmm/{_lib => pylibrmm}/logger.pyx | 57 +---- python/rmm/rmm/pylibrmm/memory_resource.pxd | 83 +++++++ .../{_lib => pylibrmm}/memory_resource.pyx | 228 +++-------------- .../rmm/{_lib => pylibrmm}/tests/__init__.py | 0 .../tests/test_device_buffer.pyx | 7 +- python/rmm/rmm/tests/test_cython.py | 4 +- python/rmm/rmm/tests/test_rmm.py | 2 +- 47 files changed, 965 insertions(+), 599 deletions(-) create mode 100644 python/rmm/rmm/_lib/logger.pxd rename python/rmm/rmm/{_lib => librmm}/CMakeLists.txt (93%) rename python/rmm/rmm/{_lib/__init__.pxd => librmm/__init__.py} (100%) create mode 100644 python/rmm/rmm/librmm/_logger.pxd rename python/rmm/rmm/{_lib/lib.pxd => librmm/_logger.pyx} (70%) rename python/rmm/rmm/{_lib => librmm}/_torch_allocator.cpp (100%) create mode 100644 python/rmm/rmm/librmm/cuda_stream.pxd create mode 100644 python/rmm/rmm/librmm/cuda_stream_pool.pxd create mode 100644 python/rmm/rmm/librmm/cuda_stream_view.pxd create mode 100644 python/rmm/rmm/librmm/device_buffer.pxd create mode 100644 python/rmm/rmm/librmm/device_uvector.pxd create mode 100644 python/rmm/rmm/librmm/memory_resource.pxd create mode 100644 python/rmm/rmm/librmm/per_device_resource.pxd create mode 100644 python/rmm/rmm/pylibrmm/CMakeLists.txt create mode 100644 python/rmm/rmm/pylibrmm/__init__.py create mode 100644 python/rmm/rmm/pylibrmm/cuda_stream.pxd rename python/rmm/rmm/{_lib => pylibrmm}/cuda_stream.pyx (91%) create mode 100644 python/rmm/rmm/pylibrmm/device_buffer.pxd rename python/rmm/rmm/{_lib => pylibrmm}/device_buffer.pyx (96%) rename python/rmm/rmm/{_lib/lib.pyx => pylibrmm/helper.pxd} (86%) rename python/rmm/rmm/{_lib => pylibrmm}/helper.pyx (100%) rename python/rmm/rmm/{_lib => pylibrmm}/logger.pyx (77%) create mode 100644 python/rmm/rmm/pylibrmm/memory_resource.pxd rename python/rmm/rmm/{_lib => pylibrmm}/memory_resource.pyx (82%) rename python/rmm/rmm/{_lib => pylibrmm}/tests/__init__.py (100%) rename python/rmm/rmm/{_lib => pylibrmm}/tests/test_device_buffer.pyx (83%) diff --git a/.gitignore b/.gitignore index 2d0b150e1..36aafe643 100644 --- a/.gitignore +++ b/.gitignore @@ -22,10 +22,13 @@ rmm.egg-info/ python/build python/*/build python/rmm/docs/_build -python/rmm/**/_lib/**/*.cpp -!python/rmm/_lib/_torch_allocator.cpp -python/rmm/**/_lib/**/*.h -python/rmm/**/_lib/.nfs* +python/rmm/**/librmm/**/*.cpp +!python/rmm/librmm/_torch_allocator.cpp +python/rmm/**/librmm/**/*.h +python/rmm/**/librmm/.nfs* +python/rmm/**/pylibrmm/**/*.cpp +python/rmm/**/pylibrmm/**/*.h +python/rmm/**/pylibrmm/.nfs* python/rmm/_cuda/*.cpp python/rmm/tests/*.cpp python/rmm/*.ipynb diff --git a/python/rmm/CMakeLists.txt b/python/rmm/CMakeLists.txt index 6c2515102..ac8495e14 100644 --- a/python/rmm/CMakeLists.txt +++ b/python/rmm/CMakeLists.txt @@ -30,4
+30,5 @@ rapids_cython_init() add_compile_definitions("SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") add_subdirectory(rmm/_cuda) -add_subdirectory(rmm/_lib) +add_subdirectory(rmm/librmm) +add_subdirectory(rmm/pylibrmm) diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md index 22c0dc023..c7e940497 100644 --- a/python/rmm/docs/guide.md +++ b/python/rmm/docs/guide.md @@ -236,17 +236,17 @@ Common to both usages is that they modify the currently active RMM memory resource >>> # We start with the default cuda memory resource >>> rmm.mr.get_current_device_resource() -<rmm._lib.memory_resource.CudaMemoryResource object at 0x...> +<rmm.pylibrmm.memory_resource.CudaMemoryResource object at 0x...> >>> # When using statistics, we get a StatisticsResourceAdaptor with the context >>> with rmm.statistics.statistics(): ... rmm.mr.get_current_device_resource() -<rmm._lib.memory_resource.StatisticsResourceAdaptor object at 0x...> +<rmm.pylibrmm.memory_resource.StatisticsResourceAdaptor object at 0x...> >>> # We can also enable statistics globally >>> rmm.statistics.enable_statistics() >>> print(rmm.mr.get_current_device_resource()) -<rmm._lib.memory_resource.StatisticsResourceAdaptor object at 0x...> +<rmm.pylibrmm.memory_resource.StatisticsResourceAdaptor object at 0x...> ``` With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource: diff --git a/python/rmm/rmm/__init__.py b/python/rmm/rmm/__init__.py index 1e3b5c8b1..b23ad68f9 100644 --- a/python/rmm/rmm/__init__.py +++ b/python/rmm/rmm/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,8 +13,10 @@ # limitations under the License. from rmm import mr -from rmm._lib.device_buffer import DeviceBuffer -from rmm._lib.logger import ( +from rmm._version import __git_commit__, __version__ +from rmm.mr import disable_logging, enable_logging, get_log_filenames +from rmm.pylibrmm.device_buffer import DeviceBuffer +from rmm.pylibrmm.logger import ( flush_logger, get_flush_level, get_logging_level, @@ -23,8 +25,6 @@ set_logging_level, should_log, ) -from rmm._version import __git_commit__, __version__ -from rmm.mr import disable_logging, enable_logging, get_log_filenames from rmm.rmm import ( RMMError, is_initialized, @@ -52,3 +52,13 @@ "should_log", "unregister_reinitialize_hook", ] + + +def __getattr__(name): + if name == "_lib": + import importlib + + module = importlib.import_module("rmm.pylibrmm") + return module + else: + raise AttributeError(f"Module '{__name__}' has no attribute '{name}'") diff --git a/python/rmm/rmm/_cuda/stream.pxd b/python/rmm/rmm/_cuda/stream.pxd index 3c3d3aa6f..e91e2ce58 100644 --- a/python/rmm/rmm/_cuda/stream.pxd +++ b/python/rmm/rmm/_cuda/stream.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@ from cuda.ccudart cimport cudaStream_t from libc.stdint cimport uintptr_t from libcpp cimport bool -from rmm._lib.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.cuda_stream_view cimport cuda_stream_view cdef class Stream: diff --git a/python/rmm/rmm/_cuda/stream.pyx b/python/rmm/rmm/_cuda/stream.pyx index 4d5ff5232..37dcbd610 100644 --- a/python/rmm/rmm/_cuda/stream.pyx +++ b/python/rmm/rmm/_cuda/stream.pyx @@ -16,13 +16,13 @@ from cuda.ccudart cimport cudaStream_t from libc.stdint cimport uintptr_t from libcpp cimport bool -from rmm._lib.cuda_stream cimport CudaStream -from rmm._lib.cuda_stream_view cimport ( +from rmm.librmm.cuda_stream_view cimport ( cuda_stream_default, cuda_stream_legacy, cuda_stream_per_thread, cuda_stream_view, ) +from rmm.pylibrmm.cuda_stream cimport CudaStream cdef class Stream: diff --git a/python/rmm/rmm/_lib/__init__.py b/python/rmm/rmm/_lib/__init__.py index 0b8672ef6..7cfddab60 100644 --- a/python/rmm/rmm/_lib/__init__.py +++ b/python/rmm/rmm/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .device_buffer import DeviceBuffer +from rmm.pylibrmm import * diff --git a/python/rmm/rmm/_lib/cuda_stream.pxd b/python/rmm/rmm/_lib/cuda_stream.pxd index e224cf9af..afc365fbb 100644 --- a/python/rmm/rmm/_lib/cuda_stream.pxd +++ b/python/rmm/rmm/_lib/cuda_stream.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,26 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -cimport cython -from cuda.ccudart cimport cudaStream_t -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream: - cuda_stream() except + - bool is_valid() except + - cudaStream_t value() except + - cuda_stream_view view() except + - void synchronize() except + - void synchronize_no_throw() - - -@cython.final -cdef class CudaStream: - cdef unique_ptr[cuda_stream] c_obj - cdef cudaStream_t value(self) except * nogil - cdef bool is_valid(self) except * nogil +from rmm.librmm.cuda_stream cimport cuda_stream +from rmm.pylibrmm.cuda_stream cimport CudaStream diff --git a/python/rmm/rmm/_lib/cuda_stream_pool.pxd b/python/rmm/rmm/_lib/cuda_stream_pool.pxd index 0286a9377..4da59cc68 100644 --- a/python/rmm/rmm/_lib/cuda_stream_pool.pxd +++ b/python/rmm/rmm/_lib/cuda_stream_pool.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-cimport cython - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/cuda_stream_pool.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream_pool: - cuda_stream_pool(size_t pool_size) - cuda_stream_view get_stream() - cuda_stream_view get_stream(size_t stream_id) except + - size_t get_pool_size() +from rmm.librmm.cuda_stream_pool cimport cuda_stream_pool diff --git a/python/rmm/rmm/_lib/cuda_stream_view.pxd b/python/rmm/rmm/_lib/cuda_stream_view.pxd index bf0d33c24..c336b0fe8 100644 --- a/python/rmm/rmm/_lib/cuda_stream_view.pxd +++ b/python/rmm/rmm/_lib/cuda_stream_view.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cuda.ccudart cimport cudaStream_t -from libcpp cimport bool - - -cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream_view: - cuda_stream_view() - cuda_stream_view(cudaStream_t) - cudaStream_t value() - bool is_default() - bool is_per_thread_default() - void synchronize() except + - - cdef bool operator==(cuda_stream_view const, cuda_stream_view const) - - const cuda_stream_view cuda_stream_default - const cuda_stream_view cuda_stream_legacy - const cuda_stream_view cuda_stream_per_thread +from rmm.librmm.cuda_stream_view cimport ( + cuda_stream_default, + cuda_stream_legacy, + cuda_stream_per_thread, + cuda_stream_view, +) diff --git a/python/rmm/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd index 0da9ace0c..22833b1b8 100644 --- a/python/rmm/rmm/_lib/device_buffer.pxd +++ b/python/rmm/rmm/_lib/device_buffer.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,105 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr - -from rmm._cuda.stream cimport Stream -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.memory_resource cimport ( - DeviceMemoryResource, - device_memory_resource, +from rmm.librmm.device_buffer cimport ( + cuda_device_id, + device_buffer, + get_current_cuda_device, + prefetch, +) +from rmm.pylibrmm.device_buffer cimport ( + DeviceBuffer, + copy_device_to_ptr, + copy_host_to_ptr, + copy_ptr_to_host, + to_device, ) - - -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef cppclass cuda_device_id: - ctypedef int value_type - cuda_device_id() - cuda_device_id(value_type id) - value_type value() - - cdef cuda_device_id get_current_cuda_device() - -cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil: - cdef void prefetch(const void* ptr, - size_t bytes, - cuda_device_id device, - cuda_stream_view stream) except + - -cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: - cdef cppclass device_buffer: - device_buffer() - device_buffer( - size_t size, - cuda_stream_view stream, - device_memory_resource * - ) except + - device_buffer( - const void* source_data, - size_t size, - cuda_stream_view stream, - device_memory_resource * - ) except + - device_buffer( - const device_buffer buf, - cuda_stream_view stream, - device_memory_resource * - ) except + - void reserve(size_t new_capacity, cuda_stream_view stream) except + - void resize(size_t new_size, cuda_stream_view stream) except + - void shrink_to_fit(cuda_stream_view stream) except + - void* data() - size_t size() - size_t capacity() - - -cdef class DeviceBuffer: - cdef unique_ptr[device_buffer] c_obj - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr - - # Holds a reference to the stream used by the underlying `device_buffer`. - # Ensures the stream does not get destroyed before this DeviceBuffer - cdef Stream stream - - @staticmethod - cdef DeviceBuffer c_from_unique_ptr( - unique_ptr[device_buffer] ptr, - Stream stream=*, - DeviceMemoryResource mr=*, - ) - - @staticmethod - cdef DeviceBuffer c_to_device(const unsigned char[::1] b, - Stream stream=*) except * - cpdef copy_to_host(self, ary=*, Stream stream=*) - cpdef copy_from_host(self, ary, Stream stream=*) - cpdef copy_from_device(self, cuda_ary, Stream stream=*) - cpdef bytes tobytes(self, Stream stream=*) - - cdef size_t c_size(self) except * - cpdef void reserve(self, size_t new_capacity, Stream stream=*) except * - cpdef void resize(self, size_t new_size, Stream stream=*) except * - cpdef size_t capacity(self) except * - cdef void* c_data(self) except * - - cdef device_buffer c_release(self) except * - -cpdef DeviceBuffer to_device(const unsigned char[::1] b, - Stream stream=*) -cpdef void copy_ptr_to_host(uintptr_t db, - unsigned char[::1] hb, - Stream stream=*) except * - -cpdef void copy_host_to_ptr(const unsigned char[::1] hb, - uintptr_t db, - Stream stream=*) except * - -cpdef void copy_device_to_ptr(uintptr_t d_src, - uintptr_t d_dst, - size_t count, - Stream stream=*) except * diff --git a/python/rmm/rmm/_lib/device_uvector.pxd b/python/rmm/rmm/_lib/device_uvector.pxd index 29e122bbf..230b0afb3 100644 --- a/python/rmm/rmm/_lib/device_uvector.pxd +++ b/python/rmm/rmm/_lib/device_uvector.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,28 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.device_buffer cimport device_buffer -from rmm._lib.memory_resource cimport device_memory_resource - - -cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: - cdef cppclass device_uvector[T]: - device_uvector(size_t size, cuda_stream_view stream) except + - T* element_ptr(size_t index) - void set_element(size_t element_index, const T& v, cuda_stream_view s) - void set_element_async( - size_t element_index, - const T& v, - cuda_stream_view s - ) except + - T front_element(cuda_stream_view s) except + - T back_element(cuda_stream_view s) except + - void reserve(size_t new_capacity, cuda_stream_view stream) except + - void resize(size_t new_size, cuda_stream_view stream) except + - void shrink_to_fit(cuda_stream_view stream) except + - device_buffer release() - size_t capacity() - T* data() - size_t size() - device_memory_resource* memory_resource() +from rmm.librmm.device_uvector cimport device_uvector diff --git a/python/rmm/rmm/_lib/helper.pxd b/python/rmm/rmm/_lib/helper.pxd index 8ca151c00..4a5159435 100644 --- a/python/rmm/rmm/_lib/helper.pxd +++ b/python/rmm/rmm/_lib/helper.pxd @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. - -cdef object parse_bytes(object s) except * +from rmm.pylibrmm.helper cimport parse_bytes diff --git a/python/rmm/rmm/_lib/logger.pxd b/python/rmm/rmm/_lib/logger.pxd new file mode 100644 index 000000000..bef05c903 --- /dev/null +++ b/python/rmm/rmm/_lib/logger.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm._logger cimport logger, logging_level, spdlog_logger +from rmm.pylibrmm.logger cimport ( + _validate_level_type, + flush_logger, + get_flush_level, + get_logging_level, + set_flush_level, + set_logging_level, + should_log, +) diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 000a3fe1e..983063914 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,92 +12,50 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from libc.stdint cimport int8_t -from libcpp.memory cimport shared_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string -from libcpp.vector cimport vector - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass device_memory_resource: - void* allocate(size_t bytes) except + - void* allocate(size_t bytes, cuda_stream_view stream) except + - void deallocate(void* ptr, size_t bytes) except + - void deallocate( - void* ptr, - size_t bytes, - cuda_stream_view stream - ) except + - -cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil: - size_t percent_of_free_device_memory(int percent) except + - pair[size_t, size_t] available_device_memory() except + - -cdef class DeviceMemoryResource: - cdef shared_ptr[device_memory_resource] c_obj - cdef device_memory_resource* get_mr(self) noexcept nogil - -cdef class UpstreamResourceAdaptor(DeviceMemoryResource): - cdef readonly DeviceMemoryResource upstream_mr - - cpdef DeviceMemoryResource get_upstream(self) - -cdef class CudaMemoryResource(DeviceMemoryResource): - pass - -cdef class ManagedMemoryResource(DeviceMemoryResource): - pass - -cdef class SystemMemoryResource(DeviceMemoryResource): - pass - -cdef class SamHeadroomMemoryResource(DeviceMemoryResource): - pass - -cdef class CudaAsyncMemoryResource(DeviceMemoryResource): - pass - -cdef class PoolMemoryResource(UpstreamResourceAdaptor): - pass - -cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): - pass - -cdef class BinningMemoryResource(UpstreamResourceAdaptor): - - cdef readonly list _bin_mrs - - cpdef add_bin( - self, - size_t allocation_size, - DeviceMemoryResource bin_resource=*) - -cdef class CallbackMemoryResource(DeviceMemoryResource): - cdef object _allocate_func - cdef object _deallocate_func - -cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor): - cdef object _log_file_name - cpdef get_file_name(self) - cpdef flush(self) - -cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): - cdef object _callback - -cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): - pass - -cpdef DeviceMemoryResource get_current_device_resource() +from rmm.librmm.memory_resource cimport ( + CppExcept, + allocate_callback_t, + allocation_handle_type, + available_device_memory, + binning_memory_resource, + callback_memory_resource, + cuda_async_memory_resource, + cuda_memory_resource, + deallocate_callback_t, + device_memory_resource, + failure_callback_resource_adaptor, + failure_callback_t, + fixed_size_memory_resource, + limiting_resource_adaptor, + logging_resource_adaptor, + managed_memory_resource, + percent_of_free_device_memory, + pool_memory_resource, + prefetch_resource_adaptor, + sam_headroom_memory_resource, + statistics_resource_adaptor, + system_memory_resource, + throw_cpp_except, + tracking_resource_adaptor, + translate_python_except_to_cpp, +) +from rmm.pylibrmm.memory_resource cimport ( + BinningMemoryResource, + CallbackMemoryResource, + CudaAsyncMemoryResource, + CudaMemoryResource, + DeviceMemoryResource, + FailureCallbackResourceAdaptor, + FixedSizeMemoryResource, + LimitingResourceAdaptor, + LoggingResourceAdaptor, + ManagedMemoryResource, + PoolMemoryResource, + PrefetchResourceAdaptor, + 
SamHeadroomMemoryResource, + StatisticsResourceAdaptor, + SystemMemoryResource, + TrackingResourceAdaptor, + UpstreamResourceAdaptor, + get_current_device_resource, +) diff --git a/python/rmm/rmm/_lib/per_device_resource.pxd b/python/rmm/rmm/_lib/per_device_resource.pxd index c33217622..29487f503 100644 --- a/python/rmm/rmm/_lib/per_device_resource.pxd +++ b/python/rmm/rmm/_lib/per_device_resource.pxd @@ -1,23 +1,21 @@ -from rmm._lib.memory_resource cimport device_memory_resource +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. - -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef cppclass cuda_device_id: - ctypedef int value_type - - cuda_device_id(value_type id) - - value_type value() - -cdef extern from "rmm/mr/device/per_device_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef device_memory_resource* set_current_device_resource( - device_memory_resource* new_mr - ) - cdef device_memory_resource* get_current_device_resource() - cdef device_memory_resource* set_per_device_resource( - cuda_device_id id, device_memory_resource* new_mr - ) - cdef device_memory_resource* get_per_device_resource ( - cuda_device_id id - ) +from rmm.librmm.per_device_resource cimport ( + cuda_device_id, + get_current_device_resource, + get_per_device_resource, + set_current_device_resource, + set_per_device_resource, +) diff --git a/python/rmm/rmm/allocators/cupy.py b/python/rmm/rmm/allocators/cupy.py index 89947c46b..780ff2abf 100644 --- a/python/rmm/rmm/allocators/cupy.py +++ b/python/rmm/rmm/allocators/cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from rmm import _lib as librmm +from rmm import pylibrmm from rmm._cuda.stream import Stream try: @@ -34,7 +34,7 @@ def rmm_cupy_allocator(nbytes): raise ModuleNotFoundError("No module named 'cupy'") stream = Stream(obj=cupy.cuda.get_current_stream()) - buf = librmm.device_buffer.DeviceBuffer(size=nbytes, stream=stream) + buf = pylibrmm.device_buffer.DeviceBuffer(size=nbytes, stream=stream) dev_id = -1 if buf.ptr else cupy.cuda.device.get_device_id() mem = cupy.cuda.UnownedMemory( ptr=buf.ptr, size=buf.size, owner=buf, device_id=dev_id diff --git a/python/rmm/rmm/allocators/numba.py b/python/rmm/rmm/allocators/numba.py index 5e87b87b6..fd9bacb5a 100644 --- a/python/rmm/rmm/allocators/numba.py +++ b/python/rmm/rmm/allocators/numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ from numba import config, cuda from numba.cuda import HostOnlyCUDAMemoryManager, IpcHandle, MemoryPointer -from rmm import _lib as librmm +from rmm import pylibrmm def _make_emm_plugin_finalizer(handle, allocations): @@ -70,7 +70,7 @@ def memalloc(self, size): """ Allocate an on-device array from the RMM pool. """ - buf = librmm.DeviceBuffer(size=size) + buf = pylibrmm.DeviceBuffer(size=size) ctx = self.context if config.CUDA_USE_NVIDIA_BINDING: diff --git a/python/rmm/rmm/allocators/torch.py b/python/rmm/rmm/allocators/torch.py index 753da66da..eee0e9df9 100644 --- a/python/rmm/rmm/allocators/torch.py +++ b/python/rmm/rmm/allocators/torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,10 +28,10 @@ # allocator .so relative to the current file because the current file # is pure Python and will therefore be in the source directory. # Instead, we search relative to an arbitrary file in the compiled - # package. We use the _lib.lib module because it is small. - from rmm._lib import lib + # package. We use the librmm._logger module because it is small. + from rmm.librmm import _logger - sofile = pathlib.Path(lib.__file__).parent / "_torch_allocator.so" + sofile = pathlib.Path(_logger.__file__).parent / "_torch_allocator.so" rmm_torch_allocator = CUDAPluggableAllocator( str(sofile.absolute()), alloc_fn_name="allocate", diff --git a/python/rmm/rmm/_lib/CMakeLists.txt b/python/rmm/rmm/librmm/CMakeLists.txt similarity index 93% rename from python/rmm/rmm/_lib/CMakeLists.txt rename to python/rmm/rmm/librmm/CMakeLists.txt index 7cdfed971..5da2a1a01 100644 --- a/python/rmm/rmm/_lib/CMakeLists.txt +++ b/python/rmm/rmm/librmm/CMakeLists.txt @@ -12,8 +12,7 @@ # the License. # ============================================================================= -set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx - helper.pyx) +set(cython_sources _logger.pyx) set(linked_libraries rmm::rmm) # Build all of the Cython targets diff --git a/python/rmm/rmm/_lib/__init__.pxd b/python/rmm/rmm/librmm/__init__.py similarity index 100% rename from python/rmm/rmm/_lib/__init__.pxd rename to python/rmm/rmm/librmm/__init__.py diff --git a/python/rmm/rmm/librmm/_logger.pxd b/python/rmm/rmm/librmm/_logger.pxd new file mode 100644 index 000000000..241a748c3 --- /dev/null +++ b/python/rmm/rmm/librmm/_logger.pxd @@ -0,0 +1,66 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcpp cimport bool + + +cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil: + cpdef enum logging_level "spdlog::level::level_enum": + """ + The debug logging level for RMM. + + Debug logging prints messages to a log file. See + `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_ + for more information.
+ + Valid levels, in decreasing order of verbosity, are TRACE, DEBUG, + INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO. + + Examples + -------- + >>> import rmm + >>> rmm.logging_level.DEBUG + <logging_level.DEBUG: 1> + >>> rmm.logging_level.DEBUG.value + 1 + >>> rmm.logging_level.DEBUG.name + 'DEBUG' + + See Also + -------- + set_logging_level : Set the debug logging level + get_logging_level : Get the current debug logging level + """ + TRACE "spdlog::level::trace" + DEBUG "spdlog::level::debug" + INFO "spdlog::level::info" + WARN "spdlog::level::warn" + ERR "spdlog::level::err" + CRITICAL "spdlog::level::critical" + OFF "spdlog::level::off" + + +cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil: + cdef cppclass spdlog_logger "spdlog::logger": + spdlog_logger() except + + void set_level(logging_level level) + logging_level level() + void flush() except + + void flush_on(logging_level level) + logging_level flush_level() + bool should_log(logging_level msg_level) + + +cdef extern from "rmm/logger.hpp" namespace "rmm" nogil: + cdef spdlog_logger& logger() except + diff --git a/python/rmm/rmm/_lib/lib.pxd b/python/rmm/rmm/librmm/_logger.pyx similarity index 70% rename from python/rmm/rmm/_lib/lib.pxd rename to python/rmm/rmm/librmm/_logger.pyx index e35b672e4..4392cb106 100644 --- a/python/rmm/rmm/_lib/lib.pxd +++ b/python/rmm/rmm/librmm/_logger.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from libc.stdint cimport uintptr_t -from libcpp cimport bool -from libcpp.utility cimport pair -from libcpp.vector cimport vector - -ctypedef pair[const char*, unsigned int] caller_pair +from rmm.librmm._logger cimport logging_level # no-cython-lint diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/librmm/_torch_allocator.cpp similarity index 100% rename from python/rmm/rmm/_lib/_torch_allocator.cpp rename to python/rmm/rmm/librmm/_torch_allocator.cpp diff --git a/python/rmm/rmm/librmm/cuda_stream.pxd b/python/rmm/rmm/librmm/cuda_stream.pxd new file mode 100644 index 000000000..3f2ac3361 --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream.pxd @@ -0,0 +1,28 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream: + cuda_stream() except + + bool is_valid() except + + cudaStream_t value() except + + cuda_stream_view view() except + + void synchronize() except + + void synchronize_no_throw() diff --git a/python/rmm/rmm/librmm/cuda_stream_pool.pxd b/python/rmm/rmm/librmm/cuda_stream_pool.pxd new file mode 100644 index 000000000..4f2cbb36d --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream_pool.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream_pool.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_pool: + cuda_stream_pool(size_t pool_size) + cuda_stream_view get_stream() + cuda_stream_view get_stream(size_t stream_id) except + + size_t get_pool_size() diff --git a/python/rmm/rmm/librmm/cuda_stream_view.pxd b/python/rmm/rmm/librmm/cuda_stream_view.pxd new file mode 100644 index 000000000..bf0d33c24 --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream_view.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + + +cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_view: + cuda_stream_view() + cuda_stream_view(cudaStream_t) + cudaStream_t value() + bool is_default() + bool is_per_thread_default() + void synchronize() except + + + cdef bool operator==(cuda_stream_view const, cuda_stream_view const) + + const cuda_stream_view cuda_stream_default + const cuda_stream_view cuda_stream_legacy + const cuda_stream_view cuda_stream_per_thread diff --git a/python/rmm/rmm/librmm/device_buffer.pxd b/python/rmm/rmm/librmm/device_buffer.pxd new file mode 100644 index 000000000..1c503ac9a --- /dev/null +++ b/python/rmm/rmm/librmm/device_buffer.pxd @@ -0,0 +1,58 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + cuda_device_id() + cuda_device_id(value_type id) + value_type value() + + cdef cuda_device_id get_current_cuda_device() + +cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil: + cdef void prefetch(const void* ptr, + size_t bytes, + cuda_device_id device, + cuda_stream_view stream) except + + +cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: + cdef cppclass device_buffer: + device_buffer() + device_buffer( + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const void* source_data, + size_t size, + cuda_stream_view stream, + device_memory_resource * + ) except + + device_buffer( + const device_buffer buf, + cuda_stream_view stream, + device_memory_resource * + ) except + + void reserve(size_t new_capacity, cuda_stream_view stream) except + + void resize(size_t new_size, cuda_stream_view stream) except + + void shrink_to_fit(cuda_stream_view stream) except + + void* data() + size_t size() + size_t capacity() diff --git a/python/rmm/rmm/librmm/device_uvector.pxd b/python/rmm/rmm/librmm/device_uvector.pxd new file mode 100644 index 000000000..f560a9e38 --- /dev/null +++ b/python/rmm/rmm/librmm/device_uvector.pxd @@ -0,0 +1,39 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.device_buffer cimport device_buffer +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: + cdef cppclass device_uvector[T]: + device_uvector(size_t size, cuda_stream_view stream) except + + T* element_ptr(size_t index) + void set_element(size_t element_index, const T& v, cuda_stream_view s) + void set_element_async( + size_t element_index, + const T& v, + cuda_stream_view s + ) except + + T front_element(cuda_stream_view s) except + + T back_element(cuda_stream_view s) except + + void reserve(size_t new_capacity, cuda_stream_view stream) except + + void resize(size_t new_size, cuda_stream_view stream) except + + void shrink_to_fit(cuda_stream_view stream) except + + device_buffer release() + size_t capacity() + T* data() + size_t size() + device_memory_resource* memory_resource() diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd new file mode 100644 index 000000000..9ddaf04b9 --- /dev/null +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -0,0 +1,230 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This import is needed for Cython typing in translate_python_except_to_cpp +# See https://github.com/cython/cython/issues/5589 +from builtins import BaseException + +from libc.stddef cimport size_t +from libc.stdint cimport int8_t, int64_t +from libcpp cimport bool +from libcpp.optional cimport optional +from libcpp.pair cimport pair +from libcpp.string cimport string + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass device_memory_resource: + void* allocate(size_t bytes) except + + void* allocate(size_t bytes, cuda_stream_view stream) except + + void deallocate(void* ptr, size_t bytes) except + + void deallocate( + void* ptr, + size_t bytes, + cuda_stream_view stream + ) except + + +cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil: + size_t percent_of_free_device_memory(int percent) except + + pair[size_t, size_t] available_device_memory() except + + +# Transparent handle of a C++ exception +ctypedef pair[int, string] CppExcept + +cdef inline CppExcept translate_python_except_to_cpp(err: BaseException) noexcept: + """Translate a Python exception into a C++ exception handle + + The returned exception handle can then be thrown by `throw_cpp_except()`, + which MUST be done without holding the GIL. + + This is useful when C++ calls a Python function and needs to catch or + propagate exceptions. + """ + if isinstance(err, MemoryError): + return CppExcept(0, str.encode(str(err))) + return CppExcept(-1, str.encode(str(err))) + +# Implementation of `throw_cpp_except()`, which throws a given `CppExcept`. 
+# This function MUST be called without the GIL otherwise the thrown C++
+# exception is translated back into a Python exception.
+cdef extern from *:
+    """
+    #include <rmm/error.hpp>
+    #include <stdexcept>
+
+    void throw_cpp_except(std::pair<int, std::string> res) {
+        switch(res.first) {
+            case 0:
+                throw rmm::out_of_memory(res.second);
+            default:
+                throw std::runtime_error(res.second);
+        }
+    }
+    """
+    void throw_cpp_except(CppExcept) nogil
+
+
+cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass cuda_memory_resource(device_memory_resource):
+        cuda_memory_resource() except +
+
+cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass managed_memory_resource(device_memory_resource):
+        managed_memory_resource() except +
+
+cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass system_memory_resource(device_memory_resource):
+        system_memory_resource() except +
+
+cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass sam_headroom_memory_resource(device_memory_resource):
+        sam_headroom_memory_resource(size_t headroom) except +
+
+cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+
+    cdef cppclass cuda_async_memory_resource(device_memory_resource):
+        cuda_async_memory_resource(
+            optional[size_t] initial_pool_size,
+            optional[size_t] release_threshold,
+            optional[allocation_handle_type] export_handle_type) except +
+
+# TODO: when we adopt Cython 3.0 use enum class
+cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
+        namespace \
+        "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \
+        nogil:
+    enum allocation_handle_type \
+            "rmm::mr::cuda_async_memory_resource::allocation_handle_type":
+        none
+        posix_file_descriptor
+        win32
+        win32_kmt
+
+
+cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass pool_memory_resource[Upstream](device_memory_resource):
+        pool_memory_resource(
+            Upstream* upstream_mr,
+            size_t initial_pool_size,
+            optional[size_t] maximum_pool_size) except +
+        size_t pool_size()
+
+cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource):
+        fixed_size_memory_resource(
+            Upstream* upstream_mr,
+            size_t block_size,
+            size_t block_to_preallocate) except +
+
+cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    ctypedef void* (*allocate_callback_t)(size_t, cuda_stream_view, void*)
+    ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*)
+
+    cdef cppclass callback_memory_resource(device_memory_resource):
+        callback_memory_resource(
+            allocate_callback_t allocate_callback,
+            deallocate_callback_t deallocate_callback,
+            void* allocate_callback_arg,
+            void* deallocate_callback_arg
+        ) except +
+
+cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass binning_memory_resource[Upstream](device_memory_resource):
+        binning_memory_resource(Upstream* upstream_mr) except +
+        binning_memory_resource(
+            Upstream* upstream_mr,
+            int8_t min_size_exponent,
+            int8_t max_size_exponent) except +
+
+        void add_bin(size_t allocation_size) except +
+        void add_bin(
+            size_t allocation_size,
+            device_memory_resource* bin_resource) except +
+cdef extern from
"rmm/mr/device/limiting_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): + limiting_resource_adaptor( + Upstream* upstream_mr, + size_t allocation_limit) except + + + size_t get_allocated_bytes() except + + size_t get_allocation_limit() except + + +cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): + logging_resource_adaptor( + Upstream* upstream_mr, + string filename) except + + + void flush() except + + +cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): + struct counter: + counter() + + int64_t value + int64_t peak + int64_t total + + statistics_resource_adaptor(Upstream* upstream_mr) except + + + counter get_bytes_counter() except + + counter get_allocations_counter() except + + pair[counter, counter] pop_counters() except + + pair[counter, counter] push_counters() except + + +cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): + tracking_resource_adaptor( + Upstream* upstream_mr, + bool capture_stacks) except + + + size_t get_allocated_bytes() except + + string get_outstanding_allocations_str() except + + void log_outstanding_allocations() except + + +cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + ctypedef bool (*failure_callback_t)(size_t, void*) + cdef cppclass failure_callback_resource_adaptor[Upstream]( + device_memory_resource + ): + failure_callback_resource_adaptor( + Upstream* upstream_mr, + failure_callback_t callback, + void* callback_arg + ) except + + +cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): + prefetch_resource_adaptor(Upstream* upstream_mr) except + diff --git a/python/rmm/rmm/librmm/per_device_resource.pxd b/python/rmm/rmm/librmm/per_device_resource.pxd new file mode 100644 index 000000000..63ee29056 --- /dev/null +++ b/python/rmm/rmm/librmm/per_device_resource.pxd @@ -0,0 +1,36 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + + cuda_device_id(value_type id) + + value_type value() + +cdef extern from "rmm/mr/device/per_device_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef device_memory_resource* set_current_device_resource( + device_memory_resource* new_mr + ) + cdef device_memory_resource* get_current_device_resource() + cdef device_memory_resource* set_per_device_resource( + cuda_device_id id, device_memory_resource* new_mr + ) + cdef device_memory_resource* get_per_device_resource ( + cuda_device_id id + ) diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 6eb94da0f..3f0c3fce3 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from rmm._lib.memory_resource import ( +from rmm.pylibrmm.memory_resource import ( BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, diff --git a/python/rmm/rmm/pylibrmm/CMakeLists.txt b/python/rmm/rmm/pylibrmm/CMakeLists.txt new file mode 100644 index 000000000..0e88f01bb --- /dev/null +++ b/python/rmm/rmm/pylibrmm/CMakeLists.txt @@ -0,0 +1,27 @@ +# ============================================================================= +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources device_buffer.pyx logger.pyx memory_resource.pyx cuda_stream.pyx helper.pyx) +set(linked_libraries rmm::rmm) + +# Build all of the Cython targets +rapids_cython_create_modules(SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" + CXX) + +# mark all symbols in these Cython targets "hidden" by default, so they won't collide with symbols +# loaded from other DSOs +foreach(_cython_target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_target_properties(${_cython_target} PROPERTIES C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden) +endforeach() diff --git a/python/rmm/rmm/pylibrmm/__init__.py b/python/rmm/rmm/pylibrmm/__init__.py new file mode 100644 index 000000000..0b8672ef6 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .device_buffer import DeviceBuffer diff --git a/python/rmm/rmm/pylibrmm/cuda_stream.pxd b/python/rmm/rmm/pylibrmm/cuda_stream.pxd new file mode 100644 index 000000000..dd38387c2 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/cuda_stream.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cimport cython +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool +from libcpp.memory cimport unique_ptr + +from rmm.librmm.cuda_stream cimport cuda_stream + + +@cython.final +cdef class CudaStream: + cdef unique_ptr[cuda_stream] c_obj + cdef cudaStream_t value(self) except * nogil + cdef bool is_valid(self) except * nogil diff --git a/python/rmm/rmm/_lib/cuda_stream.pyx b/python/rmm/rmm/pylibrmm/cuda_stream.pyx similarity index 91% rename from python/rmm/rmm/_lib/cuda_stream.pyx rename to python/rmm/rmm/pylibrmm/cuda_stream.pyx index 0861f0663..d6aa4edc7 100644 --- a/python/rmm/rmm/_lib/cuda_stream.pyx +++ b/python/rmm/rmm/pylibrmm/cuda_stream.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ cimport cython from cuda.ccudart cimport cudaStream_t from libcpp cimport bool +from rmm.librmm.cuda_stream cimport cuda_stream + @cython.final cdef class CudaStream: diff --git a/python/rmm/rmm/pylibrmm/device_buffer.pxd b/python/rmm/rmm/pylibrmm/device_buffer.pxd new file mode 100644 index 000000000..a0d287423 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/device_buffer.pxd @@ -0,0 +1,71 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libc.stdint cimport uintptr_t +from libcpp.memory cimport unique_ptr + +from rmm._cuda.stream cimport Stream +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource + + +cdef class DeviceBuffer: + cdef unique_ptr[device_buffer] c_obj + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + # Holds a reference to the stream used by the underlying `device_buffer`. 
+ # Ensures the stream does not get destroyed before this DeviceBuffer + cdef Stream stream + + @staticmethod + cdef DeviceBuffer c_from_unique_ptr( + unique_ptr[device_buffer] ptr, + Stream stream=*, + DeviceMemoryResource mr=*, + ) + + @staticmethod + cdef DeviceBuffer c_to_device(const unsigned char[::1] b, + Stream stream=*) except * + cpdef copy_to_host(self, ary=*, Stream stream=*) + cpdef copy_from_host(self, ary, Stream stream=*) + cpdef copy_from_device(self, cuda_ary, Stream stream=*) + cpdef bytes tobytes(self, Stream stream=*) + + cdef size_t c_size(self) except * + cpdef void reserve(self, size_t new_capacity, Stream stream=*) except * + cpdef void resize(self, size_t new_size, Stream stream=*) except * + cpdef size_t capacity(self) except * + cdef void* c_data(self) except * + + cdef device_buffer c_release(self) except * + +cpdef DeviceBuffer to_device(const unsigned char[::1] b, + Stream stream=*) +cpdef void copy_ptr_to_host(uintptr_t db, + unsigned char[::1] hb, + Stream stream=*) except * + +cpdef void copy_host_to_ptr(const unsigned char[::1] hb, + uintptr_t db, + Stream stream=*) except * + +cpdef void copy_device_to_ptr(uintptr_t d_src, + uintptr_t d_dst, + size_t count, + Stream stream=*) except * diff --git a/python/rmm/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/pylibrmm/device_buffer.pyx similarity index 96% rename from python/rmm/rmm/_lib/device_buffer.pyx rename to python/rmm/rmm/pylibrmm/device_buffer.pyx index 94a4dc771..76fbceef8 100644 --- a/python/rmm/rmm/_lib/device_buffer.pyx +++ b/python/rmm/rmm/pylibrmm/device_buffer.pyx @@ -32,9 +32,16 @@ from cuda.ccudart cimport ( cudaStream_t, ) -from rmm._lib.memory_resource cimport ( +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.device_buffer cimport ( + cuda_device_id, + device_buffer, + get_current_cuda_device, + prefetch, +) +from rmm.librmm.memory_resource cimport device_memory_resource +from rmm.pylibrmm.memory_resource cimport ( DeviceMemoryResource, - device_memory_resource, get_current_device_resource, ) @@ -394,7 +401,7 @@ cpdef DeviceBuffer to_device(const unsigned char[::1] b, Examples -------- >>> import rmm - >>> db = rmm._lib.device_buffer.to_device(b"abc") + >>> db = rmm.pylibrmm.device_buffer.to_device(b"abc") >>> print(bytes(db)) b'abc' """ @@ -460,7 +467,7 @@ cpdef void copy_ptr_to_host(uintptr_t db, >>> import rmm >>> db = rmm.DeviceBuffer.to_device(b"abc") >>> hb = bytearray(db.nbytes) - >>> rmm._lib.device_buffer.copy_ptr_to_host(db.ptr, hb) + >>> rmm.pylibrmm.device_buffer.copy_ptr_to_host(db.ptr, hb) >>> print(hb) bytearray(b'abc') """ @@ -502,7 +509,7 @@ cpdef void copy_host_to_ptr(const unsigned char[::1] hb, >>> import rmm >>> db = rmm.DeviceBuffer(size=10) >>> hb = b"abc" - >>> rmm._lib.device_buffer.copy_host_to_ptr(hb, db.ptr) + >>> rmm.pylibrmm.device_buffer.copy_host_to_ptr(hb, db.ptr) >>> hb = db.copy_to_host() >>> print(hb) array([97, 98, 99, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) @@ -541,7 +548,7 @@ cpdef void copy_device_to_ptr(uintptr_t d_src, >>> import rmm >>> db = rmm.DeviceBuffer(size=5) >>> db2 = rmm.DeviceBuffer.to_device(b"abc") - >>> rmm._lib.device_buffer.copy_device_to_ptr(db2.ptr, db.ptr, db2.size) + >>> rmm.pylibrmm.device_buffer.copy_device_to_ptr(db2.ptr, db.ptr, db2.size) >>> hb = db.copy_to_host() >>> hb array([97, 98, 99, 0, 0], dtype=uint8) diff --git a/python/rmm/rmm/_lib/lib.pyx b/python/rmm/rmm/pylibrmm/helper.pxd similarity index 86% rename from python/rmm/rmm/_lib/lib.pyx rename to python/rmm/rmm/pylibrmm/helper.pxd index 
46753baa3..8ca151c00 100644
--- a/python/rmm/rmm/_lib/lib.pyx
+++ b/python/rmm/rmm/pylibrmm/helper.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,3 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+
+cdef object parse_bytes(object s) except *
diff --git a/python/rmm/rmm/_lib/helper.pyx b/python/rmm/rmm/pylibrmm/helper.pyx
similarity index 100%
rename from python/rmm/rmm/_lib/helper.pyx
rename to python/rmm/rmm/pylibrmm/helper.pyx
diff --git a/python/rmm/rmm/_lib/logger.pyx b/python/rmm/rmm/pylibrmm/logger.pyx
similarity index 77%
rename from python/rmm/rmm/_lib/logger.pyx
rename to python/rmm/rmm/pylibrmm/logger.pyx
index 029bbdd79..119e1c92f 100644
--- a/python/rmm/rmm/_lib/logger.pyx
+++ b/python/rmm/rmm/pylibrmm/logger.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,58 +14,9 @@
 
 import warnings
 
-from libcpp cimport bool
-
-
-cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil:
-    cpdef enum logging_level "spdlog::level::level_enum":
-        """
-        The debug logging level for RMM.
-
-        Debug logging prints messages to a log file. See
-        `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
-        for more information.
-
-        Valid levels, in decreasing order of verbosity, are TRACE, DEBUG,
-        INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO.
-
-        Examples
-        --------
-        >>> import rmm
-        >>> rmm.logging_level.DEBUG
-        <logging_level.DEBUG: 1>
-        >>> rmm.logging_level.DEBUG.value
-        1
-        >>> rmm.logging_level.DEBUG.name
-        'DEBUG'
-
-        See Also
-        --------
-        set_logging_level : Set the debug logging level
-        get_logging_level : Get the current debug logging level
-        """
-        TRACE "spdlog::level::trace"
-        DEBUG "spdlog::level::debug"
-        INFO "spdlog::level::info"
-        WARN "spdlog::level::warn"
-        ERR "spdlog::level::err"
-        CRITICAL "spdlog::level::critical"
-        OFF "spdlog::level::off"
-
-
-cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil:
-    cdef cppclass spdlog_logger "spdlog::logger":
-        spdlog_logger() except +
-        void set_level(logging_level level)
-        logging_level level()
-        void flush() except +
-        void flush_on(logging_level level)
-        logging_level flush_level()
-        bool should_log(logging_level msg_level)
-
-
-cdef extern from "rmm/logger.hpp" namespace "rmm" nogil:
-    cdef spdlog_logger& logger() except +
+from rmm.librmm._logger cimport logger
+
+from rmm.librmm._logger import logging_level
 
 
 def _validate_level_type(level):
diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pxd b/python/rmm/rmm/pylibrmm/memory_resource.pxd
new file mode 100644
index 000000000..985d5d31b
--- /dev/null
+++ b/python/rmm/rmm/pylibrmm/memory_resource.pxd
@@ -0,0 +1,83 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcpp.memory cimport shared_ptr + +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef class DeviceMemoryResource: + cdef shared_ptr[device_memory_resource] c_obj + cdef device_memory_resource* get_mr(self) noexcept nogil + +cdef class UpstreamResourceAdaptor(DeviceMemoryResource): + cdef readonly DeviceMemoryResource upstream_mr + + cpdef DeviceMemoryResource get_upstream(self) + +cdef class CudaMemoryResource(DeviceMemoryResource): + pass + +cdef class ManagedMemoryResource(DeviceMemoryResource): + pass + +cdef class SystemMemoryResource(DeviceMemoryResource): + pass + +cdef class SamHeadroomMemoryResource(DeviceMemoryResource): + pass + +cdef class CudaAsyncMemoryResource(DeviceMemoryResource): + pass + +cdef class PoolMemoryResource(UpstreamResourceAdaptor): + pass + +cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): + pass + +cdef class BinningMemoryResource(UpstreamResourceAdaptor): + + cdef readonly list _bin_mrs + + cpdef add_bin( + self, + size_t allocation_size, + DeviceMemoryResource bin_resource=*) + +cdef class CallbackMemoryResource(DeviceMemoryResource): + cdef object _allocate_func + cdef object _deallocate_func + +cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor): + cdef object _log_file_name + cpdef get_file_name(self) + cpdef flush(self) + +cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): + cdef object _callback + +cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): + pass + +cpdef DeviceMemoryResource get_current_device_resource() diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx similarity index 82% rename from python/rmm/rmm/_lib/memory_resource.pyx rename to python/rmm/rmm/pylibrmm/memory_resource.pyx index 231253e3f..021125567 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -22,12 +22,11 @@ from collections import defaultdict cimport cython from cython.operator cimport dereference as deref from libc.stddef cimport size_t -from libc.stdint cimport int8_t, int64_t, uintptr_t +from libc.stdint cimport int8_t, uintptr_t from libcpp cimport bool from libcpp.memory cimport make_unique, unique_ptr from libcpp.optional cimport optional from libcpp.pair cimport pair -from libcpp.string cimport string from cuda.cudart import cudaError_t @@ -37,206 +36,43 @@ from rmm._cuda.stream cimport Stream from rmm._cuda.stream import DEFAULT_STREAM -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.helper cimport parse_bytes -from rmm._lib.memory_resource cimport ( - available_device_memory as c_available_device_memory, - percent_of_free_device_memory as c_percent_of_free_device_memory, -) -from rmm._lib.per_device_resource cimport ( +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.per_device_resource cimport ( cuda_device_id, 
     set_per_device_resource as cpp_set_per_device_resource,
 )
+from rmm.pylibrmm.helper cimport parse_bytes
 
 from rmm.statistics import Statistics
 
 
-# Transparent handle of a C++ exception
-ctypedef pair[int, string] CppExcept
-
-cdef CppExcept translate_python_except_to_cpp(err: BaseException) noexcept:
-    """Translate a Python exception into a C++ exception handle
-
-    The returned exception handle can then be thrown by `throw_cpp_except()`,
-    which MUST be done without holding the GIL.
-
-    This is useful when C++ calls a Python function and needs to catch or
-    propagate exceptions.
-    """
-    if isinstance(err, MemoryError):
-        return CppExcept(0, str.encode(str(err)))
-    return CppExcept(-1, str.encode(str(err)))
-
-# Implementation of `throw_cpp_except()`, which throws a given `CppExcept`.
-# This function MUST be called without the GIL otherwise the thrown C++
-# exception are translated back into a Python exception.
-cdef extern from *:
-    """
-    #include <rmm/error.hpp>
-    #include <stdexcept>
-
-    void throw_cpp_except(std::pair<int, std::string> res) {
-        switch(res.first) {
-            case 0:
-                throw rmm::out_of_memory(res.second);
-            default:
-                throw std::runtime_error(res.second);
-        }
-    }
-    """
-    void throw_cpp_except(CppExcept) nogil
-
-
-# NOTE: Keep extern declarations in .pyx file as much as possible to avoid
-# leaking dependencies when importing RMM Cython .pxd files
-cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass cuda_memory_resource(device_memory_resource):
-        cuda_memory_resource() except +
-
-cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass managed_memory_resource(device_memory_resource):
-        managed_memory_resource() except +
-
-cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass system_memory_resource(device_memory_resource):
-        system_memory_resource() except +
-
-cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass sam_headroom_memory_resource(device_memory_resource):
-        sam_headroom_memory_resource(size_t headroom) except +
-
-cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-
-    cdef cppclass cuda_async_memory_resource(device_memory_resource):
-        cuda_async_memory_resource(
-            optional[size_t] initial_pool_size,
-            optional[size_t] release_threshold,
-            optional[allocation_handle_type] export_handle_type) except +
-
-# TODO: when we adopt Cython 3.0 use enum class
-cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
-        namespace \
-        "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \
-        nogil:
-    enum allocation_handle_type \
-            "rmm::mr::cuda_async_memory_resource::allocation_handle_type":
-        none
-        posix_file_descriptor
-        win32
-        win32_kmt
-
-
-cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass pool_memory_resource[Upstream](device_memory_resource):
-        pool_memory_resource(
-            Upstream* upstream_mr,
-            size_t initial_pool_size,
-            optional[size_t] maximum_pool_size) except +
-        size_t pool_size()
-
-cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource):
-        fixed_size_memory_resource(
-            Upstream* upstream_mr,
-            size_t block_size,
-            size_t block_to_preallocate) except +
-
-cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    ctypedef void*
(*allocate_callback_t)(size_t, cuda_stream_view, void*) - ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*) - - cdef cppclass callback_memory_resource(device_memory_resource): - callback_memory_resource( - allocate_callback_t allocate_callback, - deallocate_callback_t deallocate_callback, - void* allocate_callback_arg, - void* deallocate_callback_arg - ) except + - -cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass binning_memory_resource[Upstream](device_memory_resource): - binning_memory_resource(Upstream* upstream_mr) except + - binning_memory_resource( - Upstream* upstream_mr, - int8_t min_size_exponent, - int8_t max_size_exponent) except + - - void add_bin(size_t allocation_size) except + - void add_bin( - size_t allocation_size, - device_memory_resource* bin_resource) except + - -cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): - limiting_resource_adaptor( - Upstream* upstream_mr, - size_t allocation_limit) except + - - size_t get_allocated_bytes() except + - size_t get_allocation_limit() except + - -cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): - logging_resource_adaptor( - Upstream* upstream_mr, - string filename) except + - - void flush() except + - -cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): - struct counter: - counter() - - int64_t value - int64_t peak - int64_t total - - statistics_resource_adaptor(Upstream* upstream_mr) except + - - counter get_bytes_counter() except + - counter get_allocations_counter() except + - pair[counter, counter] pop_counters() except + - pair[counter, counter] push_counters() except + - -cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): - tracking_resource_adaptor( - Upstream* upstream_mr, - bool capture_stacks) except + - - size_t get_allocated_bytes() except + - string get_outstanding_allocations_str() except + - void log_outstanding_allocations() except + - -cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - ctypedef bool (*failure_callback_t)(size_t, void*) - cdef cppclass failure_callback_resource_adaptor[Upstream]( - device_memory_resource - ): - failure_callback_resource_adaptor( - Upstream* upstream_mr, - failure_callback_t callback, - void* callback_arg - ) except + - -cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): - prefetch_resource_adaptor(Upstream* upstream_mr) except + +from rmm.librmm.memory_resource cimport ( + CppExcept, + allocate_callback_t, + allocation_handle_type, + available_device_memory as c_available_device_memory, + binning_memory_resource, + callback_memory_resource, + cuda_async_memory_resource, + cuda_memory_resource, + deallocate_callback_t, + device_memory_resource, + failure_callback_resource_adaptor, + failure_callback_t, + fixed_size_memory_resource, + limiting_resource_adaptor, + logging_resource_adaptor, + managed_memory_resource, + percent_of_free_device_memory as 
c_percent_of_free_device_memory,
+    pool_memory_resource,
+    posix_file_descriptor,
+    prefetch_resource_adaptor,
+    sam_headroom_memory_resource,
+    statistics_resource_adaptor,
+    system_memory_resource,
+    throw_cpp_except,
+    tracking_resource_adaptor,
+    translate_python_except_to_cpp,
+)
 
 
 cdef class DeviceMemoryResource:
diff --git a/python/rmm/rmm/_lib/tests/__init__.py b/python/rmm/rmm/pylibrmm/tests/__init__.py
similarity index 100%
rename from python/rmm/rmm/_lib/tests/__init__.py
rename to python/rmm/rmm/pylibrmm/tests/__init__.py
diff --git a/python/rmm/rmm/_lib/tests/test_device_buffer.pyx b/python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx
similarity index 83%
rename from python/rmm/rmm/_lib/tests/test_device_buffer.pyx
rename to python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx
index 733383827..ec2ff4def 100644
--- a/python/rmm/rmm/_lib/tests/test_device_buffer.pyx
+++ b/python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,8 +16,9 @@ import numpy as np
 
 from libcpp.memory cimport make_unique
 
-from rmm._lib.cuda_stream_view cimport cuda_stream_default
-from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+from rmm.librmm.cuda_stream_view cimport cuda_stream_default
+from rmm.librmm.device_buffer cimport device_buffer
+from rmm.pylibrmm.device_buffer cimport DeviceBuffer
 
 
 def test_release():
diff --git a/python/rmm/rmm/tests/test_cython.py b/python/rmm/rmm/tests/test_cython.py
index 82eba2451..5df933435 100644
--- a/python/rmm/rmm/tests/test_cython.py
+++ b/python/rmm/rmm/tests/test_cython.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,7 +29,7 @@ def wrapped(*args, **kwargs):
     return wrapped
 
 
-cython_test_modules = ["rmm._lib.tests.test_device_buffer"]
+cython_test_modules = ["rmm.pylibrmm.tests.test_device_buffer"]
 
 
 for mod in cython_test_modules:
diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py
index c88d21b38..c03b9e501 100644
--- a/python/rmm/rmm/tests/test_rmm.py
+++ b/python/rmm/rmm/tests/test_rmm.py
@@ -354,7 +354,7 @@ def test_rmm_pool_numba_stream(stream):
     rmm.reinitialize(pool_allocator=True)
 
     stream = rmm._cuda.stream.Stream(stream)
-    a = rmm._lib.device_buffer.DeviceBuffer(size=3, stream=stream)
+    a = rmm.pylibrmm.device_buffer.DeviceBuffer(size=3, stream=stream)
 
     assert a.size == 3
     assert a.ptr != 0

From 815003232d90a45fe6867214e73284649c639066 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 4 Oct 2024 15:10:01 -0400
Subject: [PATCH 05/29] Fix `rmm._lib` imports (#1693)

This PR fixes a bug in #1676. It makes sure that rmm imports work correctly
using both `from rmm._lib...` and `import rmm._lib...` syntax. I'm adding DO
NOT MERGE until I do some more testing.
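The compatibility layer is just a set of thin re-export modules. A minimal
sketch of the pattern, using a hypothetical `mypkg` package in place of rmm
(the real shims are the `rmm/_lib/*.py` files added below):

    # mypkg/_lib/device_buffer.py -- the legacy path kept as a real module
    # on disk that only re-exports from the code's new home.
    from mypkg.pylib.device_buffer import DeviceBuffer  # noqa: F401

Because the legacy path is a real module file rather than a name injected
into a package namespace at import time, both of these styles now resolve:

    from mypkg._lib.device_buffer import DeviceBuffer
    import mypkg._lib.device_buffer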
Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1693 --- python/rmm/rmm/_lib/cuda_stream.py | 15 +++++++++ python/rmm/rmm/_lib/device_buffer.py | 21 ++++++++++++ python/rmm/rmm/_lib/logger.py | 24 ++++++++++++++ python/rmm/rmm/_lib/memory_resource.py | 44 ++++++++++++++++++++++++++ 4 files changed, 104 insertions(+) create mode 100644 python/rmm/rmm/_lib/cuda_stream.py create mode 100644 python/rmm/rmm/_lib/device_buffer.py create mode 100644 python/rmm/rmm/_lib/logger.py create mode 100644 python/rmm/rmm/_lib/memory_resource.py diff --git a/python/rmm/rmm/_lib/cuda_stream.py b/python/rmm/rmm/_lib/cuda_stream.py new file mode 100644 index 000000000..1eb424e12 --- /dev/null +++ b/python/rmm/rmm/_lib/cuda_stream.py @@ -0,0 +1,15 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.cuda_stream import CudaStream # noqa: F401 diff --git a/python/rmm/rmm/_lib/device_buffer.py b/python/rmm/rmm/_lib/device_buffer.py new file mode 100644 index 000000000..c531bca5f --- /dev/null +++ b/python/rmm/rmm/_lib/device_buffer.py @@ -0,0 +1,21 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.device_buffer import ( # noqa: F401 + DeviceBuffer, + copy_device_to_ptr, + copy_host_to_ptr, + copy_ptr_to_host, + to_device, +) diff --git a/python/rmm/rmm/_lib/logger.py b/python/rmm/rmm/_lib/logger.py new file mode 100644 index 000000000..1e9b519b8 --- /dev/null +++ b/python/rmm/rmm/_lib/logger.py @@ -0,0 +1,24 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from rmm.librmm._logger import logging_level # noqa: F401 +from rmm.pylibrmm.logger import ( # noqa: F401 + _validate_level_type, + flush_logger, + get_flush_level, + get_logging_level, + set_flush_level, + set_logging_level, + should_log, +) diff --git a/python/rmm/rmm/_lib/memory_resource.py b/python/rmm/rmm/_lib/memory_resource.py new file mode 100644 index 000000000..0d47e8c9b --- /dev/null +++ b/python/rmm/rmm/_lib/memory_resource.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.memory_resource import ( # noqa: F401 + BinningMemoryResource, + CallbackMemoryResource, + CudaAsyncMemoryResource, + CudaMemoryResource, + DeviceMemoryResource, + FailureCallbackResourceAdaptor, + FixedSizeMemoryResource, + LimitingResourceAdaptor, + LoggingResourceAdaptor, + ManagedMemoryResource, + PoolMemoryResource, + PrefetchResourceAdaptor, + SamHeadroomMemoryResource, + StatisticsResourceAdaptor, + SystemMemoryResource, + TrackingResourceAdaptor, + UpstreamResourceAdaptor, + _flush_logs, + available_device_memory, + disable_logging, + enable_logging, + get_current_device_resource, + get_current_device_resource_type, + get_log_filenames, + get_per_device_resource_type, + is_initialized, + set_current_device_resource, + set_per_device_resource, +) From c494395e58288cac16321ce90e9b15f3508ae89a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Fri, 4 Oct 2024 16:18:35 -0400 Subject: [PATCH 06/29] Prune workflows based on changed files (#1695) Contributes to https://github.com/rapidsai/build-planning/issues/94 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/rmm/pull/1695 --- .github/workflows/pr.yaml | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index afc9f7487..4dfcaf1ae 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,6 +12,7 @@ concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-tests @@ -24,6 +25,29 @@ jobs: - devcontainer secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 + if: always() + with: + needs: ${{ toJSON(needs) }} + changed-files: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!img/**' + - '!python/**' + test_python: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!img/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 @@ -36,9 +60,10 @@ jobs: with: build_type: pull-request conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit uses: 
rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: @@ -48,9 +73,10 @@ jobs: with: build_type: pull-request conda-python-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request docs-build: @@ -79,9 +105,10 @@ jobs: build_type: pull-request script: ci/build_wheel_python.sh wheel-tests: - needs: wheel-build-python + needs: [wheel-build-python, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel.sh From 4e519bbf94dd1641dfb69fc171f714c38a7d0894 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 9 Oct 2024 18:09:39 +0100 Subject: [PATCH 07/29] Update cross-link to cuda-python object (#1699) nvidia/cuda-python#137 reorganised the low-level binding structure which broke our cross-linking, update to the new name to fix. - Closes #1698 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1699 --- python/rmm/rmm/pylibrmm/device_buffer.pyx | 2 +- python/rmm/rmm/statistics.py | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/rmm/rmm/pylibrmm/device_buffer.pyx b/python/rmm/rmm/pylibrmm/device_buffer.pyx index 76fbceef8..c2e95e845 100644 --- a/python/rmm/rmm/pylibrmm/device_buffer.pyx +++ b/python/rmm/rmm/pylibrmm/device_buffer.pyx @@ -156,7 +156,7 @@ cdef class DeviceBuffer: device : optional The CUDA device to which to prefetch the memory for this buffer. Defaults to the current CUDA device. To prefetch to the CPU, pass - :py:attr:`~cuda.cudart.cudaCpuDeviceId` as the device. + :py:attr:`~cuda.bindings.runtime.cudaCpuDeviceId` as the device. stream : optional CUDA stream to use for prefetching. Defaults to self.stream """ diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py index 279e45dc6..2dabedce6 100644 --- a/python/rmm/rmm/statistics.py +++ b/python/rmm/rmm/statistics.py @@ -74,8 +74,8 @@ def enable_statistics() -> None: def get_statistics() -> Optional[Statistics]: """Get the current allocation statistics. - Return - ------ + Returns + ------- If enabled, returns the current tracked statistics. If disabled, returns None. """ @@ -94,8 +94,8 @@ def push_statistics() -> Optional[Statistics]: If statistics are disabled (the current memory resource is not an instance of StatisticsResourceAdaptor), this function is a no-op. - Return - ------ + Returns + ------- If enabled, returns the current tracked statistics _before_ the pop. If disabled, returns None. """ @@ -114,8 +114,8 @@ def pop_statistics() -> Optional[Statistics]: If statistics are disabled (the current memory resource is not an instance of StatisticsResourceAdaptor), this function is a no-op. - Return - ------ + Returns + ------- If enabled, returns the popped counters. If disabled, returns None. """ @@ -232,8 +232,8 @@ def report( ordered_by Sort the statistics by this attribute. 
- Return - ------ + Returns + ------- The pretty formatted string of the memory statistics """ @@ -279,8 +279,8 @@ def _get_descriptive_name_of_object(obj: object) -> str: obj Object in question - Return - ------ + Returns + ------- A string including filename, line number, and object name. """ From 69a297d82641fcb61eac92ac6de42658cfa651f6 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 9 Oct 2024 19:00:50 +0100 Subject: [PATCH 08/29] Add BUILD_SHARED_LIBS option defaulting to ON (#1702) This means that downstream libraries that get their `fmt` dependency from RMM will use `-DFMT_SHARED` in the compile command: this matches what the rapids combined devcontainers do, so we get sccache hits. - Closes #1701 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/rmm/pull/1702 --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 39d5dccde..26fcf1fd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,9 @@ rapids_cmake_build_type(Release) option(RMM_NVTX "Build RMM with NVTX support" OFF) option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) +# This is mostly so that dependent libraries, such as fmt, are configured in shared mode for +# downstream dependents of RMM that get their common dependencies transitively. +option(BUILD_SHARED_LIBS "Build RMM shared libraries" ON) set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") From f7155183645f640fa5695a0558d9708703f5b2a6 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 9 Oct 2024 13:28:23 -0500 Subject: [PATCH 09/29] make conda installs in CI stricter (#1696) Contributes to https://github.com/rapidsai/build-planning/issues/106 Proposes specifying the RAPIDS version in `conda install` calls in CI that install CI artifacts, to reduce the risk of CI jobs picking up artifacts from other releases. Authors: - James Lamb (https://github.com/jameslamb) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/rmm/pull/1696 --- ci/build_docs.sh | 9 +++++---- ci/test_cpp.sh | 5 ++++- ci/test_python.sh | 5 ++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 970417c1d..fadaf0f27 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -6,6 +6,9 @@ set -euo pipefail rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" + rapids-dependency-file-generator \ --output conda \ --file-key docs \ @@ -23,11 +26,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - rmm librmm + "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" -export RAPIDS_VERSION="$(rapids-version)" -export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 9ad1c9536..02435f249 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -8,6 +8,8 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ @@ -29,7 +31,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - librmm librmm-tests + "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "librmm-tests=${RAPIDS_VERSION_MAJOR_MINOR}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 386d0b063..7a688107e 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -9,6 +9,8 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-dependency-file-generator \ --output conda \ --file-key test_python \ @@ -28,7 +30,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - rmm librmm + "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ + "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} From 90a5631e1093ce44c4feceb88fcf557c3dfc043b Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:03:33 -0400 Subject: [PATCH 10/29] Fix typos in .gitignore (#1697) Small fix to some typos that cropped up in the .gitignore with #1676 Authors: - Charles Blackmon-Luca (https://github.com/charlesbluca) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1697 --- .gitignore | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 36aafe643..df9d920d5 100644 --- a/.gitignore +++ b/.gitignore @@ -22,13 +22,13 @@ rmm.egg-info/ python/build python/*/build python/rmm/docs/_build -python/rmm/**/librmmm/**/*.cpp -!python/rmm/librmmm/_torch_allocator.cpp +python/rmm/**/librmm/**/*.cpp +!python/rmm/librmm/_torch_allocator.cpp python/rmm/**/librmm/**/*.h python/rmm/**/librmm/.nfs* -python/rmm/**/pylibrmmm/**/*.cpp -python/rmm/**/pylibrmmm/**/*.h -python/rmm/**/pylibrmmm/.nfs* +python/rmm/**/pylibrmm/**/*.cpp +python/rmm/**/pylibrmm/**/*.h +python/rmm/**/pylibrmm/.nfs* python/rmm/_cuda/*.cpp python/rmm/tests/*.cpp python/rmm/*.ipynb From 1b70ffdd5ab460ac481f1575c42e8c1fccfda792 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Oct 2024 15:09:11 -0500 Subject: [PATCH 11/29] make conda installs in CI stricter (part 2) (#1703) Follow-up to #1696 Changes relative to that PR: * switches to pinning CI conda installs to the output of `rapids-version` (`{major}.{minor}.{patch}`) instead of `rapids-version-major-minor` (`{major}.{minor}`), to get a bit more protection in the presence of hotfix releases * restores some exporting of variables needed for docs builds In #1696, I'd missed that this project's Doxygen setup is expecting to find `RAPIDS_VERSION_MAJOR_MINOR` defined in the environment. 
https://github.com/rapidsai/rmm/blob/90a5631e1093ce44c4feceb88fcf557c3dfc043b/ci/build_docs.sh#L36 https://github.com/rapidsai/rmm/blob/90a5631e1093ce44c4feceb88fcf557c3dfc043b/doxygen/Doxyfile#L41 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Mike Sarahan (https://github.com/msarahan) URL: https://github.com/rapidsai/rmm/pull/1703 --- ci/build_docs.sh | 10 +++++----- ci/test_cpp.sh | 6 +++--- ci/test_python.sh | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ci/build_docs.sh b/ci/build_docs.sh index fadaf0f27..844dae1c6 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -6,8 +6,8 @@ set -euo pipefail rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" +RAPIDS_VERSION="$(rapids-version)" +export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" rapids-dependency-file-generator \ --output conda \ @@ -26,8 +26,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" + "rmm=${RAPIDS_VERSION}" \ + "librmm=${RAPIDS_VERSION}" export RAPIDS_DOCS_DIR="$(mktemp -d)" @@ -45,4 +45,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/rmm/html" mv _build/dirhtml/* "${RAPIDS_DOCS_DIR}/rmm/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 02435f249..975477a6e 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -8,7 +8,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ @@ -31,8 +31,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "librmm-tests=${RAPIDS_VERSION_MAJOR_MINOR}" + "librmm=${RAPIDS_VERSION}" \ + "librmm-tests=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 7a688107e..51d0a48c3 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -9,7 +9,7 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh -RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" +RAPIDS_VERSION="$(rapids-version)" rapids-dependency-file-generator \ --output conda \ @@ -30,8 +30,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - "rmm=${RAPIDS_VERSION_MAJOR_MINOR}" \ - "librmm=${RAPIDS_VERSION_MAJOR_MINOR}" + "rmm=${RAPIDS_VERSION}" \ + "librmm=${RAPIDS_VERSION}" RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} From de42f5711386f6b914cef0fc54d3081a936c5740 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 16 Oct 2024 01:22:40 -0700 Subject: [PATCH 12/29] Deprecate support for directly accessing logger (#1690) Contributes to https://github.com/rapidsai/build-planning/issues/104 This PR removes support for accessing rmm's underlying spdlog logger directly. 
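For downstream code, the effect is a compile-time deprecation warning on
direct logger access, while the `RMM_LOG_*` macros keep working because they
now route through the internal `rmm::detail::logger()`. A minimal,
illustrative sketch of the user-facing difference (not code from this PR):

    #include <rmm/logger.hpp>

    void configure_logging()
    {
      // After this change, direct access to the spdlog logger is deprecated
      // and emits a compiler warning at the call site:
      rmm::logger().set_level(spdlog::level::debug);

      // Logging through the RMM macros remains the supported path:
      RMM_LOG_DEBUG("memory pool initialized with {} MiB", 128);
    }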
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - Lawrence Mitchell (https://github.com/wence-) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1690 --- include/rmm/logger.hpp | 25 ++++++++++++++++--------- python/rmm/rmm/librmm/_logger.pxd | 2 +- tests/mr/device/tracking_mr_tests.cpp | 10 +++++----- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/rmm/logger.hpp b/include/rmm/logger.hpp index 326385f16..eba3f122b 100644 --- a/include/rmm/logger.hpp +++ b/include/rmm/logger.hpp @@ -96,6 +96,11 @@ struct bytes { } }; +inline spdlog::logger& logger() +{ + static detail::logger_wrapper wrapped{}; + return wrapped.logger_; +} } // namespace detail /** @@ -107,10 +112,12 @@ struct bytes { * * @return spdlog::logger& The logger. */ -RMM_EXPORT inline spdlog::logger& logger() +[[deprecated( + "Support for direct access to spdlog loggers in rmm is planned for " + "removal")]] RMM_EXPORT inline spdlog::logger& +logger() { - static detail::logger_wrapper wrapped{}; - return wrapped.logger_; + return detail::logger(); } //! @cond Doxygen_Suppress @@ -118,12 +125,12 @@ RMM_EXPORT inline spdlog::logger& logger() // The default is INFO, but it should be used sparingly, so that by default a log file is only // output if there is important information, warnings, errors, and critical failures // Log messages that require computation should only be used at level TRACE and DEBUG -#define RMM_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_INFO(...) SPDLOG_LOGGER_INFO(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_WARN(...) SPDLOG_LOGGER_WARN(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&rmm::logger(), __VA_ARGS__) -#define RMM_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&rmm::logger(), __VA_ARGS__) +#define RMM_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_INFO(...) SPDLOG_LOGGER_INFO(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_WARN(...) SPDLOG_LOGGER_WARN(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&rmm::detail::logger(), __VA_ARGS__) //! 
@endcond diff --git a/python/rmm/rmm/librmm/_logger.pxd b/python/rmm/rmm/librmm/_logger.pxd index 241a748c3..fb2126b2f 100644 --- a/python/rmm/rmm/librmm/_logger.pxd +++ b/python/rmm/rmm/librmm/_logger.pxd @@ -62,5 +62,5 @@ cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil: bool should_log(logging_level msg_level) -cdef extern from "rmm/logger.hpp" namespace "rmm" nogil: +cdef extern from "rmm/logger.hpp" namespace "rmm::detail" nogil: cdef spdlog_logger& logger() except + diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp index acd540ae6..3fce55fb8 100644 --- a/tests/mr/device/tracking_mr_tests.cpp +++ b/tests/mr/device/tracking_mr_tests.cpp @@ -204,8 +204,8 @@ TEST(TrackingTest, LogOutstandingAllocations) { std::ostringstream oss; auto oss_sink = std::make_shared(oss); - rmm::logger().sinks().push_back(oss_sink); - auto old_level = rmm::logger().level(); + rmm::detail::logger().sinks().push_back(oss_sink); + auto old_level = rmm::detail::logger().level(); tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()}; std::vector allocations; @@ -213,7 +213,7 @@ TEST(TrackingTest, LogOutstandingAllocations) allocations.push_back(mr.allocate(ten_MiB)); } - rmm::logger().set_level(spdlog::level::debug); + rmm::detail::logger().set_level(spdlog::level::debug); EXPECT_NO_THROW(mr.log_outstanding_allocations()); #if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG @@ -224,8 +224,8 @@ TEST(TrackingTest, LogOutstandingAllocations) mr.deallocate(allocation, ten_MiB); } - rmm::logger().set_level(old_level); - rmm::logger().sinks().pop_back(); + rmm::detail::logger().set_level(old_level); + rmm::detail::logger().sinks().pop_back(); } } // namespace From 50e60a868af05cc9f65b9980753d708e7170f3a1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 16 Oct 2024 16:16:32 -0500 Subject: [PATCH 13/29] Fix docs warning (#1706) Closes https://github.com/rapidsai/rmm/issues/1705. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/rmm/pull/1706 --- python/rmm/docs/conf.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/python/rmm/docs/conf.py b/python/rmm/docs/conf.py index d48dc2b42..0b2c21d5a 100644 --- a/python/rmm/docs/conf.py +++ b/python/rmm/docs/conf.py @@ -12,6 +12,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # +import datetime import os import re @@ -22,8 +23,8 @@ # -- Project information ----------------------------------------------------- project = "rmm" -copyright = "2020-2023, NVIDIA" -author = "NVIDIA" +copyright = f"2018-{datetime.datetime.today().year}, NVIDIA Corporation" +author = "NVIDIA Corporation" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -118,19 +119,6 @@ html_theme = "sphinx_rtd_theme" -# on_rtd is whether we are on readthedocs.org -on_rtd = os.environ.get("READTHEDOCS", None) == "True" - -if not on_rtd: - # only import and set the theme if we're building docs locally - # otherwise, readthedocs.org uses their theme by default, - # so no need to specify it - import sphinx_rtd_theme - - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - - # Theme options are theme-specific and customize the look and feel of a theme # further. 
For a list of options available for each theme, see the # documentation. From 1024a1250cfde7e93d26dc6d5e063e84c4a39824 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 22 Oct 2024 08:20:02 -0400 Subject: [PATCH 14/29] Update rmm tests to use rapids_cmake_support_conda_env (#1707) Fixes issue brought up in https://github.com/rapidsai/rapids-cmake/issues/634#issuecomment-2345129521 where rmm wasn't using rapids_cmake_support_conda_env Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/rmm/pull/1707 --- tests/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ea1af58cd..0258c59c5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,13 +19,16 @@ option(CODE_COVERAGE "Enable generating code coverage with gcov." OFF) include(rapids-test) rapids_test_init() +# Ensure tests are using the conda env, so they have the correct release/debug compile flags +rapids_cmake_support_conda_env(conda_env) + # This function takes in a test name and test source and handles setting all of the associated # properties and linking to build the test function(ConfigureTestInternal TEST_NAME) add_executable(${TEST_NAME} ${ARGN}) target_include_directories(${TEST_NAME} PRIVATE "$") target_link_libraries(${TEST_NAME} GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main - pthread rmm) + pthread rmm $) set_target_properties( ${TEST_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON @@ -40,7 +43,6 @@ function(ConfigureTestInternal TEST_NAME) PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror -Wno-error=deprecated-declarations>) - target_compile_options(${TEST_NAME} PUBLIC "$<$:-O0>") if(DISABLE_DEPRECATION_WARNING) target_compile_options( From 1ebfe0a4ee5f83a2ad54afcf99716944d20598dd Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Fri, 25 Oct 2024 19:18:44 +0200 Subject: [PATCH 15/29] devcontainer: replace `VAULT_HOST` with `AWS_ROLE_ARN` (#1708) This PR is replacing the `VAULT_HOST` variable with `AWS_ROLE_ARN`. This is required to use the new token service to get AWS credentials. Authors: - Jordan Jacobelli (https://github.com/jjacobelli) Approvers: - Paul Taylor (https://github.com/trxcllnt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1708 --- .devcontainer/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9d35e3f97..5d1d53670 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -26,5 +26,5 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" From 47dae24b5578894ac0efc3c06930b7a5a069d988 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 29 Oct 2024 13:39:06 -0500 Subject: [PATCH 16/29] print sccache stats in builds (#1712) Contributes to https://github.com/rapidsai/build-planning/issues/111 Proposes some small packaging/CI changes, matching similar changes being made across RAPIDS. 
* printing `sccache` stats to CI logs * updating to the latest `rapids-dependency-file-generator` (v1.16.0) * reducing verbosity of `pip wheel` in wheel builds Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/rmm/pull/1712 --- .pre-commit-config.yaml | 2 +- ci/build_cpp.sh | 4 ++++ ci/build_python.sh | 4 ++++ ci/build_wheel_cpp.sh | 7 ++++++- ci/build_wheel_python.sh | 6 +++++- 5 files changed, 20 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f114abec4..56c972b4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -82,7 +82,7 @@ repos: - id: verify-copyright - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index a601ecaae..9d14cd072 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -15,7 +15,11 @@ rapids-print-env rapids-logger "Begin cpp build" +sccache --zero-stats + # This calls mambabuild when boa is installed (as is the case in the CI images) RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/librmm +sccache --show-adv-stats + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index fcd2c55e7..7a9df5fc7 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -19,7 +19,11 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +sccache --zero-stats + # This calls mambabuild when boa is installed (as is the case in the CI images) RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild -c "${CPP_CHANNEL}" conda/recipes/rmm +sccache --show-adv-stats + rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 2c5cc0560..12e099bdb 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -14,7 +14,12 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" cd "${package_dir}" -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +sccache --zero-stats + +python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats + python -m pip install wheel python -m wheel tags --platform any dist/* --remove RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index 555974b50..b497b76d3 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -22,8 +22,12 @@ CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-down # are used when created the isolated build environment echo "librmm-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/librmm_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" > ./build-constraints.txt +sccache --zero-stats + PIP_CONSTRAINT="${PWD}/build-constraints.txt" \ - python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check + python -m pip wheel . 
-w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats mkdir -p final_dist python -m auditwheel repair -w final_dist dist/* From 8d49fffdb93b55ce70c72981d2e1d5511692eaa2 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 31 Oct 2024 19:44:38 -0400 Subject: [PATCH 17/29] Deprecate `rmm._lib` (#1713) Follows up #1676 to add deprecation warnings to the `rmm._lib` subpackage. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1713 --- python/rmm/rmm/__init__.py | 8 ++++++++ python/rmm/rmm/_lib/__init__.py | 8 ++++++++ python/rmm/rmm/tests/test_rmm.py | 6 ++++++ 3 files changed, 22 insertions(+) diff --git a/python/rmm/rmm/__init__.py b/python/rmm/rmm/__init__.py index b23ad68f9..832fec095 100644 --- a/python/rmm/rmm/__init__.py +++ b/python/rmm/rmm/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + from rmm import mr from rmm._version import __git_commit__, __version__ from rmm.mr import disable_logging, enable_logging, get_log_filenames @@ -58,6 +60,12 @@ def __getattr__(name): if name == "_lib": import importlib + warnings.warn( + "The `rmm._lib` module is deprecated and will be removed in a future release. Use `rmm.pylibrmm` instead.", + FutureWarning, + stacklevel=2, + ) + module = importlib.import_module("rmm.pylibrmm") return module else: diff --git a/python/rmm/rmm/_lib/__init__.py b/python/rmm/rmm/_lib/__init__.py index 7cfddab60..7e01bda77 100644 --- a/python/rmm/rmm/_lib/__init__.py +++ b/python/rmm/rmm/_lib/__init__.py @@ -12,4 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + from rmm.pylibrmm import * + +warnings.warn( + "The `rmm._lib` module is deprecated and will be removed in a future release. Use `rmm.pylibrmm` instead.", + FutureWarning, + stacklevel=2, +) diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index c03b9e501..9872ba89d 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -1076,3 +1076,9 @@ def test_available_device_memory(): assert initial_memory[1] == final_memory[1] assert initial_memory[0] > 0 assert final_memory[0] > 0 + + +# TODO: Remove test when rmm._lib is removed in 25.02 +def test_deprecate_rmm_lib(): + with pytest.warns(FutureWarning): + rmm._lib.device_buffer.DeviceBuffer(size=100) From 9b76d366d2d971839d4997c437e2d20490d9d65e Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 1 Nov 2024 13:47:08 -0400 Subject: [PATCH 18/29] Correct rmm tests for validity of device pointers (#1714) The `is_host_memory` function has been updated to understand that `cudaMemoryTypeUnregistered` is returned for pointers allocated by `malloc` and other host-side allocation functions. The `is_device_accessible_memory` function has been restricted to report true only for device pointers that are usable by the calling CUDA context. For that reason, the tests now also set the active CUDA device for all calling threads.
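A minimal standalone sketch (not part of this diff) of the CUDA runtime behavior these corrections account for — on CUDA 11 and newer, `cudaPointerGetAttributes` succeeds for unregistered host pointers instead of returning an error:

```cpp
#include <cuda_runtime_api.h>

#include <cstdlib>
#include <iostream>

int main()
{
  void* host_ptr = std::malloc(64);  // plain host allocation, never registered with CUDA

  cudaPointerAttributes attributes{};
  // On CUDA 11+, this succeeds for unregistered pointers and reports
  // cudaMemoryTypeUnregistered rather than failing outright.
  if (cudaPointerGetAttributes(&attributes, host_ptr) == cudaSuccess) {
    std::cout << "unregistered: " << (attributes.type == cudaMemoryTypeUnregistered) << '\n';
    // The devicePointer field indicates usability from the calling CUDA
    // context, which is what the updated checks rely on.
    std::cout << "device-accessible: " << (attributes.devicePointer != nullptr) << '\n';
  }

  std::free(host_ptr);
  return 0;
}
```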
Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Rong Ou (https://github.com/rongou) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1714 --- .../mr/device/mr_ref_multithreaded_tests.cpp | 19 +++++++++++++++++-- tests/mr/device/test_utils.hpp | 7 ++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp index 7d749efd1..944ba1807 100644 --- a/tests/mr/device/mr_ref_multithreaded_tests.cpp +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -109,8 +109,13 @@ TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRef_mt) { // single thread changes default resource, then multiple threads use it auto old = rmm::mr::set_current_device_resource_ref(this->ref); + test_get_current_device_resource_ref(); - spawn([mr = this->ref]() { + int device; + RMM_CUDA_TRY(cudaGetDevice(&device)); + + spawn([device, mr = this->ref]() { + RMM_CUDA_TRY(cudaSetDevice(device)); EXPECT_EQ(mr, rmm::mr::get_current_device_resource_ref()); test_get_current_device_resource_ref(); // test allocating with the new default resource }); @@ -156,7 +161,17 @@ TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRefPerThread_mt) } } -TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); } +TEST_P(mr_ref_test_mt, Allocate) +{ + int device; + RMM_CUDA_TRY(cudaGetDevice(&device)); + + auto mr = this->ref; + spawn([device, mr]() { + RMM_CUDA_TRY(cudaSetDevice(device)); + test_various_allocations(mr); + }); +} TEST_P(mr_ref_test_mt, AllocateDefaultStream) { diff --git a/tests/mr/device/test_utils.hpp b/tests/mr/device/test_utils.hpp index 2b9513793..5b7ef197b 100644 --- a/tests/mr/device/test_utils.hpp +++ b/tests/mr/device/test_utils.hpp @@ -31,17 +31,14 @@ inline bool is_device_accessible_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } - return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged) or - ((attributes.type == cudaMemoryTypeHost) and (attributes.devicePointer != nullptr)) or - ((attributes.type == cudaMemoryTypeUnregistered) and - (rmm::mr::detail::is_system_memory_supported(rmm::get_current_cuda_device()))); + return attributes.devicePointer != nullptr; } inline bool is_host_memory(void* ptr) { cudaPointerAttributes attributes{}; if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; } - return attributes.type == cudaMemoryTypeHost; + return attributes.hostPointer != nullptr || attributes.type == cudaMemoryTypeUnregistered; } inline bool is_properly_aligned(void* ptr) From dbae8c08b0bed1d14ff1b5fe1bc5332b0c175cf8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 2 Nov 2024 01:58:15 +0800 Subject: [PATCH 19/29] [fea] Expose the arena mr to the Python interface. (#1711) Close https://github.com/rapidsai/rmm/issues/830 . - Add the arena allocator to the public Python interface. - Small changes to the logger initialization to avoid exposing spdlog in the shared objects. 
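On the logger-initialization bullet, a minimal sketch (assumes standalone spdlog headers; illustrative, not the diff itself) of the construction pattern adopted here: `spdlog::basic_logger_mt()` is a factory that also registers the logger in spdlog's global registry, while constructing `spdlog::logger` directly sidesteps the registry machinery and, with `truncate=true`, starts the dump file fresh:

```cpp
#include <spdlog/logger.h>
#include <spdlog/sinks/basic_file_sink.h>

#include <memory>

// Illustrative only: build a file logger without going through
// spdlog::basic_logger_mt(), which would also register it globally.
std::shared_ptr<spdlog::logger> make_dump_logger()
{
  auto sink = std::make_shared<spdlog::sinks::basic_file_sink_mt>(
    "rmm_arena_memory_dump.log", /*truncate=*/true);
  return std::make_shared<spdlog::logger>("arena_memory_dump", std::move(sink));
}
```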
Authors: - Jiaming Yuan (https://github.com/trivialfis) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1711 --- .../rmm/mr/device/arena_memory_resource.hpp | 10 ++++- include/rmm/mr/device/detail/arena.hpp | 2 +- python/rmm/rmm/_lib/memory_resource.pxd | 1 + python/rmm/rmm/_lib/memory_resource.py | 1 + python/rmm/rmm/librmm/memory_resource.pxd | 9 ++++ python/rmm/rmm/mr.py | 2 + python/rmm/rmm/pylibrmm/memory_resource.pxd | 3 ++ python/rmm/rmm/pylibrmm/memory_resource.pyx | 43 +++++++++++++++++++ python/rmm/rmm/tests/test_rmm.py | 22 ++++++++++ 9 files changed, 90 insertions(+), 3 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 417b7d2b4..9b380ffb9 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -97,7 +97,10 @@ class arena_memory_resource final : public device_memory_resource { : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { - logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + logger_ = + std::make_shared("arena_memory_dump", + std::make_shared( + "rmm_arena_memory_dump.log", true /*truncate file*/)); // Set the level to `debug` for more detailed output. logger_->set_level(spdlog::level::info); } @@ -120,7 +123,10 @@ class arena_memory_resource final : public device_memory_resource { dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { - logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + logger_ = + std::make_shared("arena_memory_dump", + std::make_shared( + "rmm_arena_memory_dump.log", true /*truncate file*/)); // Set the level to `debug` for more detailed output. logger_->set_level(spdlog::level::info); } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 6f8303c83..da64ca85b 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -647,7 +647,7 @@ class global_arena final { * * @param logger the spdlog logger to use */ - void dump_memory_log(std::shared_ptr const& logger) const + RMM_HIDDEN void dump_memory_log(std::shared_ptr const& logger) const { std::lock_guard lock(mtx_); diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 983063914..0d11001a4 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -40,6 +40,7 @@ from rmm.librmm.memory_resource cimport ( translate_python_except_to_cpp, ) from rmm.pylibrmm.memory_resource cimport ( + ArenaMemoryResource, BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, diff --git a/python/rmm/rmm/_lib/memory_resource.py b/python/rmm/rmm/_lib/memory_resource.py index 0d47e8c9b..f3a24f635 100644 --- a/python/rmm/rmm/_lib/memory_resource.py +++ b/python/rmm/rmm/_lib/memory_resource.py @@ -13,6 +13,7 @@ # limitations under the License. 
from rmm.pylibrmm.memory_resource import ( # noqa: F401 + ArenaMemoryResource, BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd index 9ddaf04b9..9e7b70c4f 100644 --- a/python/rmm/rmm/librmm/memory_resource.pxd +++ b/python/rmm/rmm/librmm/memory_resource.pxd @@ -130,6 +130,15 @@ cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \ optional[size_t] maximum_pool_size) except + size_t pool_size() +cdef extern from "rmm/mr/device/arena_memory_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass arena_memory_resource[Upstream](device_memory_resource): + arena_memory_resource( + Upstream* upstream_mr, + optional[size_t] arena_size, + bool dump_log_on_failure + ) except + + cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \ namespace "rmm::mr" nogil: cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource): diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 3f0c3fce3..82729271f 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from rmm.pylibrmm.memory_resource import ( + ArenaMemoryResource, BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, @@ -45,6 +46,7 @@ ) __all__ = [ + "ArenaMemoryResource", "BinningMemoryResource", "CallbackMemoryResource", "CudaAsyncMemoryResource", diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pxd b/python/rmm/rmm/pylibrmm/memory_resource.pxd index 985d5d31b..d1e5610db 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pxd +++ b/python/rmm/rmm/pylibrmm/memory_resource.pxd @@ -26,6 +26,9 @@ cdef class UpstreamResourceAdaptor(DeviceMemoryResource): cpdef DeviceMemoryResource get_upstream(self) +cdef class ArenaMemoryResource(UpstreamResourceAdaptor): + pass + cdef class CudaMemoryResource(DeviceMemoryResource): pass diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx index 021125567..b41890fca 100644 --- a/python/rmm/rmm/pylibrmm/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -49,6 +49,7 @@ from rmm.librmm.memory_resource cimport ( CppExcept, allocate_callback_t, allocation_handle_type, + arena_memory_resource, available_device_memory as c_available_device_memory, binning_memory_resource, callback_memory_resource, @@ -310,6 +311,48 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor): ) return c_mr.pool_size() +cdef class ArenaMemoryResource(UpstreamResourceAdaptor): + def __cinit__( + self, DeviceMemoryResource upstream_mr, + arena_size=None, + dump_log_on_failure=False + ): + cdef optional[size_t] c_arena_size = ( + optional[size_t]() if + arena_size is None + else optional[size_t]( parse_bytes(arena_size)) + ) + self.c_obj.reset( + new arena_memory_resource[device_memory_resource]( + upstream_mr.get_mr(), + c_arena_size, + dump_log_on_failure, + ) + ) + + def __init__( + self, + DeviceMemoryResource upstream_mr, + object arena_size=None, + bool dump_log_on_failure=False + ): + """ + A suballocator that emphasizes fragmentation avoidance and scalable concurrency + support. + + Parameters + ---------- + upstream_mr : DeviceMemoryResource + The DeviceMemoryResource from which to allocate memory for arenas. + arena_size : int, optional + Size in bytes of the global arena. Defaults to half of the available memory + on the current device. 
+ dump_log_on_failure : bool, optional + Whether to dump the arena on allocation failure. + """ + pass + + cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): def __cinit__( self, diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index 9872ba89d..b52ea0179 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -505,6 +505,28 @@ def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr): array_tester(dtype, nelem, alloc) +@pytest.mark.parametrize("dtype", _dtypes) +@pytest.mark.parametrize("nelem", _nelems) +@pytest.mark.parametrize("alloc", _allocs) +@pytest.mark.parametrize( + "upstream_mr", + [ + lambda: rmm.mr.CudaMemoryResource(), + lambda: rmm.mr.ManagedMemoryResource(), + lambda: rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), 1 << 20 + ), + ], +) +def test_arena_memory_resource(dtype, nelem, alloc, upstream_mr): + upstream = upstream_mr() + mr = rmm.mr.ArenaMemoryResource(upstream) + + rmm.mr.set_current_device_resource(mr) + assert rmm.mr.get_current_device_resource_type() is type(mr) + array_tester(dtype, nelem, alloc) + + def test_reinitialize_max_pool_size(): rmm.reinitialize( pool_allocator=True, initial_pool_size=0, maximum_pool_size="8MiB" From d4c0635677508900668e50cd6f1afd8fae4e5c98 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Nov 2024 15:28:02 -0500 Subject: [PATCH 20/29] Disallow cuda-python 12.6.1 and 11.8.4 (#1720) Due to a bug in cuda-python we must disallow cuda-python 12.6.1 and 11.8.4. See https://github.com/rapidsai/build-planning/issues/116 for more information. This PR disallows those versions, and other changes following from that: * specifying `python` in both `host:` and `run:` dependencies for the `rmm` conda package * ignoring deprecation warnings raised by newer versions of `cuda-python` Authors: - Bradley Dice (https://github.com/bdice) - James Lamb (https://github.com/jameslamb) - https://github.com/jakirkham Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/rmm/pull/1720 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/rmm/meta.yaml | 8 ++++++-- dependencies.yaml | 4 ++-- python/rmm/pyproject.toml | 5 +++-- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bf64d4d55..2501effa5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,7 +10,7 @@ dependencies: - clang-tools==16.0.6 - clang==16.0.6 - cmake>=3.26.4,!=3.30.0 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,!=11.8.4 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 112c635a8..357ae8a24 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - clang==16.0.6 - cmake>=3.26.4,!=3.30.0 - cuda-nvcc -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,!=12.6.1 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/recipes/rmm/meta.yaml b/conda/recipes/rmm/meta.yaml index fcc7424fa..c9b0f8189 100644 --- a/conda/recipes/rmm/meta.yaml +++ b/conda/recipes/rmm/meta.yaml @@ -38,6 +38,7 @@ build: - {{ compiler('cuda') }} - cuda-cudart-dev {% endif %} + - cuda-python 
requirements: build: @@ -56,10 +57,10 @@ requirements: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} - cudatoolkit - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,!=11.8.4 {% else %} - cuda-cudart-dev - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,!=12.6.1 {% endif %} - cython >=3.0.0 - rapids-build-backend >=0.3.0,<0.4.0.dev0 @@ -69,12 +70,15 @@ requirements: run: {% if cuda_major == "11" %} - cudatoolkit + - cuda-python >=11.7.1,<12.0a0,!=11.8.4 {% else %} - cuda-cudart + - cuda-python >=12.0,<13.0a0,!=12.6.1 {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - numba >=0.57 - numpy >=1.23,<3.0a0 + - python test: imports: diff --git a/dependencies.yaml b/dependencies.yaml index 9f1ed9c40..88459c8f5 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -153,10 +153,10 @@ dependencies: - matrix: cuda: "12.*" packages: - - &cuda_python12 cuda-python>=12.0,<13.0a0 + - &cuda_python12 cuda-python>=12.0,<13.0a0,!=12.6.1 - matrix: # All CUDA 11 versions packages: - - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - &cuda_python11 cuda-python>=11.7.1,<12.0a0,!=11.8.4 - output_types: [requirements, pyproject] matrices: - matrix: diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index b148cdba7..095af6339 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -30,7 +30,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.7.1,<12.0a0,!=11.8.4", "numba>=0.57", "numpy>=1.23,<3.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -128,7 +128,7 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.7.1,<12.0a0,!=11.8.4", "cython>=3.0.0", "librmm==24.12.*,>=0.0.0a0", "ninja", @@ -138,4 +138,5 @@ requires = [ # treat warnings as errors filterwarnings = [ "error", + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning" ] From a98c22a76bc0dfbd8fd39d4aa2acef1598bd68ab Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 6 Nov 2024 21:19:25 -0600 Subject: [PATCH 21/29] WIP: put a ceiling on cuda-python (#1723) Follow-up to #1720 Contributes to https://github.com/rapidsai/build-planning/issues/116 That PR used `!=` requirements to skip a particular version of `cuda-python` that `rmm` was incompatible with. A newer version of `cuda-python` (12.6.2 for CUDA 12, 11.8.5 for CUDA 11) was just released, and it also causes some build issues for RAPIDS libraries: https://github.com/rapidsai/cuvs/pull/445#issuecomment-2461146449 To unblock CI across RAPIDS, this proposes **temporarily** switching to ceilings on `rmm`'s `cuda-python` dependency. 
Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1723 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/rmm/meta.yaml | 8 ++++---- dependencies.yaml | 4 ++-- python/rmm/pyproject.toml | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 2501effa5..519c056b5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,7 +10,7 @@ dependencies: - clang-tools==16.0.6 - clang==16.0.6 - cmake>=3.26.4,!=3.30.0 -- cuda-python>=11.7.1,<12.0a0,!=11.8.4 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 357ae8a24..86e887c21 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - clang==16.0.6 - cmake>=3.26.4,!=3.30.0 - cuda-nvcc -- cuda-python>=12.0,<13.0a0,!=12.6.1 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/recipes/rmm/meta.yaml b/conda/recipes/rmm/meta.yaml index c9b0f8189..8f6e13fd7 100644 --- a/conda/recipes/rmm/meta.yaml +++ b/conda/recipes/rmm/meta.yaml @@ -57,10 +57,10 @@ requirements: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} - cudatoolkit - - cuda-python >=11.7.1,<12.0a0,!=11.8.4 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart-dev - - cuda-python >=12.0,<13.0a0,!=12.6.1 + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - cython >=3.0.0 - rapids-build-backend >=0.3.0,<0.4.0.dev0 @@ -70,10 +70,10 @@ requirements: run: {% if cuda_major == "11" %} - cudatoolkit - - cuda-python >=11.7.1,<12.0a0,!=11.8.4 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart - - cuda-python >=12.0,<13.0a0,!=12.6.1 + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - numba >=0.57 diff --git a/dependencies.yaml b/dependencies.yaml index 88459c8f5..3e2c2eb29 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -153,10 +153,10 @@ dependencies: - matrix: cuda: "12.*" packages: - - &cuda_python12 cuda-python>=12.0,<13.0a0,!=12.6.1 + - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: # All CUDA 11 versions packages: - - &cuda_python11 cuda-python>=11.7.1,<12.0a0,!=11.8.4 + - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3 - output_types: [requirements, pyproject] matrices: - matrix: diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index 095af6339..22ed94660 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -30,7 +30,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0,!=11.8.4", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", "numba>=0.57", "numpy>=1.23,<3.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
@@ -128,7 +128,7 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", - "cuda-python>=11.7.1,<12.0a0,!=11.8.4", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", "cython>=3.0.0", "librmm==24.12.*,>=0.0.0a0", "ninja", From 771ccdc3a1aa142f4be03518d868595b0f81a8c3 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 7 Nov 2024 10:22:42 -0600 Subject: [PATCH 22/29] use rapids-generate-pip-constraints to pin to oldest dependencies in CI (#1716) Follow-up to #1613 Similar to https://github.com/rapidsai/cudf/pull/17131 Proposes using the new `rapids-generate-pip-constraints` tool from `gha-tools` to generate a list of pip constraints pinning to the oldest supported versions of dependencies here. ## Notes for Reviewers ### How I tested this `wheel-tests`: * oldest-deps: numpy 1.x ([build link](https://github.com/rapidsai/rmm/actions/runs/11620907528/job/32364032641?pr=1716#step:8:106)) * latest-deps: numpy 2.x ([build link](https://github.com/rapidsai/rmm/actions/runs/11620907528/job/32364032835?pr=1716#step:8:104)) And the testing of the general approach in https://github.com/rapidsai/gha-tools/pull/114#issuecomment-2445377824 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1716 --- ci/test_wheel.sh | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/ci/test_wheel.sh index d06c4eed0..2f39b197b 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -7,15 +7,8 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" WHEELHOUSE="${PWD}/dist/" RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}" -# Constraint to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key test_python \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi +# generate constraints (possibly pinning to oldest supported versions of dependencies) +rapids-generate-pip-constraints test_python ./constraints.txt # echo to expand wildcard before adding '[extra]' requires for pip python -m pip install \ From 56f3a464053a66f801a39c65cde47dbab46ce513 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 13 Nov 2024 12:40:18 -0600 Subject: [PATCH 23/29] enforce wheel size limits, README formatting in CI (#1726) Contributes to https://github.com/rapidsai/build-planning/issues/110 Proposes adding 2 types of validation on wheels in CI, to ensure we continue to produce wheels that are suitable for PyPI. * checks on wheel size (compressed), - *to be sure they're under PyPI limits* - *and to prompt discussion on PRs that significantly increase wheel sizes* * checks on README formatting - *to ensure they'll render properly as the PyPI project homepages* - *e.g. 
like how https://github.com/scikit-learn/scikit-learn/blob/main/README.rst becomes https://pypi.org/project/scikit-learn/* Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1726 --- ci/build_wheel_cpp.sh | 3 +++ ci/build_wheel_python.sh | 2 ++ ci/validate_wheel.sh | 18 ++++++++++++++++++ python/librmm/pyproject.toml | 8 ++++++++ python/rmm/docs/conf.py | 5 ++++- python/rmm/pyproject.toml | 8 ++++++++ 6 files changed, 43 insertions(+), 1 deletion(-) create mode 100755 ci/validate_wheel.sh diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 12e099bdb..1ec979372 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -22,4 +22,7 @@ sccache --show-adv-stats python -m pip install wheel python -m wheel tags --platform any dist/* --remove + +../../ci/validate_wheel.sh dist + RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index b497b76d3..4e4d3bf61 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -32,6 +32,8 @@ sccache --show-adv-stats mkdir -p final_dist python -m auditwheel repair -w final_dist dist/* +../../ci/validate_wheel.sh final_dist + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist # switch back to the root of the repo and check symbol visibility diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 000000000..60a80fce6 --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +wheel_dir_relative_path=$1 + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/python/librmm/pyproject.toml b/python/librmm/pyproject.toml index 0f0b4e397..bae2ef36b 100644 --- a/python/librmm/pyproject.toml +++ b/python/librmm/pyproject.toml @@ -67,3 +67,11 @@ wheel.py-api = "py3" provider = "scikit_build_core.metadata.regex" input = "librmm/VERSION" regex = "(?P.*)" + +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' diff --git a/python/rmm/docs/conf.py b/python/rmm/docs/conf.py index 0b2c21d5a..2aad3a82c 100644 --- a/python/rmm/docs/conf.py +++ b/python/rmm/docs/conf.py @@ -197,7 +197,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "numba": ("https://numba.readthedocs.io/en/stable", None), - "cuda-python": ("https://nvidia.github.io/cuda-python/", None), + "cuda-python": ( + "https://nvidia.github.io/cuda-python/cuda-bindings/", + None, + ), } # Config numpydoc diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index 22ed94660..aaaa15482 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -134,6 +134,14 @@ requires = [ "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
+[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] # treat warnings as errors filterwarnings = [ From 220003e57532d1276fbecdba9dd82ed04efa1db4 Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:37:07 +1100 Subject: [PATCH 24/29] Treat deprecation warnings as errors and fix deprecation warnings in replay benchmark (#1728) Fixes #1727 Contributes to https://github.com/rapidsai/build-planning/issues/26 - Removes `-Wno-error=deprecated-declarations` - Replaces deprecated usage of `rmm::logger()` in the replay benchmark with supported `RMM_LOG_INFO` macros. Note the latter duplicates a change in #1724 which allows the two PRs to be merged independently. Authors: - Mark Harris (https://github.com/harrism) Approvers: - Rong Ou (https://github.com/rongou) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1728 --- benchmarks/CMakeLists.txt | 7 +++---- benchmarks/replay/replay.cpp | 6 +++--- tests/CMakeLists.txt | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 9dfb2c538..0487a2dfa 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -42,9 +42,8 @@ function(ConfigureBench BENCH_NAME) target_compile_definitions(${BENCH_NAME} PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) endif() - target_compile_options( - ${BENCH_NAME} PUBLIC $<$:-Wall -Werror - -Wno-error=deprecated-declarations -Wno-unknown-pragmas>) + target_compile_options(${BENCH_NAME} PUBLIC $<$:-Wall -Werror + -Wno-unknown-pragmas>) if(DISABLE_DEPRECATION_WARNING) target_compile_options( ${BENCH_NAME} PUBLIC $<$:-Xcompiler=-Wno-deprecated-declarations>) diff --git a/benchmarks/replay/replay.cpp b/benchmarks/replay/replay.cpp index 5afed036a..d80841321 100644 --- a/benchmarks/replay/replay.cpp +++ b/benchmarks/replay/replay.cpp @@ -172,7 +172,7 @@ struct replay_benchmark { void SetUp(const ::benchmark::State& state) { if (state.thread_index() == 0) { - rmm::logger().log(spdlog::level::info, "------ Start of Benchmark -----"); + RMM_LOG_INFO("------ Start of Benchmark -----"); mr_ = factory_(simulated_size_); } } @@ -181,7 +181,7 @@ struct replay_benchmark { void TearDown(const ::benchmark::State& state) { if (state.thread_index() == 0) { - rmm::logger().log(spdlog::level::info, "------ End of Benchmark -----"); + RMM_LOG_INFO("------ End of Benchmark -----"); // clean up any leaked allocations std::size_t total_leaked{0}; std::size_t num_leaked{0}; @@ -402,7 +402,7 @@ int main(int argc, char** argv) auto const num_threads = per_thread_events.size(); // Uncomment to enable / change default log level - // rmm::logger().set_level(spdlog::level::trace); + // rmm::detail::logger().set_level(spdlog::level::trace); if (args.count("resource") > 0) { std::string mr_name = args["resource"].as(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0258c59c5..a482c8cc1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -41,8 +41,7 @@ 
function(ConfigureTestInternal TEST_NAME) CUDA_STANDARD_REQUIRED ON) target_compile_definitions(${TEST_NAME} PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") - target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror - -Wno-error=deprecated-declarations>) + target_compile_options(${TEST_NAME} PUBLIC $<$:-Wall -Werror>) if(DISABLE_DEPRECATION_WARNING) target_compile_options( From 52d61c528fa46a1f579ce294757ed7fe6d0b2970 Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Thu, 14 Nov 2024 13:41:18 +1100 Subject: [PATCH 25/29] Remove all explicit usage of fmtlib (#1724) Fixes #1717 Also fixes #1710 in 5330063 I have replaced fmt-style format string placeholders (`"... {} ..."`) with printf-style placeholders by adding a function `rmm::detail::formatted_log()`, which I modified from @vyasr 's #1722. ~The only remaining mention of fmt is in CMakeLists.txt. Do we still need to explicitly fetch fmt?~ Removed. Authors: - Mark Harris (https://github.com/harrism) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1724 --- CMakeLists.txt | 6 +- benchmarks/replay/replay.cpp | 1 + benchmarks/utilities/log_parser.hpp | 4 +- cmake/thirdparty/get_fmt.cmake | 22 ---- include/rmm/detail/format.hpp | 101 ++++++++++++++++++ include/rmm/detail/logging_assert.hpp | 4 +- include/rmm/logger.hpp | 55 +++------- .../rmm/mr/device/arena_memory_resource.hpp | 4 +- include/rmm/mr/device/detail/arena.hpp | 41 +++---- .../mr/device/detail/coalescing_free_list.hpp | 9 +- .../detail/stream_ordered_memory_resource.hpp | 29 +++-- .../mr/device/logging_resource_adaptor.hpp | 12 ++- .../rmm/mr/device/pool_memory_resource.hpp | 9 +- .../mr/device/tracking_resource_adaptor.hpp | 16 ++- tests/logger_tests.cpp | 1 - tests/mr/device/arena_mr_tests.cpp | 4 +- tests/mr/device/callback_mr_tests.cpp | 7 +- 17 files changed, 189 insertions(+), 136 deletions(-) delete mode 100644 cmake/thirdparty/get_fmt.cmake create mode 100644 include/rmm/detail/format.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 26fcf1fd0..44d7fbb79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,8 +41,8 @@ rapids_cmake_build_type(Release) option(RMM_NVTX "Build RMM with NVTX support" OFF) option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) -# This is mostly so that dependent libraries, such as fmt, are configured in shared mode for -# downstream dependents of RMM that get their common dependencies transitively. +# This is mostly so that dependent libraries are configured in shared mode for downstream dependents +# of RMM that get their common dependencies transitively. 
option(BUILD_SHARED_LIBS "Build RMM shared libraries" ON) set(RMM_LOGGING_LEVEL "INFO" @@ -73,7 +73,6 @@ rapids_find_package( # add third party dependencies using CPM rapids_cpm_init() -include(cmake/thirdparty/get_fmt.cmake) include(cmake/thirdparty/get_spdlog.cmake) include(cmake/thirdparty/get_cccl.cmake) include(cmake/thirdparty/get_nvtx.cmake) @@ -96,7 +95,6 @@ else() endif() target_link_libraries(rmm INTERFACE CCCL::CCCL) -target_link_libraries(rmm INTERFACE fmt::fmt-header-only) target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only) target_link_libraries(rmm INTERFACE dl) target_link_libraries(rmm INTERFACE nvtx3::nvtx3-cpp) diff --git a/benchmarks/replay/replay.cpp b/benchmarks/replay/replay.cpp index d80841321..7f45b7691 100644 --- a/benchmarks/replay/replay.cpp +++ b/benchmarks/replay/replay.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include diff --git a/benchmarks/utilities/log_parser.hpp b/benchmarks/utilities/log_parser.hpp index 2283ace93..4dfa5bae4 100644 --- a/benchmarks/utilities/log_parser.hpp +++ b/benchmarks/utilities/log_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -151,7 +151,7 @@ inline std::vector parse_csv(std::string const& filename) auto parse_pointer = [](std::string const& str, uintptr_t& ptr) { auto const base{16}; - ptr = std::stoll(str, nullptr, base); + ptr = (str == "(nil)") ? 0 : std::stoll(str, nullptr, base); }; std::vector pointers = csv.GetColumn("Pointer", parse_pointer); diff --git a/cmake/thirdparty/get_fmt.cmake b/cmake/thirdparty/get_fmt.cmake deleted file mode 100644 index 5787fb73f..000000000 --- a/cmake/thirdparty/get_fmt.cmake +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -# Use CPM to find or clone fmt -function(find_and_configure_fmt) - - include(${rapids-cmake-dir}/cpm/fmt.cmake) - rapids_cpm_fmt(INSTALL_EXPORT_SET rmm-exports BUILD_EXPORT_SET rmm-exports) -endfunction() - -find_and_configure_fmt() diff --git a/include/rmm/detail/format.hpp b/include/rmm/detail/format.hpp new file mode 100644 index 000000000..21acac032 --- /dev/null +++ b/include/rmm/detail/format.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace RMM_NAMESPACE { +namespace detail { + +/** + * @brief Format a message string with printf-style formatting + * + * This function performs printf-style formatting to avoid the need for fmt + * or spdlog's own templated APIs (which would require exposing spdlog + * symbols publicly) and returns the formatted message as a `std::string`. + * + * @param format The format string + * @param args The format arguments + * @return The formatted message + * @throw rmm::logic_error if an error occurs during formatting + */ +template +std::string formatted_log(std::string const& format, Args&&... args) +{ + auto convert_to_c_string = [](auto&& arg) -> decltype(auto) { + using ArgType = std::decay_t; + if constexpr (std::is_same_v) { + return arg.c_str(); + } else { + return std::forward(arg); + } + }; + + // NOLINTBEGIN(cppcoreguidelines-pro-type-vararg) + auto retsize = + std::snprintf(nullptr, 0, format.c_str(), convert_to_c_string(std::forward(args))...); + RMM_EXPECTS(retsize >= 0, "Error during formatting."); + if (retsize == 0) { return {}; } + auto size = static_cast(retsize) + 1; // for null terminator + // NOLINTNEXTLINE(modernize-avoid-c-arrays, cppcoreguidelines-avoid-c-arrays) + std::unique_ptr buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), convert_to_c_string(std::forward(args))...); + // NOLINTEND(cppcoreguidelines-pro-type-vararg) + return {buf.get(), buf.get() + size - 1}; // drop '\0' +} + +// specialization for no arguments +template <> +inline std::string formatted_log(std::string const& format) +{ + return format; +} + +// Stringify a size in bytes to a human-readable value +inline std::string format_bytes(std::size_t value) +{ + static std::array units{"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"}; + + int index = 0; + auto size = static_cast(value); + while (size > 1024) { + size /= 1024; + index++; + } + + return std::to_string(size) + ' ' + units.at(index); +} + +// Stringify a stream ID +inline std::string format_stream(rmm::cuda_stream_view stream) +{ + std::stringstream sstr{}; + sstr << std::hex << stream.value(); + return sstr.str(); +} + +} // namespace detail +} // namespace RMM_NAMESPACE diff --git a/include/rmm/detail/logging_assert.hpp b/include/rmm/detail/logging_assert.hpp index 7eb667211..4d702ee2b 100644 --- a/include/rmm/detail/logging_assert.hpp +++ b/include/rmm/detail/logging_assert.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ if (!success) { \ RMM_LOG_CRITICAL( \ "[" __FILE__ ":" RMM_STRINGIFY(__LINE__) "] Assertion " RMM_STRINGIFY(_expr) " failed."); \ - rmm::logger().flush(); \ + rmm::detail::logger().flush(); \ /* NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) */ \ assert(success); \ } \ diff --git a/include/rmm/logger.hpp b/include/rmm/logger.hpp index eba3f122b..2cfd921b1 100644 --- a/include/rmm/logger.hpp +++ b/include/rmm/logger.hpp @@ -16,15 +16,13 @@ #pragma once +#include #include +#include -#include -#include #include #include -#include -#include #include namespace RMM_NAMESPACE { @@ -70,32 +68,6 @@ struct logger_wrapper { } }; -/** - * @brief Represent a size in number of bytes. - */ -struct bytes { - std::size_t value; ///< The size in bytes - - /** - * @brief Construct a new bytes object - * - * @param os The output stream - * @param value The size in bytes - */ - friend std::ostream& operator<<(std::ostream& os, bytes const& value) - { - static std::array units{"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"}; - - int index = 0; - auto size = static_cast(value.value); - while (size > 1024) { - size /= 1024; - index++; - } - return os << size << ' ' << units.at(index); - } -}; - inline spdlog::logger& logger() { static detail::logger_wrapper wrapped{}; @@ -125,20 +97,21 @@ logger() // The default is INFO, but it should be used sparingly, so that by default a log file is only // output if there is important information, warnings, errors, and critical failures // Log messages that require computation should only be used at level TRACE and DEBUG -#define RMM_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&rmm::detail::logger(), __VA_ARGS__) -#define RMM_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&rmm::detail::logger(), __VA_ARGS__) -#define RMM_LOG_INFO(...) SPDLOG_LOGGER_INFO(&rmm::detail::logger(), __VA_ARGS__) -#define RMM_LOG_WARN(...) SPDLOG_LOGGER_WARN(&rmm::detail::logger(), __VA_ARGS__) -#define RMM_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&rmm::detail::logger(), __VA_ARGS__) -#define RMM_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&rmm::detail::logger(), __VA_ARGS__) +#define RMM_LOG_TRACE(...) \ + SPDLOG_LOGGER_TRACE(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__)) +#define RMM_LOG_DEBUG(...) \ + SPDLOG_LOGGER_DEBUG(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__)) +#define RMM_LOG_INFO(...) \ + SPDLOG_LOGGER_INFO(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__)) +#define RMM_LOG_WARN(...) \ + SPDLOG_LOGGER_WARN(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__)) +#define RMM_LOG_ERROR(...) \ + SPDLOG_LOGGER_ERROR(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__)) +#define RMM_LOG_CRITICAL(...) \ + SPDLOG_LOGGER_CRITICAL(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__)) //! @endcond } // namespace RMM_NAMESPACE -// Doxygen doesn't like this because we're overloading something from fmt -//! @cond Doxygen_Suppress -template <> -struct fmt::formatter : fmt::ostream_formatter {}; - //! 
@endcond diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 9b380ffb9..d3a4bb09d 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -335,7 +336,8 @@ class arena_memory_resource final : public device_memory_resource { void dump_memory_log(size_t bytes) { logger_->info("**************************************************"); - logger_->info("Ran out of memory trying to allocate {}.", rmm::detail::bytes{bytes}); + logger_->info(rmm::detail::formatted_log("Ran out of memory trying to allocate %s.", + rmm::detail::format_bytes(bytes))); logger_->info("**************************************************"); logger_->info("Global arena:"); global_arena_.dump_memory_log(logger_); diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index da64ca85b..419c4fcf4 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -21,13 +21,13 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -651,33 +651,38 @@ class global_arena final { { std::lock_guard lock(mtx_); - logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); - logger->info(" # superblocks: {}", superblocks_.size()); + logger->info(rmm::detail::formatted_log(" Arena size: %s", + rmm::detail::format_bytes(upstream_block_.size()))); + logger->info(rmm::detail::formatted_log(" # superblocks: %zu", superblocks_.size())); if (!superblocks_.empty()) { - logger->debug(" Total size of superblocks: {}", - rmm::detail::bytes{total_memory_size(superblocks_)}); + logger->debug( + rmm::detail::formatted_log(" Total size of superblocks: %s", + rmm::detail::format_bytes(total_memory_size(superblocks_)))); auto const total_free = total_free_size(superblocks_); auto const max_free = max_free_size(superblocks_); auto const fragmentation = (1 - max_free / static_cast(total_free)) * 100; - logger->info(" Total free memory: {}", rmm::detail::bytes{total_free}); - logger->info(" Largest block of free memory: {}", rmm::detail::bytes{max_free}); - logger->info(" Fragmentation: {:.2f}%", fragmentation); + logger->info(rmm::detail::formatted_log(" Total free memory: %s", + rmm::detail::format_bytes(total_free))); + logger->info(rmm::detail::formatted_log(" Largest block of free memory: %s", + rmm::detail::format_bytes(max_free))); + logger->info(rmm::detail::formatted_log(" Fragmentation: %0.2f", fragmentation)); - auto index = 0; + auto index = decltype(superblocks_.size()){0}; char* prev_end{}; for (auto const& sblk : superblocks_) { if (prev_end == nullptr) { prev_end = sblk.pointer(); } - logger->debug( - " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}, " - "gap={}", + logger->debug(rmm::detail::formatted_log( + " Superblock %zu: start=%p, end=%p, size=%s, empty=%s, # free blocks=%zu, max " + "free=%s, " + "gap=%s", index, - fmt::ptr(sblk.pointer()), - fmt::ptr(sblk.end()), - rmm::detail::bytes{sblk.size()}, - sblk.empty(), + sblk.pointer(), + sblk.end(), + rmm::detail::format_bytes(sblk.size()), + sblk.empty() ? 
"T" : "F", sblk.free_blocks(), - rmm::detail::bytes{sblk.max_free_size()}, - rmm::detail::bytes{static_cast(sblk.pointer() - prev_end)}); + rmm::detail::format_bytes(sblk.max_free_size()), + rmm::detail::format_bytes(static_cast(sblk.pointer() - prev_end)))); prev_end = sblk.end(); index++; } diff --git a/include/rmm/mr/device/detail/coalescing_free_list.hpp b/include/rmm/mr/device/detail/coalescing_free_list.hpp index 8d5cbf9ed..8b056e6d9 100644 --- a/include/rmm/mr/device/detail/coalescing_free_list.hpp +++ b/include/rmm/mr/device/detail/coalescing_free_list.hpp @@ -20,8 +20,6 @@ #include #include -#include - #include #include #include @@ -131,10 +129,7 @@ struct block : public block_base { /** * @brief Print this block. For debugging. */ - inline void print() const - { - std::cout << fmt::format("{} {} B", fmt::ptr(pointer()), size()) << std::endl; - } + inline void print() const { std::cout << pointer() << " " << size() << " B" << std::endl; } #endif private: @@ -146,7 +141,7 @@ struct block : public block_base { /// Print block on an ostream inline std::ostream& operator<<(std::ostream& out, const block& blk) { - out << fmt::format("{} {} B\n", fmt::ptr(blk.pointer()), blk.size()); + out << blk.pointer() << " " << blk.size() << " B" << std::endl; return out; } #endif diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp index 9cf674d6e..f177504f2 100644 --- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp +++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp @@ -19,13 +19,12 @@ #include #include #include +#include #include #include #include -#include - #include #include #include @@ -201,7 +200,7 @@ class stream_ordered_memory_resource : public crtp, public device_ */ void* do_allocate(std::size_t size, cuda_stream_view stream) override { - RMM_LOG_TRACE("[A][stream {:p}][{}B]", fmt::ptr(stream.value()), size); + RMM_LOG_TRACE("[A][stream %s][%zuB]", rmm::detail::format_stream(stream), size); if (size <= 0) { return nullptr; } @@ -215,10 +214,10 @@ class stream_ordered_memory_resource : public crtp, public device_ rmm::out_of_memory); auto const block = this->underlying().get_block(size, stream_event); - RMM_LOG_TRACE("[A][stream {:p}][{}B][{:p}]", - fmt::ptr(stream_event.stream), + RMM_LOG_TRACE("[A][stream %s][%zuB][%p]", + rmm::detail::format_stream(stream_event.stream), size, - fmt::ptr(block.pointer())); + block.pointer()); log_summary_trace(); @@ -234,7 +233,7 @@ class stream_ordered_memory_resource : public crtp, public device_ */ void do_deallocate(void* ptr, std::size_t size, cuda_stream_view stream) override { - RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", fmt::ptr(stream.value()), size, ptr); + RMM_LOG_TRACE("[D][stream %s][%zuB][%p]", rmm::detail::format_stream(stream), size, ptr); if (size <= 0 || ptr == nullptr) { return; } @@ -384,10 +383,10 @@ class stream_ordered_memory_resource : public crtp, public device_ if (merge_first) { merge_lists(stream_event, blocks, other_event, std::move(other_blocks)); - RMM_LOG_DEBUG("[A][Stream {:p}][{}B][Merged stream {:p}]", - fmt::ptr(stream_event.stream), + RMM_LOG_DEBUG("[A][Stream %s][%zuB][Merged stream %s]", + rmm::detail::format_stream(stream_event.stream), size, - fmt::ptr(iter->first.stream)); + rmm::detail::format_stream(iter->first.stream)); stream_free_blocks_.erase(iter); @@ -414,11 +413,11 @@ class stream_ordered_memory_resource : public crtp, public device_ block_type const block = find_block(iter); 
if (block.is_valid()) { - RMM_LOG_DEBUG((merge_first) ? "[A][Stream {:p}][{}B][Found after merging stream {:p}]" - : "[A][Stream {:p}][{}B][Taken from stream {:p}]", - fmt::ptr(stream_event.stream), + RMM_LOG_DEBUG((merge_first) ? "[A][Stream %s][%zuB][Found after merging stream %s]" + : "[A][Stream %s][%zuB][Taken from stream %s]", + rmm::detail::format_stream(stream_event.stream), size, - fmt::ptr(iter->first.stream)); + rmm::detail::format_stream(iter->first.stream)); return block; } } @@ -471,7 +470,7 @@ class stream_ordered_memory_resource : public crtp, public device_ max_block = std::max(summary.first, max_block); free_mem += summary.second; }); - RMM_LOG_TRACE("[Summary][Free lists: {}][Blocks: {}][Max Block: {}][Total Free: {}]", + RMM_LOG_TRACE("[Summary][Free lists: %zu][Blocks: %zu][Max Block: %zu][Total Free: %zu]", stream_free_blocks_.size(), num_blocks, max_block, diff --git a/include/rmm/mr/device/logging_resource_adaptor.hpp b/include/rmm/mr/device/logging_resource_adaptor.hpp index 595ab2e4e..578543852 100644 --- a/include/rmm/mr/device/logging_resource_adaptor.hpp +++ b/include/rmm/mr/device/logging_resource_adaptor.hpp @@ -18,16 +18,17 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -297,10 +298,12 @@ class logging_resource_adaptor final : public device_memory_resource { { try { auto const ptr = get_upstream_resource().allocate_async(bytes, stream); - logger_->info("allocate,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); + logger_->info(rmm::detail::formatted_log( + "allocate,%p,%zu,%s", ptr, bytes, rmm::detail::format_stream(stream))); return ptr; } catch (...) { - logger_->info("allocate failure,{},{},{}", nullptr, bytes, fmt::ptr(stream.value())); + logger_->info(rmm::detail::formatted_log( + "allocate failure,%p,%zu,%s", nullptr, bytes, rmm::detail::format_stream(stream))); throw; } } @@ -321,7 +324,8 @@ class logging_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - logger_->info("free,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); + logger_->info( + rmm::detail::formatted_log("free,%p,%zu,%s", ptr, bytes, rmm::detail::format_stream(stream))); get_upstream_resource().deallocate_async(ptr, bytes, stream); } diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index f63de21ff..037147de3 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -34,8 +35,6 @@ #include #include -#include - #include #include #include @@ -271,8 +270,8 @@ class pool_memory_resource final } try_size = std::max(min_size, try_size / 2); } - RMM_LOG_ERROR("[A][Stream {}][Upstream {}B][FAILURE maximum pool size exceeded]", - fmt::ptr(stream.value()), + RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded]", + rmm::detail::format_stream(stream), min_size); RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); } @@ -351,7 +350,7 @@ class pool_memory_resource final */ std::optional block_from_upstream(std::size_t size, cuda_stream_view stream) { - RMM_LOG_DEBUG("[A][Stream {}][Upstream {}B]", fmt::ptr(stream.value()), size); + RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size); if (size == 0) { return {}; } diff --git 
a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 6a5916e5c..8131eef4d 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -23,8 +23,6 @@ #include #include -#include - #include #include #include @@ -188,7 +186,7 @@ class tracking_resource_adaptor final : public device_memory_resource { void log_outstanding_allocations() const { #if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG - RMM_LOG_DEBUG("Outstanding Allocations: {}", get_outstanding_allocations_str()); + RMM_LOG_DEBUG("Outstanding Allocations: %s", get_outstanding_allocations_str()); #endif // SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG } @@ -236,12 +234,12 @@ class tracking_resource_adaptor final : public device_memory_resource { // Ensure the allocation is found and the number of bytes match if (found == allocations_.end()) { - // Don't throw but log an error. Throwing in a descructor (or any noexcept) will call + // Don't throw but log an error. Throwing in a destructor (or any noexcept) will call // std::terminate RMM_LOG_ERROR( - "Deallocating a pointer that was not tracked. Ptr: {:p} [{}B], Current Num. Allocations: " - "{}", - fmt::ptr(ptr), + "Deallocating a pointer that was not tracked. Ptr: %p [%zuB], Current Num. Allocations: " + "%zu", + ptr, bytes, this->allocations_.size()); } else { @@ -250,10 +248,10 @@ class tracking_resource_adaptor final : public device_memory_resource { auto allocated_bytes = found->second.allocation_size; if (allocated_bytes != bytes) { - // Don't throw but log an error. Throwing in a descructor (or any noexcept) will call + // Don't throw but log an error. Throwing in a destructor (or any noexcept) will call // std::terminate RMM_LOG_ERROR( - "Alloc bytes ({}) and Dealloc bytes ({}) do not match", allocated_bytes, bytes); + "Alloc bytes (%zu) and Dealloc bytes (%zu) do not match", allocated_bytes, bytes); bytes = allocated_bytes; } diff --git a/tests/logger_tests.cpp b/tests/logger_tests.cpp index 643281d91..8a5d37be2 100644 --- a/tests/logger_tests.cpp +++ b/tests/logger_tests.cpp @@ -112,7 +112,6 @@ void expect_log_events(std::string const& filename, // EXPECT_EQ(expected.thread_id, actual.thread_id); // EXPECT_EQ(expected.stream, actual.stream); EXPECT_EQ(expected.act, actual.act); - // device_memory_resource automatically pads an allocation to a multiple of 8 bytes EXPECT_EQ(expected.size, actual.size); EXPECT_EQ(expected.pointer, actual.pointer); return true; diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 95cc9c9c1..67f183a23 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -574,10 +574,10 @@ TEST_F(ArenaTest, DumpLogOnFailure) // NOLINT std::size_t num_threads{4}; threads.reserve(num_threads); for (std::size_t i = 0; i < num_threads; ++i) { - threads.emplace_back(std::thread([&] { + threads.emplace_back([&] { void* ptr = mr.allocate(32_KiB); mr.deallocate(ptr, 32_KiB); - })); + }); } for (auto& thread : threads) { diff --git a/tests/mr/device/callback_mr_tests.cpp b/tests/mr/device/callback_mr_tests.cpp index a56efa60c..a7f6ab7be 100644 --- a/tests/mr/device/callback_mr_tests.cpp +++ b/tests/mr/device/callback_mr_tests.cpp @@ -23,11 +23,11 @@ #include #include -#include #include #include #include +#include namespace rmm::test { namespace { @@ -78,8 +78,9 @@ TEST(CallbackTest, LoggingTest) auto* ptr = mr.allocate(size); mr.deallocate(ptr, size); - std::string output = 
testing::internal::GetCapturedStdout(); - std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", size, size); + auto output = testing::internal::GetCapturedStdout(); + auto expect = std::string("Allocating ") + std::to_string(size) + " bytes\nDeallocating " + + std::to_string(size) + " bytes\n"; ASSERT_EQ(expect, output); } From c7fc01794a78ccb6b4dc173d4ed20db156ebb39a Mon Sep 17 00:00:00 2001 From: Mike Sarahan Date: Thu, 14 Nov 2024 15:11:20 -0600 Subject: [PATCH 26/29] adding telemetry (#1692) Closes #1691. Authors: - Mike Sarahan (https://github.com/msarahan) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1692 --- .github/workflows/pr.yaml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4dfcaf1ae..6780298c3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,6 +19,7 @@ jobs: - conda-python-build - conda-python-tests - docs-build + - telemetry-setup - wheel-build-cpp - wheel-build-python - wheel-tests @@ -28,7 +29,17 @@ jobs: if: always() with: needs: ${{ toJSON(needs) }} + telemetry-setup: + runs-on: ubuntu-latest + continue-on-error: true + env: + OTEL_SERVICE_NAME: "pr-rmm" + steps: + - name: Telemetry setup + uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main changed-files: + needs: + - telemetry-setup secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 with: @@ -50,9 +61,12 @@ jobs: - '!img/**' checks: secrets: inherit + needs: + - telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false + ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit @@ -114,6 +128,8 @@ jobs: script: ci/test_wheel.sh devcontainer: secrets: inherit + needs: + - telemetry-setup uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' @@ -122,3 +138,18 @@ jobs: sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; sccache -s; + + telemetry-summarize: + runs-on: ubuntu-latest + needs: pr-builder + if: always() + continue-on-error: true + steps: + - name: Load stashed telemetry env vars + uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main + with: + load_service_name: true + - name: Telemetry summarize + uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main + with: + cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" From 929a595e639b61b583cc584b1c291f9559cef673 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 19 Nov 2024 09:51:53 -0500 Subject: [PATCH 27/29] Make `cudaMallocAsync` logic non-optional as we require CUDA 11.2+ (#1667) We can remove the optimizations around `CUDA_STATIC_RUNTIME` and instead check whether the function is already in the process space, so that RMM doesn't need any build-time configuration to run properly. Fixes #1679.
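In practice the runtime check that replaces the old dlopen machinery reduces to a single driver attribute query. The snippet below is condensed from the new rmm/detail/runtime_async_alloc.hpp added in this patch (a sketch for orientation only; the full header appears in the diff):

// Condensed from the new rmm/detail/runtime_async_alloc.hpp below: rather
// than dlopen()ing libcudart at runtime, ask the driver whether stream-ordered
// memory pools are supported on the current device.
#include <cuda_runtime_api.h>

#include <rmm/cuda_device.hpp>

inline bool driver_supports_async_alloc()
{
  int cuda_pool_supported{};
  auto const result = cudaDeviceGetAttribute(&cuda_pool_supported,
                                             cudaDevAttrMemoryPoolsSupported,
                                             rmm::get_current_cuda_device().value());
  return result == cudaSuccess and cuda_pool_supported == 1;
}

Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/rmm/pull/1667 --- CMakeLists.txt | 1 - .../multi_stream_allocations_bench.cu | 6 - .../random_allocations/random_allocations.cpp | 6 - include/rmm/detail/dynamic_load_runtime.hpp | 191 ------------------ 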
include/rmm/detail/runtime_async_alloc.hpp | 79 ++++++++ .../mr/device/cuda_async_memory_resource.hpp | 54 ++--- .../cuda_async_view_memory_resource.hpp | 32 +-- python/rmm/rmm/tests/test_rmm.py | 34 +--- tests/CMakeLists.txt | 17 +- tests/mr/device/cuda_async_mr_tests.cpp | 17 +- tests/mr/device/cuda_async_view_mr_tests.cpp | 13 +- .../mr/device/mr_ref_multithreaded_tests.cpp | 16 +- tests/mr/device/mr_ref_test.hpp | 2 +- tests/mr/device/mr_ref_tests.cpp | 4 - tests/mr/device/thrust_allocator_tests.cu | 16 +- tests/prefetch_tests.cpp | 2 +- 16 files changed, 131 insertions(+), 359 deletions(-) delete mode 100644 include/rmm/detail/dynamic_load_runtime.hpp create mode 100644 include/rmm/detail/runtime_async_alloc.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 44d7fbb79..07bd368ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,7 +89,6 @@ target_include_directories(rmm INTERFACE "$Apply(benchmark_range); return; } -#endif if (name == "pool") { BENCHMARK_CAPTURE(BM_MultiStreamAllocations, pool_mr, &make_pool) // @@ -248,9 +244,7 @@ int main(int argc, char** argv) resource_names.emplace_back(args["resource"].as()); } else { resource_names.emplace_back("cuda"); -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT resource_names.emplace_back("cuda_async"); -#endif resource_names.emplace_back("pool"); resource_names.emplace_back("arena"); resource_names.emplace_back("binning"); diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 57116743b..2971f7e40 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -316,9 +316,7 @@ int main(int argc, char** argv) std::map const funcs({{"arena", &make_arena}, {"binning", &make_binning}, {"cuda", &make_cuda}, -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT {"cuda_async", &make_cuda_async}, -#endif {"pool", &make_pool}}); auto resource = args["resource"].as(); @@ -340,11 +338,7 @@ int main(int argc, char** argv) std::string mr_name = args["resource"].as(); declare_benchmark(mr_name); } else { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT std::vector mrs{"pool", "binning", "arena", "cuda_async", "cuda"}; -#else - std::vector mrs{"pool", "binning", "arena", "cuda"}; -#endif std::for_each( std::cbegin(mrs), std::cend(mrs), [](auto const& mr) { declare_benchmark(mr); }); } diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp deleted file mode 100644 index 214228752..000000000 --- a/include/rmm/detail/dynamic_load_runtime.hpp +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include -#include - -#include - -#include - -#include -#include - -namespace RMM_NAMESPACE { -namespace detail { - -/** - * @brief `dynamic_load_runtime` loads the cuda runtime library at runtime - * - * By loading the cudart library at runtime we can use functions that - * are added in newer minor versions of the cuda runtime. - */ -struct dynamic_load_runtime { - static void* get_cuda_runtime_handle() - { - auto close_cudart = [](void* handle) { ::dlclose(handle); }; - auto open_cudart = []() { - ::dlerror(); - const int major = CUDART_VERSION / 1000; - - // In CUDA 12 the SONAME is correctly defined as libcudart.12, but for - // CUDA<=11 it includes an extra 0 minor version e.g. libcudart.11.0. We - // also allow finding the linker name. - const std::string libname_ver_cuda_11 = "libcudart.so." + std::to_string(major) + ".0"; - const std::string libname_ver_cuda_12 = "libcudart.so." + std::to_string(major); - const std::string libname = "libcudart.so"; - - void* ptr = nullptr; - for (auto&& name : {libname_ver_cuda_12, libname_ver_cuda_11, libname}) { - ptr = dlopen(name.c_str(), RTLD_LAZY); - if (ptr != nullptr) break; - } - - if (ptr != nullptr) { return ptr; } - - RMM_FAIL("Unable to dlopen cudart"); - }; - static std::unique_ptr cudart_handle{open_cudart(), close_cudart}; - return cudart_handle.get(); - } - - template - using function_sig = std::add_pointer_t; - - template - static std::optional function(const char* func_name) - { - auto* runtime = get_cuda_runtime_handle(); - auto* handle = ::dlsym(runtime, func_name); - if (!handle) { return std::nullopt; } - auto* function_ptr = reinterpret_cast(handle); - return std::optional(function_ptr); - } -}; - -#if defined(RMM_STATIC_CUDART) -// clang-format off -#define RMM_CUDART_API_WRAPPER(name, signature) \ - template \ - static cudaError_t name(Args... args) \ - { \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Waddress\"") \ - static_assert(static_cast(::name), \ - "Failed to find #name function with arguments #signature"); \ - _Pragma("GCC diagnostic pop") \ - return ::name(args...); \ - } -// clang-format on -#else -#define RMM_CUDART_API_WRAPPER(name, signature) \ - template \ - static cudaError_t name(Args... args) \ - { \ - static const auto func = dynamic_load_runtime::function(#name); \ - if (func) { return (*func)(args...); } \ - RMM_FAIL("Failed to find #name function in libcudart.so"); \ - } -#endif - -#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync -/** - * @brief Bind to the stream-ordered memory allocator functions - * at runtime. - * - * This allows RMM users to compile/link against CUDA 11.2+ and run with - * < CUDA 11.2 runtime as these functions are found at call time. - */ -struct async_alloc { - static bool is_supported() - { -#if defined(RMM_STATIC_CUDART) - static bool runtime_supports_pool = (CUDART_VERSION >= 11020); -#else - static bool runtime_supports_pool = - dynamic_load_runtime::function>( - "cudaFreeAsync") - .has_value(); -#endif - - static auto driver_supports_pool{[] { - int cuda_pool_supported{}; - auto result = cudaDeviceGetAttribute(&cuda_pool_supported, - cudaDevAttrMemoryPoolsSupported, - rmm::get_current_cuda_device().value()); - return result == cudaSuccess and cuda_pool_supported == 1; - }()}; - return runtime_supports_pool and driver_supports_pool; - } - - /** - * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present - * CUDA driver/runtime version. 
- * - * @note This query was introduced in CUDA 11.3 so on CUDA 11.2 this function will only return - * true for `cudaMemHandleTypeNone`. - * - * @param handle_type An IPC export handle type to check for support. - * @return true if supported - * @return false if unsupported - */ - static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type) - { - int supported_handle_types_bitmask{}; -#if CUDART_VERSION >= 11030 // 11.3 introduced cudaDevAttrMemoryPoolSupportedHandleTypes - if (cudaMemHandleTypeNone != handle_type) { - auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask, - cudaDevAttrMemoryPoolSupportedHandleTypes, - rmm::get_current_cuda_device().value()); - - // Don't throw on cudaErrorInvalidValue - auto const unsupported_runtime = (result == cudaErrorInvalidValue); - if (unsupported_runtime) return false; - // throw any other error that may have occurred - RMM_CUDA_TRY(result); - } - -#endif - return (supported_handle_types_bitmask & handle_type) == handle_type; - } - - template - using cudart_sig = dynamic_load_runtime::function_sig; - - using cudaMemPoolCreate_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMemPoolCreate, cudaMemPoolCreate_sig); - - using cudaMemPoolSetAttribute_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMemPoolSetAttribute, cudaMemPoolSetAttribute_sig); - - using cudaMemPoolDestroy_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMemPoolDestroy, cudaMemPoolDestroy_sig); - - using cudaMallocFromPoolAsync_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync_sig); - - using cudaFreeAsync_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaFreeAsync, cudaFreeAsync_sig); - - using cudaDeviceGetDefaultMemPool_sig = cudart_sig; - RMM_CUDART_API_WRAPPER(cudaDeviceGetDefaultMemPool, cudaDeviceGetDefaultMemPool_sig); -}; -#endif - -#undef RMM_CUDART_API_WRAPPER -} // namespace detail -} // namespace RMM_NAMESPACE diff --git a/include/rmm/detail/runtime_async_alloc.hpp b/include/rmm/detail/runtime_async_alloc.hpp new file mode 100644 index 000000000..6ddb2228b --- /dev/null +++ b/include/rmm/detail/runtime_async_alloc.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +#include +#include + +namespace RMM_NAMESPACE { +namespace detail { + +/** + * @brief Determine at runtime if the CUDA driver supports the stream-ordered + * memory allocator functions. + * + * This allows RMM users to compile/link against CUDA 11.2+ and run with + * older drivers. 
+ */ + +struct runtime_async_alloc { + static bool is_supported() + { + static auto driver_supports_pool{[] { + int cuda_pool_supported{}; + auto result = cudaDeviceGetAttribute(&cuda_pool_supported, + cudaDevAttrMemoryPoolsSupported, + rmm::get_current_cuda_device().value()); + return result == cudaSuccess and cuda_pool_supported == 1; + }()}; + return driver_supports_pool; + } + + /** + * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present + * CUDA driver/runtime version. + * + * @param handle_type An IPC export handle type to check for support. + * @return true if supported + * @return false if unsupported + */ + static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type) + { + int supported_handle_types_bitmask{}; + if (cudaMemHandleTypeNone != handle_type) { + auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask, + cudaDevAttrMemoryPoolSupportedHandleTypes, + rmm::get_current_cuda_device().value()); + + // Don't throw on cudaErrorInvalidValue + auto const unsupported_runtime = (result == cudaErrorInvalidValue); + if (unsupported_runtime) return false; + // throw any other error that may have occurred + RMM_CUDA_TRY(result); + } + return (supported_handle_types_bitmask & handle_type) == handle_type; + } +}; + +} // namespace detail +} // namespace RMM_NAMESPACE diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp index 52fd2fe4e..b1fc0b112 100644 --- a/include/rmm/mr/device/cuda_async_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp @@ -17,9 +17,9 @@ #include #include -#include #include #include +#include #include #include #include @@ -31,12 +31,6 @@ #include #include -#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync -#ifndef RMM_DISABLE_CUDA_MALLOC_ASYNC -#define RMM_CUDA_MALLOC_ASYNC_SUPPORT -#endif -#endif - namespace RMM_NAMESPACE { namespace mr { /** @@ -91,9 +85,8 @@ class cuda_async_memory_resource final : public device_memory_resource { std::optional release_threshold = {}, std::optional export_handle_type = {}) { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT // Check if cudaMallocAsync Memory pool supported - RMM_EXPECTS(rmm::detail::async_alloc::is_supported(), + RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(), "cudaMallocAsync not supported with this CUDA driver/runtime version"); // Construct explicit pool @@ -101,12 +94,13 @@ class cuda_async_memory_resource final : public device_memory_resource { pool_props.allocType = cudaMemAllocationTypePinned; pool_props.handleTypes = static_cast( export_handle_type.value_or(allocation_handle_type::none)); - RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes), - "Requested IPC memory handle type not supported"); + RMM_EXPECTS( + rmm::detail::runtime_async_alloc::is_export_handle_type_supported(pool_props.handleTypes), + "Requested IPC memory handle type not supported"); pool_props.location.type = cudaMemLocationTypeDevice; pool_props.location.id = rmm::get_current_cuda_device().value(); cudaMemPool_t cuda_pool_handle{}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props)); + RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props)); pool_ = cuda_async_view_memory_resource{cuda_pool_handle}; // CUDA drivers before 11.5 have known incompatibilities with the async allocator. 
@@ -117,41 +111,34 @@ class cuda_async_memory_resource final : public device_memory_resource { constexpr auto min_async_version{11050}; if (driver_version < min_async_version) { int disabled{0}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute( - pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled)); + RMM_CUDA_TRY( + cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled)); } auto const [free, total] = rmm::available_device_memory(); // Need an l-value to take address to pass to cudaMemPoolSetAttribute uint64_t threshold = release_threshold.value_or(total); - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute( - pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold)); + RMM_CUDA_TRY( + cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold)); // Allocate and immediately deallocate the initial_pool_size to prime the pool with the // specified size auto const pool_size = initial_pool_size.value_or(free / 2); auto* ptr = do_allocate(pool_size, cuda_stream_default); do_deallocate(ptr, pool_size, cuda_stream_default); -#else - RMM_FAIL( - "cudaMallocAsync not supported by the version of the CUDA Toolkit used for this build"); -#endif } -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Returns the underlying native handle to the CUDA pool * + * @return cudaMemPool_t Handle to the underlying CUDA pool */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } -#endif ~cuda_async_memory_resource() override { -#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) - RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle())); -#endif + RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle())); } cuda_async_memory_resource(cuda_async_memory_resource const&) = delete; cuda_async_memory_resource(cuda_async_memory_resource&&) = delete; @@ -159,9 +146,7 @@ class cuda_async_memory_resource final : public device_memory_resource { cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete; private: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT cuda_async_view_memory_resource pool_{}; -#endif /** * @brief Allocates memory of size at least \p bytes. 
@@ -175,12 +160,7 @@ class cuda_async_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { void* ptr{nullptr}; -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT ptr = pool_.allocate(bytes, stream); -#else - (void)bytes; - (void)stream; -#endif return ptr; } @@ -194,13 +174,7 @@ class cuda_async_memory_resource final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT pool_.deallocate(ptr, bytes, stream); -#else - (void)ptr; - (void)bytes; - (void)stream; -#endif } /** @@ -213,11 +187,7 @@ class cuda_async_memory_resource final : public device_memory_resource { [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { auto const* async_mr = dynamic_cast(&other); -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle()); -#else - return async_mr != nullptr; -#endif } }; diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 3e1900e72..180c412ee 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -28,10 +27,6 @@ #include #include -#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync -#define RMM_CUDA_MALLOC_ASYNC_SUPPORT -#endif - namespace RMM_NAMESPACE { namespace mr { /** @@ -46,13 +41,12 @@ namespace mr { */ class cuda_async_view_memory_resource final : public device_memory_resource { public: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Constructs a cuda_async_view_memory_resource which uses an existing CUDA memory pool. * The provided pool is not owned by cuda_async_view_memory_resource and must remain valid * during the lifetime of the memory resource. * - * @throws rmm::runtime_error if the CUDA version does not support `cudaMallocAsync` + * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync` * * @param valid_pool_handle Handle to a CUDA memory pool which will be used to * serve allocation requests. @@ -71,15 +65,13 @@ class cuda_async_view_memory_resource final : public device_memory_resource { RMM_EXPECTS(result == cudaSuccess && cuda_pool_supported, "cudaMallocAsync not supported with this CUDA driver/runtime version"); } -#endif -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Returns the underlying native handle to the CUDA pool * + * @return cudaMemPool_t Handle to the underlying CUDA pool */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; } -#endif cuda_async_view_memory_resource() = default; cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = @@ -92,9 +84,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { default; ///< @default_move_assignment{cuda_async_view_memory_resource} private: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT cudaMemPool_t cuda_pool_handle_{}; -#endif /** * @brief Allocates memory of size at least \p bytes. 
@@ -108,15 +98,9 @@ class cuda_async_view_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { void* ptr{nullptr}; -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT if (bytes > 0) { - RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync( - &ptr, bytes, pool_handle(), stream.value())); + RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value())); } -#else - (void)bytes; - (void)stream; -#endif return ptr; } @@ -132,15 +116,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { [[maybe_unused]] std::size_t bytes, rmm::cuda_stream_view stream) override { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT - if (ptr != nullptr) { - RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value())); - } -#else - (void)ptr; - (void)bytes; - (void)stream; -#endif + if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); } } /** diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index b52ea0179..d7d692287 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -32,12 +32,6 @@ cuda.set_memory_manager(RMMNumbaManager) -_driver_version = rmm._cuda.gpu.driverGetVersion() -_runtime_version = rmm._cuda.gpu.runtimeGetVersion() -_CUDAMALLOC_ASYNC_SUPPORTED = (_driver_version >= 11020) and ( - _runtime_version >= 11020 -) - _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( cudart.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess, rmm._cuda.gpu.getDevice(), @@ -657,10 +651,6 @@ def test_mr_upstream_lifetime(): del pool_mr -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) @@ -671,15 +661,11 @@ def test_cuda_async_memory_resource(dtype, nelem, alloc): array_tester(dtype, nelem, alloc) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) def test_cuda_async_memory_resource_ipc(): # TODO: We don't have a great way to check if IPC is supported in Python, # without using the C++ function - # rmm::detail::async_alloc::is_export_handle_type_supported. We can't - # accurately test driver and runtime versions for this via Python because + # rmm::detail::runtime_async_alloc::is_export_handle_type_supported. 
+ # We can't accurately test this via Python because # cuda-python always has the IPC handle enum defined (which normally # requires a CUDA 11.3 runtime) and the cuda-compat package in Docker # containers prevents us from assuming that the driver we see actually @@ -702,10 +688,6 @@ def test_cuda_async_memory_resource_ipc(): assert rmm.mr.get_current_device_resource_type() is type(mr) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("nelems", _nelems) def test_cuda_async_memory_resource_stream(nelems): # test that using CudaAsyncMemoryResource @@ -719,10 +701,6 @@ def test_cuda_async_memory_resource_stream(nelems): np.testing.assert_equal(expected, result) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) def test_cuda_async_memory_resource_threshold(nelem, alloc): @@ -739,13 +717,7 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): "mr", [ rmm.mr.CudaMemoryResource, - pytest.param( - rmm.mr.CudaAsyncMemoryResource, - marks=pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", - ), - ), + pytest.param(rmm.mr.CudaAsyncMemoryResource), ], ) def test_limiting_resource_adaptor(mr): diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a482c8cc1..476028af0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -84,7 +84,7 @@ endfunction() function(ConfigureTest TEST_NAME) set(options) - set(one_value GPUS PERCENT) + set(one_value CUDART GPUS PERCENT) set(multi_value) cmake_parse_arguments(_RMM_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _RMM_TEST_GPUS AND NOT DEFINED _RMM_TEST_PERCENT) @@ -98,13 +98,23 @@ function(ConfigureTest TEST_NAME) set(_RMM_TEST_PERCENT 100) endif() + if(_RMM_TEST_CUDART STREQUAL SHARED) + set(cudart_link_libs $ CUDA::cudart) + elseif(_RMM_TEST_CUDART STREQUAL STATIC) + set(cudart_link_libs $ CUDA::cudart_static) + else() + set(cudart_link_libs rmm) + endif() + # Test with legacy default stream. ConfigureTestInternal(${TEST_NAME} ${_RMM_TEST_UNPARSED_ARGUMENTS}) + target_link_libraries(${TEST_NAME} ${cudart_link_libs}) # Test with per-thread default stream. 
string(REGEX REPLACE "_TEST$" "_PTDS_TEST" PTDS_TEST_NAME "${TEST_NAME}") ConfigureTestInternal("${PTDS_TEST_NAME}" ${_RMM_TEST_UNPARSED_ARGUMENTS}) target_compile_definitions("${PTDS_TEST_NAME}" PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + target_link_libraries(${PTDS_TEST_NAME} ${cudart_link_libs}) foreach(name ${TEST_NAME} ${PTDS_TEST_NAME} ${NS_TEST_NAME}) rapids_test_add( @@ -130,7 +140,10 @@ ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp) ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp GPUS 1 PERCENT 100) # cuda_async mr tests -ConfigureTest(CUDA_ASYNC_MR_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60) +ConfigureTest(CUDA_ASYNC_MR_STATIC_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60 + CUDART STATIC) +ConfigureTest(CUDA_ASYNC_MR_SHARED_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60 + CUDART SHARED) # thrust allocator tests ConfigureTest(THRUST_ALLOCATOR_TEST mr/device/thrust_allocator_tests.cu GPUS 1 PERCENT 60) diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp index 90c7b0ff9..a39188548 100644 --- a/tests/mr/device/cuda_async_mr_tests.cpp +++ b/tests/mr/device/cuda_async_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,24 +31,13 @@ class AsyncMRTest : public ::testing::Test { protected: void SetUp() override { - if (!rmm::detail::async_alloc::is_supported()) { + if (!rmm::detail::runtime_async_alloc::is_supported()) { GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA " << "driver/runtime version"; } } }; -TEST_F(AsyncMRTest, ThrowIfNotSupported) -{ - auto construct_mr = []() { cuda_async_mr mr; }; -#ifndef RMM_CUDA_MALLOC_ASYNC_SUPPORT - EXPECT_THROW(construct_mr(), rmm::logic_error); -#else - EXPECT_NO_THROW(construct_mr()); -#endif -} - -#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) TEST_F(AsyncMRTest, ExplicitInitialPoolSize) { const auto pool_init_size{100}; @@ -77,7 +66,5 @@ TEST_F(AsyncMRTest, DifferentPoolsUnequal) EXPECT_FALSE(mr1.is_equal(mr2)); } -#endif - } // namespace } // namespace rmm::test diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp index fe82431a9..f3a02cbf0 100644 --- a/tests/mr/device/cuda_async_view_mr_tests.cpp +++ b/tests/mr/device/cuda_async_view_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,13 +29,10 @@ using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource; static_assert(cuda::mr::resource_with); static_assert(cuda::mr::async_resource_with); -#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) - TEST(PoolTest, UsePool) { cudaMemPool_t memPool{}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool( - &memPool, rmm::get_current_cuda_device().value())); + RMM_CUDA_TRY(cudaDeviceGetDefaultMemPool(&memPool, rmm::get_current_cuda_device().value())); const auto pool_init_size{100}; cuda_async_view_mr mr{memPool}; @@ -53,7 +50,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool) cudaMemPool_t memPool{}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&memPool, &poolProps)); + RMM_CUDA_TRY(cudaMemPoolCreate(&memPool, &poolProps)); { const auto pool_init_size{100}; @@ -64,7 +61,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool) } auto destroy_valid_pool = [&]() { - auto result = rmm::detail::async_alloc::cudaMemPoolDestroy(memPool); + auto result = cudaMemPoolDestroy(memPool); RMM_EXPECTS(result == cudaSuccess, "Pool wrapper did destroy pool"); }; @@ -81,7 +78,5 @@ TEST(PoolTest, ThrowIfNullptrPool) EXPECT_THROW(construct_mr(), rmm::logic_error); } -#endif - } // namespace } // namespace rmm::test diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp index 944ba1807..9e7c8c2e8 100644 --- a/tests/mr/device/mr_ref_multithreaded_tests.cpp +++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp @@ -36,17 +36,11 @@ namespace { struct mr_ref_test_mt : public mr_ref_test {}; -INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests, - mr_ref_test_mt, - ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT - "CUDA_Async", -#endif - "Managed", - "Pool", - "Arena", - "Binning"), - [](auto const& info) { return info.param; }); +INSTANTIATE_TEST_CASE_P( + MultiThreadResourceTests, + mr_ref_test_mt, + ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"), + [](auto const& info) { return info.param; }); template void spawn_n(std::size_t num_threads, Task task, Arguments&&... 
args) diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp index 6e63b3838..2af0eff44 100644 --- a/tests/mr/device/mr_ref_test.hpp +++ b/tests/mr/device/mr_ref_test.hpp @@ -347,7 +347,7 @@ inline auto make_host_pinned() { return std::make_shared(); } return std::shared_ptr{nullptr}; diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp index 55e91d765..41af050a0 100644 --- a/tests/mr/device/mr_ref_tests.cpp +++ b/tests/mr/device/mr_ref_tests.cpp @@ -30,9 +30,7 @@ namespace { INSTANTIATE_TEST_SUITE_P(ResourceTests, mr_ref_test, ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT "CUDA_Async", -#endif "Managed", "System", "Pool", @@ -46,9 +44,7 @@ INSTANTIATE_TEST_SUITE_P(ResourceTests, INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests, mr_ref_allocation_test, ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT "CUDA_Async", -#endif "Managed", "System", "Pool", diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu index 84f599957..46447aa09 100644 --- a/tests/mr/device/thrust_allocator_tests.cu +++ b/tests/mr/device/thrust_allocator_tests.cu @@ -69,17 +69,11 @@ TEST_P(allocator_test, multi_device) }()); } -INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests, - allocator_test, - ::testing::Values("CUDA", -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT - "CUDA_Async", -#endif - "Managed", - "Pool", - "Arena", - "Binning"), - [](auto const& info) { return info.param; }); +INSTANTIATE_TEST_CASE_P( + ThrustAllocatorTests, + allocator_test, + ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"), + [](auto const& info) { return info.param; }); } // namespace } // namespace rmm::test diff --git a/tests/prefetch_tests.cpp b/tests/prefetch_tests.cpp index 6c7bb2dd3..4a2c41a2b 100644 --- a/tests/prefetch_tests.cpp +++ b/tests/prefetch_tests.cpp @@ -53,8 +53,8 @@ struct PrefetchTest : public ::testing::Test { // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8048f6ea5ad77917444567656c140c5a // specifically for when cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation is // used. - constexpr size_t prefetch_data_size = 4; if constexpr (std::is_same_v) { + constexpr size_t prefetch_data_size = 4; int prefetch_location{0}; RMM_CUDA_TRY( cudaMemRangeGetAttribute(&prefetch_location, From 3b5f6af2eaa0519643ccc2a4c1395307bfd3ad7e Mon Sep 17 00:00:00 2001 From: Mark Harris <783069+harrism@users.noreply.github.com> Date: Wed, 20 Nov 2024 12:22:49 +1100 Subject: [PATCH 28/29] Query total memory in failure_callback_resource_adaptor tests (#1734) Fixes #1733 by querying total device memory and using twice as much in tests that are expected to fail allocation. 
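The same strategy, restated as a hedged C++ sketch (the patch itself only touches the Python tests; rmm::available_device_memory() is the same {free, total} query used by cuda_async_memory_resource earlier in this series):

// Sketch of the test strategy, not part of the patch: request more memory
// than the device can ever provide so the allocation fails on any GPU.
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <new>

inline bool allocation_of_twice_total_memory_fails()
{
  // available_device_memory() returns a {free, total} pair for the current device.
  auto const total = rmm::available_device_memory().second;
  try {
    rmm::device_buffer buffer{total * 2, rmm::cuda_stream_view{}};
  } catch (std::bad_alloc const&) {  // rmm's out-of-memory errors derive from this
    return true;  // expected: no device can satisfy twice its own capacity
  }
  return false;
}

The Python change below applies the same arithmetic through rmm.mr.available_device_memory().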
Authors: - Mark Harris (https://github.com/harrism) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/1734 --- python/rmm/rmm/tests/test_rmm.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index d7d692287..182434dc5 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -795,10 +795,28 @@ def callback(nbytes: int) -> bool: rmm.mr.set_current_device_resource(mr) with pytest.raises(MemoryError): - rmm.DeviceBuffer(size=int(1e11)) + from rmm.mr import available_device_memory + + total_memory = available_device_memory()[1] + rmm.DeviceBuffer(size=total_memory * 2) assert retried[0] +def test_failure_callback_resource_adaptor_error(): + def callback(nbytes: int) -> bool: + raise RuntimeError("MyError") + + cuda_mr = rmm.mr.CudaMemoryResource() + mr = rmm.mr.FailureCallbackResourceAdaptor(cuda_mr, callback) + rmm.mr.set_current_device_resource(mr) + + with pytest.raises(RuntimeError, match="MyError"): + from rmm.mr import available_device_memory + + total_memory = available_device_memory()[1] + rmm.DeviceBuffer(size=total_memory * 2) + + @pytest.mark.parametrize("managed", [True, False]) def test_prefetch_resource_adaptor(managed): if managed: @@ -823,18 +841,6 @@ def test_prefetch_resource_adaptor(managed): assert_prefetched(db, device) -def test_failure_callback_resource_adaptor_error(): - def callback(nbytes: int) -> bool: - raise RuntimeError("MyError") - - cuda_mr = rmm.mr.CudaMemoryResource() - mr = rmm.mr.FailureCallbackResourceAdaptor(cuda_mr, callback) - rmm.mr.set_current_device_resource(mr) - - with pytest.raises(RuntimeError, match="MyError"): - rmm.DeviceBuffer(size=int(1e11)) - - def test_dev_buf_circle_ref_dealloc(): # This test creates a reference cycle containing a `DeviceBuffer` # and ensures that the garbage collector does not clear it, i.e., From b391ce867a62bbfc3283779d70231be5b00e2e84 Mon Sep 17 00:00:00 2001 From: Ray Douglass Date: Wed, 11 Dec 2024 13:12:21 -0500 Subject: [PATCH 29/29] Update Changelog [skip ci] --- CHANGELOG.md | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1268762b2..817a0c8c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,48 @@ +# rmm 24.12.00 (11 Dec 2024) + +## 🚨 Breaking Changes + +- Deprecate support for directly accessing logger ([#1690](https://github.com/rapidsai/rmm/pull/1690)) [@vyasr](https://github.com/vyasr) + +## 🐛 Bug Fixes + +- Query total memory in failure_callback_resource_adaptor tests ([#1734](https://github.com/rapidsai/rmm/pull/1734)) [@harrism](https://github.com/harrism) +- Treat deprecation warnings as errors and fix deprecation warnings in replay benchmark ([#1728](https://github.com/rapidsai/rmm/pull/1728)) [@harrism](https://github.com/harrism) +- Disallow cuda-python 12.6.1 and 11.8.4 ([#1720](https://github.com/rapidsai/rmm/pull/1720)) [@bdice](https://github.com/bdice) +- Fix typos in .gitignore ([#1697](https://github.com/rapidsai/rmm/pull/1697)) [@charlesbluca](https://github.com/charlesbluca) +- Fix `rmm ._lib` imports ([#1693](https://github.com/rapidsai/rmm/pull/1693)) [@Matt711](https://github.com/Matt711) + +## 📖 Documentation + +- Fix docs warning ([#1706](https://github.com/rapidsai/rmm/pull/1706)) [@bdice](https://github.com/bdice) +- Update cross-link to cuda-python object 
([#1699](https://github.com/rapidsai/rmm/pull/1699)) [@wence-](https://github.com/wence-) + +## 🚀 New Features + +- Correct rmm tests for validity of device pointers ([#1714](https://github.com/rapidsai/rmm/pull/1714)) [@robertmaynard](https://github.com/robertmaynard) +- Update rmm tests to use rapids_cmake_support_conda_env ([#1707](https://github.com/rapidsai/rmm/pull/1707)) [@robertmaynard](https://github.com/robertmaynard) +- adding telemetry ([#1692](https://github.com/rapidsai/rmm/pull/1692)) [@msarahan](https://github.com/msarahan) +- Make `cudaMallocAsync` logic non-optional as we require CUDA 11.2+ ([#1667](https://github.com/rapidsai/rmm/pull/1667)) [@robertmaynard](https://github.com/robertmaynard) + +## 🛠️ Improvements + +- enforce wheel size limits, README formatting in CI ([#1726](https://github.com/rapidsai/rmm/pull/1726)) [@jameslamb](https://github.com/jameslamb) +- Remove all explicit usage of fmtlib ([#1724](https://github.com/rapidsai/rmm/pull/1724)) [@harrism](https://github.com/harrism) +- WIP: put a ceiling on cuda-python ([#1723](https://github.com/rapidsai/rmm/pull/1723)) [@jameslamb](https://github.com/jameslamb) +- use rapids-generate-pip-constraints to pin to oldest dependencies in CI ([#1716](https://github.com/rapidsai/rmm/pull/1716)) [@jameslamb](https://github.com/jameslamb) +- Deprecate `rmm._lib` ([#1713](https://github.com/rapidsai/rmm/pull/1713)) [@Matt711](https://github.com/Matt711) +- print sccache stats in builds ([#1712](https://github.com/rapidsai/rmm/pull/1712)) [@jameslamb](https://github.com/jameslamb) +- [fea] Expose the arena mr to the Python interface. ([#1711](https://github.com/rapidsai/rmm/pull/1711)) [@trivialfis](https://github.com/trivialfis) +- devcontainer: replace `VAULT_HOST` with `AWS_ROLE_ARN` ([#1708](https://github.com/rapidsai/rmm/pull/1708)) [@jjacobelli](https://github.com/jjacobelli) +- make conda installs in CI stricter (part 2) ([#1703](https://github.com/rapidsai/rmm/pull/1703)) [@jameslamb](https://github.com/jameslamb) +- Add BUILD_SHARED_LIBS option defaulting to ON ([#1702](https://github.com/rapidsai/rmm/pull/1702)) [@wence-](https://github.com/wence-) +- make conda installs in CI stricter ([#1696](https://github.com/rapidsai/rmm/pull/1696)) [@jameslamb](https://github.com/jameslamb) +- Prune workflows based on changed files ([#1695](https://github.com/rapidsai/rmm/pull/1695)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Deprecate support for directly accessing logger ([#1690](https://github.com/rapidsai/rmm/pull/1690)) [@vyasr](https://github.com/vyasr) +- Use `rmm::percent_of_free_device_memory` in arena test ([#1689](https://github.com/rapidsai/rmm/pull/1689)) [@wence-](https://github.com/wence-) +- exclude 'gcovr' from list of development pip packages ([#1688](https://github.com/rapidsai/rmm/pull/1688)) [@jameslamb](https://github.com/jameslamb) +- [Improvement] Reorganize Cython to separate C++ bindings and make Cython classes public ([#1676](https://github.com/rapidsai/rmm/pull/1676)) [@Matt711](https://github.com/Matt711) + # rmm 24.10.00 (9 Oct 2024) ## 🚨 Breaking Changes