diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 9d35e3f97..5d1d53670 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -26,5 +26,5 @@ ENV PYTHONDONTWRITEBYTECODE="1" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" -ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" ENV HISTFILE="/home/coder/.cache/._bash_history" diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 3bfef6706..549ffa67b 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index 5bfc30823..d6dd7b6ce 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda11.8-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda11.8-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json index 925557b22..17e8d5cd0 100644 --- a/.devcontainer/cuda12.5-conda/devcontainer.json +++ b/.devcontainer/cuda12.5-conda/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-mambaforge-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-conda" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json index 2f9e1c493..54964d880 100644 --- 
a/.devcontainer/cuda12.5-pip/devcontainer.json +++ b/.devcontainer/cuda12.5-pip/devcontainer.json @@ -5,17 +5,17 @@ "args": { "CUDA": "12.5", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.12-cpp-cuda12.5-ubuntu22.04" } }, "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.12-cuda12.5-pip" ], "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9b7efecde..6fa11225e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -37,7 +37,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -46,7 +46,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -56,7 +56,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -68,7 +68,7 @@ jobs: run_script: "ci/build_docs.sh" wheel-build-cpp: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: ${{ inputs.build_type || 'branch' }} @@ -79,7 +79,7 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -89,7 +89,7 @@ jobs: wheel-publish-cpp: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -100,7 +100,7 @@ jobs: wheel-publish-python: needs: wheel-build-python secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.12 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 1160b93e9..6780298c3 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -12,51 +12,91 @@ concurrency: jobs: pr-builder: needs: + - changed-files - checks - conda-cpp-build - conda-cpp-tests - conda-python-build - conda-python-tests - docs-build + - telemetry-setup - wheel-build-cpp - wheel-build-python - wheel-tests - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.12 + if: always() + with: + needs: ${{ toJSON(needs) }} + telemetry-setup: + runs-on: ubuntu-latest + continue-on-error: true + env: + OTEL_SERVICE_NAME: "pr-rmm" + steps: + - name: Telemetry setup + uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main + changed-files: + needs: + - telemetry-setup + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/changed-files.yaml@branch-24.12 + with: + files_yaml: | + test_cpp: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!img/**' + - '!python/**' + test_python: + - '**' + - '!.devcontainer/**' + - '!.pre-commit-config.yaml' + - '!CONTRIBUTING.md' + - '!README.md' + - '!img/**' checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 + needs: + - telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.12 with: enable_check_generated_files: false + ignored_pr_jobs: telemetry-summarize conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.12 with: build_type: pull-request conda-cpp-tests: - needs: conda-cpp-build + needs: [conda-cpp-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.12 with: build_type: pull-request conda-python-tests: - needs: conda-python-build + needs: [conda-python-build, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -66,7 +106,7 @@ jobs: wheel-build-cpp: needs: checks secrets: inherit - uses: 
rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber))) build_type: pull-request @@ -74,20 +114,23 @@ jobs: wheel-build-python: needs: wheel-build-cpp secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.12 with: build_type: pull-request script: ci/build_wheel_python.sh wheel-tests: - needs: wheel-build-python + needs: [wheel-build-python, changed-files] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python with: build_type: pull-request script: ci/test_wheel.sh devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10 + needs: + - telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.12 with: arch: '["amd64"]' cuda: '["12.5"]' @@ -95,3 +138,18 @@ jobs: sccache -z; build-all -DBUILD_BENCHMARKS=ON --verbose; sccache -s; + + telemetry-summarize: + runs-on: ubuntu-latest + needs: pr-builder + if: always() + continue-on-error: true + steps: + - name: Load stashed telemetry env vars + uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main + with: + load_service_name: true + - name: Telemetry summarize + uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main + with: + cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}" diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 13838d888..34a0f746d 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.12 with: build_type: nightly branch: ${{ inputs.branch }} diff --git a/.gitignore b/.gitignore index 2d0b150e1..df9d920d5 100644 --- a/.gitignore +++ b/.gitignore @@ -22,10 +22,13 @@ rmm.egg-info/ python/build python/*/build python/rmm/docs/_build -python/rmm/**/_lib/**/*.cpp -!python/rmm/_lib/_torch_allocator.cpp -python/rmm/**/_lib/**/*.h -python/rmm/**/_lib/.nfs* +python/rmm/**/librmm/**/*.cpp +!python/rmm/librmm/_torch_allocator.cpp +python/rmm/**/librmm/**/*.h +python/rmm/**/librmm/.nfs* +python/rmm/**/pylibrmm/**/*.cpp +python/rmm/**/pylibrmm/**/*.h +python/rmm/**/pylibrmm/.nfs* 
python/rmm/_cuda/*.cpp python/rmm/tests/*.cpp python/rmm/*.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f114abec4..56c972b4e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -82,7 +82,7 @@ repos: - id: verify-copyright - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.13.11 + rev: v1.16.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CHANGELOG.md b/CHANGELOG.md index 1268762b2..817a0c8c2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,48 @@ +# rmm 24.12.00 (11 Dec 2024) + +## 🚨 Breaking Changes + +- Deprecate support for directly accessing logger ([#1690](https://github.com/rapidsai/rmm/pull/1690)) [@vyasr](https://github.com/vyasr) + +## 🐛 Bug Fixes + +- Query total memory in failure_callback_resource_adaptor tests ([#1734](https://github.com/rapidsai/rmm/pull/1734)) [@harrism](https://github.com/harrism) +- Treat deprecation warnings as errors and fix deprecation warnings in replay benchmark ([#1728](https://github.com/rapidsai/rmm/pull/1728)) [@harrism](https://github.com/harrism) +- Disallow cuda-python 12.6.1 and 11.8.4 ([#1720](https://github.com/rapidsai/rmm/pull/1720)) [@bdice](https://github.com/bdice) +- Fix typos in .gitignore ([#1697](https://github.com/rapidsai/rmm/pull/1697)) [@charlesbluca](https://github.com/charlesbluca) +- Fix `rmm ._lib` imports ([#1693](https://github.com/rapidsai/rmm/pull/1693)) [@Matt711](https://github.com/Matt711) + +## 📖 Documentation + +- Fix docs warning ([#1706](https://github.com/rapidsai/rmm/pull/1706)) [@bdice](https://github.com/bdice) +- Update cross-link to cuda-python object ([#1699](https://github.com/rapidsai/rmm/pull/1699)) [@wence-](https://github.com/wence-) + +## 🚀 New Features + +- Correct rmm tests for validity of device pointers ([#1714](https://github.com/rapidsai/rmm/pull/1714)) [@robertmaynard](https://github.com/robertmaynard) +- Update rmm tests to use rapids_cmake_support_conda_env ([#1707](https://github.com/rapidsai/rmm/pull/1707)) [@robertmaynard](https://github.com/robertmaynard) +- adding telemetry ([#1692](https://github.com/rapidsai/rmm/pull/1692)) [@msarahan](https://github.com/msarahan) +- Make `cudaMallocAsync` logic non-optional as we require CUDA 11.2+ ([#1667](https://github.com/rapidsai/rmm/pull/1667)) [@robertmaynard](https://github.com/robertmaynard) + +## 🛠️ Improvements + +- enforce wheel size limits, README formatting in CI ([#1726](https://github.com/rapidsai/rmm/pull/1726)) [@jameslamb](https://github.com/jameslamb) +- Remove all explicit usage of fmtlib ([#1724](https://github.com/rapidsai/rmm/pull/1724)) [@harrism](https://github.com/harrism) +- WIP: put a ceiling on cuda-python ([#1723](https://github.com/rapidsai/rmm/pull/1723)) [@jameslamb](https://github.com/jameslamb) +- use rapids-generate-pip-constraints to pin to oldest dependencies in CI ([#1716](https://github.com/rapidsai/rmm/pull/1716)) [@jameslamb](https://github.com/jameslamb) +- Deprecate `rmm._lib` ([#1713](https://github.com/rapidsai/rmm/pull/1713)) [@Matt711](https://github.com/Matt711) +- print sccache stats in builds ([#1712](https://github.com/rapidsai/rmm/pull/1712)) [@jameslamb](https://github.com/jameslamb) +- [fea] Expose the arena mr to the Python interface. 
([#1711](https://github.com/rapidsai/rmm/pull/1711)) [@trivialfis](https://github.com/trivialfis) +- devcontainer: replace `VAULT_HOST` with `AWS_ROLE_ARN` ([#1708](https://github.com/rapidsai/rmm/pull/1708)) [@jjacobelli](https://github.com/jjacobelli) +- make conda installs in CI stricter (part 2) ([#1703](https://github.com/rapidsai/rmm/pull/1703)) [@jameslamb](https://github.com/jameslamb) +- Add BUILD_SHARED_LIBS option defaulting to ON ([#1702](https://github.com/rapidsai/rmm/pull/1702)) [@wence-](https://github.com/wence-) +- make conda installs in CI stricter ([#1696](https://github.com/rapidsai/rmm/pull/1696)) [@jameslamb](https://github.com/jameslamb) +- Prune workflows based on changed files ([#1695](https://github.com/rapidsai/rmm/pull/1695)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Deprecate support for directly accessing logger ([#1690](https://github.com/rapidsai/rmm/pull/1690)) [@vyasr](https://github.com/vyasr) +- Use `rmm::percent_of_free_device_memory` in arena test ([#1689](https://github.com/rapidsai/rmm/pull/1689)) [@wence-](https://github.com/wence-) +- exclude 'gcovr' from list of development pip packages ([#1688](https://github.com/rapidsai/rmm/pull/1688)) [@jameslamb](https://github.com/jameslamb) +- [Improvement] Reorganize Cython to separate C++ bindings and make Cython classes public ([#1676](https://github.com/rapidsai/rmm/pull/1676)) [@Matt711](https://github.com/Matt711) + # rmm 24.10.00 (9 Oct 2024) ## 🚨 Breaking Changes diff --git a/CMakeLists.txt b/CMakeLists.txt index 39d5dccde..07bd368ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,9 @@ rapids_cmake_build_type(Release) option(RMM_NVTX "Build RMM with NVTX support" OFF) option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF) +# This is mostly so that dependent libraries are configured in shared mode for downstream dependents +# of RMM that get their common dependencies transitively. 
+option(BUILD_SHARED_LIBS "Build RMM shared libraries" ON)
 set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.")
@@ -70,7 +73,6 @@ rapids_find_package(
 # add third party dependencies using CPM
 rapids_cpm_init()
-include(cmake/thirdparty/get_fmt.cmake)
 include(cmake/thirdparty/get_spdlog.cmake)
 include(cmake/thirdparty/get_cccl.cmake)
 include(cmake/thirdparty/get_nvtx.cmake)
@@ -87,13 +89,11 @@ target_include_directories(rmm INTERFACE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-target_link_libraries(rmm INTERFACE fmt::fmt-header-only)
 target_link_libraries(rmm INTERFACE spdlog::spdlog_header_only)
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
-  target_compile_options(${BENCH_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-Wall -Werror
-                                              -Wno-error=deprecated-declarations -Wno-unknown-pragmas>)
+  target_compile_options(${BENCH_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-Wall -Werror
+                                              -Wno-unknown-pragmas>)
   if(DISABLE_DEPRECATION_WARNING)
     target_compile_options(
       ${BENCH_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-Wno-deprecated-declarations>)
diff --git a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
index 86e761c80..b5edbb536 100644
--- a/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
+++ b/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu
@@ -133,9 +133,7 @@ static void benchmark_range(benchmark::internal::Benchmark* bench)
 MRFactoryFunc get_mr_factory(std::string const& resource_name)
 {
   if (resource_name == "cuda") { return &make_cuda; }
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   if (resource_name == "cuda_async") { return &make_cuda_async; }
-#endif
   if (resource_name == "pool") { return &make_pool; }
   if (resource_name == "arena") { return &make_arena; }
   if (resource_name == "binning") { return &make_binning; }
@@ -153,13 +151,11 @@ void declare_benchmark(std::string const& name)
     return;
   }
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
   if (name == "cuda_async") {
     BENCHMARK_CAPTURE(BM_MultiStreamAllocations, cuda_async, &make_cuda_async)  //
       ->Apply(benchmark_range);
     return;
   }
-#endif
   if (name == "pool") {
     BENCHMARK_CAPTURE(BM_MultiStreamAllocations, pool_mr, &make_pool)  //
@@ -248,9 +244,7 @@ int main(int argc, char** argv)
     resource_names.emplace_back(args["resource"].as<std::string>());
   } else {
     resource_names.emplace_back("cuda");
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     resource_names.emplace_back("cuda_async");
-#endif
     resource_names.emplace_back("pool");
     resource_names.emplace_back("arena");
     resource_names.emplace_back("binning");
diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp
index 57116743b..2971f7e40 100644
--- a/benchmarks/random_allocations/random_allocations.cpp
+++ b/benchmarks/random_allocations/random_allocations.cpp
@@ -316,9 +316,7 @@ int main(int argc, char** argv)
   std::map<std::string, MRFactoryFunc> const funcs({{"arena", &make_arena},
                                                     {"binning", &make_binning},
                                                     {"cuda", &make_cuda},
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
                                                     {"cuda_async", &make_cuda_async},
-#endif
                                                     {"pool", &make_pool}});
   auto resource = args["resource"].as<std::string>();
@@ -340,11 +338,7 @@ int main(int argc, char** argv)
     std::string mr_name = args["resource"].as<std::string>();
     declare_benchmark(mr_name);
   } else {
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     std::vector<std::string> mrs{"pool", "binning", "arena", "cuda_async", "cuda"};
-#else
-    std::vector<std::string> mrs{"pool", "binning", "arena", "cuda"};
-#endif
     std::for_each(
       std::cbegin(mrs), std::cend(mrs), [](auto const& mr) { declare_benchmark(mr); });
   }
diff --git a/benchmarks/replay/replay.cpp b/benchmarks/replay/replay.cpp
index 5afed036a..7f45b7691 100644
--- a/benchmarks/replay/replay.cpp
+++ b/benchmarks/replay/replay.cpp
@@ -16,6 +16,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
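// The replay.cpp hunks below move the benchmark off direct spdlog calls and onto
// RMM's logging macros. A minimal before/after sketch (not part of the diff;
// assumes rmm/logger.hpp is on the include path):
#include <rmm/logger.hpp>

void log_benchmark_boundary()
{
  // 24.10 style: direct access to the spdlog logger, now deprecated:
  //   rmm::logger().log(spdlog::level::info, "------ Start of Benchmark -----");
  // 24.12 style: RMM_LOG_* macros, which format printf-style via
  // rmm::detail::formatted_log() before handing spdlog a plain string:
  RMM_LOG_INFO("------ Start of Benchmark -----");
}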
@@ -172,7 +173,7 @@ struct replay_benchmark {
   void SetUp(const ::benchmark::State& state)
   {
     if (state.thread_index() == 0) {
-      rmm::logger().log(spdlog::level::info, "------ Start of Benchmark -----");
+      RMM_LOG_INFO("------ Start of Benchmark -----");
       mr_ = factory_(simulated_size_);
     }
   }
@@ -181,7 +182,7 @@ struct replay_benchmark {
   void TearDown(const ::benchmark::State& state)
   {
     if (state.thread_index() == 0) {
-      rmm::logger().log(spdlog::level::info, "------ End of Benchmark -----");
+      RMM_LOG_INFO("------ End of Benchmark -----");
       // clean up any leaked allocations
       std::size_t total_leaked{0};
       std::size_t num_leaked{0};
@@ -402,7 +403,7 @@ int main(int argc, char** argv)
   auto const num_threads = per_thread_events.size();
   // Uncomment to enable / change default log level
-  // rmm::logger().set_level(spdlog::level::trace);
+  // rmm::detail::logger().set_level(spdlog::level::trace);
   if (args.count("resource") > 0) {
     std::string mr_name = args["resource"].as<std::string>();
diff --git a/benchmarks/utilities/log_parser.hpp b/benchmarks/utilities/log_parser.hpp
index 2283ace93..4dfa5bae4 100644
--- a/benchmarks/utilities/log_parser.hpp
+++ b/benchmarks/utilities/log_parser.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -151,7 +151,7 @@ inline std::vector<event> parse_csv(std::string const& filename)
   auto parse_pointer = [](std::string const& str, uintptr_t& ptr) {
     auto const base{16};
-    ptr = std::stoll(str, nullptr, base);
+    ptr = (str == "(nil)") ? 0 : std::stoll(str, nullptr, base);
   };
   std::vector<uintptr_t> pointers = csv.GetColumn<uintptr_t>("Pointer", parse_pointer);
diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index a601ecaae..9d14cd072 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -15,7 +15,11 @@ rapids-print-env
 rapids-logger "Begin cpp build"
+sccache --zero-stats
+
 # This calls mambabuild when boa is installed (as is the case in the CI images)
 RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild conda/recipes/librmm
+sccache --show-adv-stats
+
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 970417c1d..844dae1c6 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -6,6 +6,9 @@ set -euo pipefail
 rapids-logger "Create test conda environment"
 . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" +export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" + rapids-dependency-file-generator \ --output conda \ --file-key docs \ @@ -23,11 +26,9 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - rmm librmm + "rmm=${RAPIDS_VERSION}" \ + "librmm=${RAPIDS_VERSION}" -export RAPIDS_VERSION="$(rapids-version)" -export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" -export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR" export RAPIDS_DOCS_DIR="$(mktemp -d)" rapids-logger "Build CPP docs" @@ -44,4 +45,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/rmm/html" mv _build/dirhtml/* "${RAPIDS_DOCS_DIR}/rmm/html" popd -rapids-upload-docs +RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs diff --git a/ci/build_python.sh b/ci/build_python.sh index fcd2c55e7..7a9df5fc7 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -19,7 +19,11 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +sccache --zero-stats + # This calls mambabuild when boa is installed (as is the case in the CI images) RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild -c "${CPP_CHANNEL}" conda/recipes/rmm +sccache --show-adv-stats + rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh index 2c5cc0560..1ec979372 100755 --- a/ci/build_wheel_cpp.sh +++ b/ci/build_wheel_cpp.sh @@ -14,7 +14,15 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" cd "${package_dir}" -python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check +sccache --zero-stats + +python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats + python -m pip install wheel python -m wheel tags --platform any dist/* --remove + +../../ci/validate_wheel.sh dist + RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp dist diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh index 555974b50..4e4d3bf61 100755 --- a/ci/build_wheel_python.sh +++ b/ci/build_wheel_python.sh @@ -22,12 +22,18 @@ CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-down # are used when created the isolated build environment echo "librmm-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo ${CPP_WHEELHOUSE}/librmm_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" > ./build-constraints.txt +sccache --zero-stats + PIP_CONSTRAINT="${PWD}/build-constraints.txt" \ - python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check + python -m pip wheel . -w dist -v --no-deps --disable-pip-version-check + +sccache --show-adv-stats mkdir -p final_dist python -m auditwheel repair -w final_dist dist/* +../../ci/validate_wheel.sh final_dist + RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python final_dist # switch back to the root of the repo and check symbol visibility diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 9ad1c9536..975477a6e 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -8,6 +8,8 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ . 
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate C++ testing dependencies" rapids-dependency-file-generator \ --output conda \ @@ -29,7 +31,8 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ - librmm librmm-tests + "librmm=${RAPIDS_VERSION}" \ + "librmm-tests=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi diff --git a/ci/test_python.sh b/ci/test_python.sh index 386d0b063..51d0a48c3 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -9,6 +9,8 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ rapids-logger "Create test conda environment" . /opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-dependency-file-generator \ --output conda \ --file-key test_python \ @@ -28,7 +30,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - rmm librmm + "rmm=${RAPIDS_VERSION}" \ + "librmm=${RAPIDS_VERSION}" RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index d06c4eed0..2f39b197b 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -7,15 +7,8 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" WHEELHOUSE="${PWD}/dist/" RAPIDS_PY_WHEEL_NAME="rmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}" -# Constraint to minimum dependency versions if job is set up as "oldest" -echo "" > ./constraints.txt -if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then - rapids-dependency-file-generator \ - --output requirements \ - --file-key test_python \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ - | tee ./constraints.txt -fi +# generate constraints (possibly pinning to oldest support versions of dependencies) +rapids-generate-pip-constraints test_python ./constraints.txt # echo to expand wildcard before adding '[extra]' requires for pip python -m pip install \ diff --git a/ci/validate_wheel.sh b/ci/validate_wheel.sh new file mode 100755 index 000000000..60a80fce6 --- /dev/null +++ b/ci/validate_wheel.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +wheel_dir_relative_path=$1 + +rapids-logger "validate packages with 'pydistcheck'" + +pydistcheck \ + --inspect \ + "$(echo ${wheel_dir_relative_path}/*.whl)" + +rapids-logger "validate packages with 'twine'" + +twine check \ + --strict \ + "$(echo ${wheel_dir_relative_path}/*.whl)" diff --git a/cmake/thirdparty/get_fmt.cmake b/cmake/thirdparty/get_fmt.cmake deleted file mode 100644 index 5787fb73f..000000000 --- a/cmake/thirdparty/get_fmt.cmake +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -# Use CPM to find or clone fmt -function(find_and_configure_fmt) - - include(${rapids-cmake-dir}/cpm/fmt.cmake) - rapids_cpm_fmt(INSTALL_EXPORT_SET rmm-exports BUILD_EXPORT_SET rmm-exports) -endfunction() - -find_and_configure_fmt() diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index bf64d4d55..519c056b5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -10,7 +10,7 @@ dependencies: - clang-tools==16.0.6 - clang==16.0.6 - cmake>=3.26.4,!=3.30.0 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-version=11.8 - cudatoolkit - cxx-compiler diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 112c635a8..86e887c21 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -11,7 +11,7 @@ dependencies: - clang==16.0.6 - cmake>=3.26.4,!=3.30.0 - cuda-nvcc -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-version=12.5 - cxx-compiler - cython>=3.0.0 diff --git a/conda/recipes/rmm/meta.yaml b/conda/recipes/rmm/meta.yaml index fcc7424fa..8f6e13fd7 100644 --- a/conda/recipes/rmm/meta.yaml +++ b/conda/recipes/rmm/meta.yaml @@ -38,6 +38,7 @@ build: - {{ compiler('cuda') }} - cuda-cudart-dev {% endif %} + - cuda-python requirements: build: @@ -56,10 +57,10 @@ requirements: - cuda-version ={{ cuda_version }} {% if cuda_major == "11" %} - cudatoolkit - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart-dev - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - cython >=3.0.0 - rapids-build-backend >=0.3.0,<0.4.0.dev0 @@ -69,12 +70,15 @@ requirements: run: {% if cuda_major == "11" %} - cudatoolkit + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - numba >=0.57 - numpy >=1.23,<3.0a0 + - python test: imports: diff --git a/dependencies.yaml b/dependencies.yaml index 483c21e61..3e2c2eb29 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -153,25 +153,25 @@ dependencies: - matrix: cuda: "12.*" packages: - - &cuda_python12 cuda-python>=12.0,<13.0a0 + - &cuda_python12 cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: # All CUDA 11 versions packages: - - &cuda_python11 cuda-python>=11.7.1,<12.0a0 + - &cuda_python11 cuda-python>=11.7.1,<12.0a0,<=11.8.3 - output_types: [requirements, pyproject] matrices: - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - - librmm-cu12==24.10.*,>=0.0.0a0 + - librmm-cu12==24.12.*,>=0.0.0a0 - matrix: cuda: "11.*" cuda_suffixed: "true" packages: - - librmm-cu11==24.10.*,>=0.0.0a0 + - librmm-cu11==24.12.*,>=0.0.0a0 - matrix: null packages: - - librmm==24.10.*,>=0.0.0a0 + - librmm==24.12.*,>=0.0.0a0 checks: common: - output_types: [conda, requirements] @@ -232,13 +232,11 @@ dependencies: packages: develop: common: - - output_types: [conda, requirements] - packages: - - gcovr>=5.0 - output_types: conda packages: - clang==16.0.6 - clang-tools==16.0.6 + - gcovr>=5.0 docs: common: - output_types: conda diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp deleted file mode 100644 index 214228752..000000000 --- 
a/include/rmm/detail/dynamic_load_runtime.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-#include
-
-#include
-
-#include
-
-#include
-#include
-
-namespace RMM_NAMESPACE {
-namespace detail {
-
-/**
- * @brief `dynamic_load_runtime` loads the cuda runtime library at runtime
- *
- * By loading the cudart library at runtime we can use functions that
- * are added in newer minor versions of the cuda runtime.
- */
-struct dynamic_load_runtime {
-  static void* get_cuda_runtime_handle()
-  {
-    auto close_cudart = [](void* handle) { ::dlclose(handle); };
-    auto open_cudart  = []() {
-      ::dlerror();
-      const int major = CUDART_VERSION / 1000;
-
-      // In CUDA 12 the SONAME is correctly defined as libcudart.12, but for
-      // CUDA<=11 it includes an extra 0 minor version e.g. libcudart.11.0. We
-      // also allow finding the linker name.
-      const std::string libname_ver_cuda_11 = "libcudart.so." + std::to_string(major) + ".0";
-      const std::string libname_ver_cuda_12 = "libcudart.so." + std::to_string(major);
-      const std::string libname             = "libcudart.so";
-
-      void* ptr = nullptr;
-      for (auto&& name : {libname_ver_cuda_12, libname_ver_cuda_11, libname}) {
-        ptr = dlopen(name.c_str(), RTLD_LAZY);
-        if (ptr != nullptr) break;
-      }
-
-      if (ptr != nullptr) { return ptr; }
-
-      RMM_FAIL("Unable to dlopen cudart");
-    };
-    static std::unique_ptr<void, decltype(close_cudart)> cudart_handle{open_cudart(), close_cudart};
-    return cudart_handle.get();
-  }
-
-  template <typename... Args>
-  using function_sig = std::add_pointer_t<cudaError_t(Args...)>;
-
-  template <typename signature>
-  static std::optional<signature> function(const char* func_name)
-  {
-    auto* runtime = get_cuda_runtime_handle();
-    auto* handle  = ::dlsym(runtime, func_name);
-    if (!handle) { return std::nullopt; }
-    auto* function_ptr = reinterpret_cast<signature>(handle);
-    return std::optional<signature>(function_ptr);
-  }
-};
-
-#if defined(RMM_STATIC_CUDART)
-// clang-format off
-#define RMM_CUDART_API_WRAPPER(name, signature)                         \
-  template <typename... Args>                                           \
-  static cudaError_t name(Args... args)                                 \
-  {                                                                     \
-    _Pragma("GCC diagnostic push")                                      \
-    _Pragma("GCC diagnostic ignored \"-Waddress\"")                     \
-    static_assert(static_cast<signature>(::name),                       \
-                  "Failed to find #name function with arguments #signature"); \
-    _Pragma("GCC diagnostic pop")                                       \
-    return ::name(args...);                                             \
-  }
-// clang-format on
-#else
-#define RMM_CUDART_API_WRAPPER(name, signature)                             \
-  template <typename... Args>                                               \
-  static cudaError_t name(Args... args)                                     \
-  {                                                                         \
-    static const auto func = dynamic_load_runtime::function<signature>(#name); \
-    if (func) { return (*func)(args...); }                                  \
-    RMM_FAIL("Failed to find #name function in libcudart.so");              \
-  }
-#endif
-
-#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
-/**
- * @brief Bind to the stream-ordered memory allocator functions
- * at runtime.
- *
- * This allows RMM users to compile/link against CUDA 11.2+ and run with
- * < CUDA 11.2 runtime as these functions are found at call time.
- */
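// For reference, the header being deleted here resolved cudaMallocAsync-family
// entry points with dlopen/dlsym so a binary built against CUDA 11.2+ headers
// could still start on an older libcudart. A generic sketch of that technique
// (illustrative only; the helper name is hypothetical):
#include <cuda_runtime_api.h>
#include <dlfcn.h>
#include <optional>

using cudaFreeAsync_fn = cudaError_t (*)(void*, cudaStream_t);

inline std::optional<cudaFreeAsync_fn> try_load_cuda_free_async()
{
  void* cudart = ::dlopen("libcudart.so", RTLD_LAZY);  // find the runtime library
  if (cudart == nullptr) { return std::nullopt; }
  void* sym = ::dlsym(cudart, "cudaFreeAsync");        // look the symbol up by name
  if (sym == nullptr) { return std::nullopt; }         // older runtime: symbol absent
  return reinterpret_cast<cudaFreeAsync_fn>(sym);
}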
-struct async_alloc {
-  static bool is_supported()
-  {
-#if defined(RMM_STATIC_CUDART)
-    static bool runtime_supports_pool = (CUDART_VERSION >= 11020);
-#else
-    static bool runtime_supports_pool =
-      dynamic_load_runtime::function<dynamic_load_runtime::function_sig<void*, cudaStream_t>>(
-        "cudaFreeAsync")
-        .has_value();
-#endif
-
-    static auto driver_supports_pool{[] {
-      int cuda_pool_supported{};
-      auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
-                                           cudaDevAttrMemoryPoolsSupported,
-                                           rmm::get_current_cuda_device().value());
-      return result == cudaSuccess and cuda_pool_supported == 1;
-    }()};
-    return runtime_supports_pool and driver_supports_pool;
-  }
-
-  /**
-   * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present
-   * CUDA driver/runtime version.
-   *
-   * @note This query was introduced in CUDA 11.3 so on CUDA 11.2 this function will only return
-   * true for `cudaMemHandleTypeNone`.
-   *
-   * @param handle_type An IPC export handle type to check for support.
-   * @return true if supported
-   * @return false if unsupported
-   */
-  static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type)
-  {
-    int supported_handle_types_bitmask{};
-#if CUDART_VERSION >= 11030  // 11.3 introduced cudaDevAttrMemoryPoolSupportedHandleTypes
-    if (cudaMemHandleTypeNone != handle_type) {
-      auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
-                                                 cudaDevAttrMemoryPoolSupportedHandleTypes,
-                                                 rmm::get_current_cuda_device().value());
-
-      // Don't throw on cudaErrorInvalidValue
-      auto const unsupported_runtime = (result == cudaErrorInvalidValue);
-      if (unsupported_runtime) return false;
-      // throw any other error that may have occurred
-      RMM_CUDA_TRY(result);
-    }
-
-#endif
-    return (supported_handle_types_bitmask & handle_type) == handle_type;
-  }
-
-  template <typename... Args>
-  using cudart_sig = dynamic_load_runtime::function_sig<Args...>;
-
-  using cudaMemPoolCreate_sig = cudart_sig<cudaMemPool_t*, const cudaMemPoolProps*>;
-  RMM_CUDART_API_WRAPPER(cudaMemPoolCreate, cudaMemPoolCreate_sig);
-
-  using cudaMemPoolSetAttribute_sig = cudart_sig<cudaMemPool_t, cudaMemPoolAttr, void*>;
-  RMM_CUDART_API_WRAPPER(cudaMemPoolSetAttribute, cudaMemPoolSetAttribute_sig);
-
-  using cudaMemPoolDestroy_sig = cudart_sig<cudaMemPool_t>;
-  RMM_CUDART_API_WRAPPER(cudaMemPoolDestroy, cudaMemPoolDestroy_sig);
-
-  using cudaMallocFromPoolAsync_sig = cudart_sig<void**, size_t, cudaMemPool_t, cudaStream_t>;
-  RMM_CUDART_API_WRAPPER(cudaMallocFromPoolAsync, cudaMallocFromPoolAsync_sig);
-
-  using cudaFreeAsync_sig = cudart_sig<void*, cudaStream_t>;
-  RMM_CUDART_API_WRAPPER(cudaFreeAsync, cudaFreeAsync_sig);
-
-  using cudaDeviceGetDefaultMemPool_sig = cudart_sig<cudaMemPool_t*, int>;
-  RMM_CUDART_API_WRAPPER(cudaDeviceGetDefaultMemPool, cudaDeviceGetDefaultMemPool_sig);
-};
-#endif
-
-#undef RMM_CUDART_API_WRAPPER
-}  // namespace detail
-}  // namespace RMM_NAMESPACE
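// Usage sketch for the formatted_log()/format_bytes() helpers introduced in the
// new header just below (illustrative calls, not part of the diff):
#include <rmm/detail/format.hpp>
#include <cstddef>
#include <iostream>
#include <string>

int main()
{
  std::string mr_name{"pool_memory_resource"};
  // std::string arguments are converted to char const* before std::snprintf runs:
  std::cout << rmm::detail::formatted_log("allocate(%zu) from %s", std::size_t{256}, mr_name)
            << '\n';
  // Prints "2.000000 GiB": the helper divides by 1024 until the value fits a unit.
  std::cout << rmm::detail::format_bytes(std::size_t{2} << 30) << '\n';
}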
diff --git a/include/rmm/detail/format.hpp b/include/rmm/detail/format.hpp
new file mode 100644
index 000000000..21acac032
--- /dev/null
+++ b/include/rmm/detail/format.hpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace RMM_NAMESPACE {
+namespace detail {
+
+/**
+ * @brief Format a message string with printf-style formatting
+ *
+ * This function performs printf-style formatting to avoid the need for fmt
+ * or spdlog's own templated APIs (which would require exposing spdlog
+ * symbols publicly) and returns the formatted message as a `std::string`.
+ *
+ * @param format The format string
+ * @param args The format arguments
+ * @return The formatted message
+ * @throw rmm::logic_error if an error occurs during formatting
+ */
+template <typename... Args>
+std::string formatted_log(std::string const& format, Args&&... args)
+{
+  auto convert_to_c_string = [](auto&& arg) -> decltype(auto) {
+    using ArgType = std::decay_t<decltype(arg)>;
+    if constexpr (std::is_same_v<ArgType, std::string>) {
+      return arg.c_str();
+    } else {
+      return std::forward<decltype(arg)>(arg);
+    }
+  };
+
+  // NOLINTBEGIN(cppcoreguidelines-pro-type-vararg)
+  auto retsize =
+    std::snprintf(nullptr, 0, format.c_str(), convert_to_c_string(std::forward<Args>(args))...);
+  RMM_EXPECTS(retsize >= 0, "Error during formatting.");
+  if (retsize == 0) { return {}; }
+  auto size = static_cast<std::size_t>(retsize) + 1;  // for null terminator
+  // NOLINTNEXTLINE(modernize-avoid-c-arrays, cppcoreguidelines-avoid-c-arrays)
+  std::unique_ptr<char[]> buf(new char[size]);
+  std::snprintf(buf.get(), size, format.c_str(), convert_to_c_string(std::forward<Args>(args))...);
+  // NOLINTEND(cppcoreguidelines-pro-type-vararg)
+  return {buf.get(), buf.get() + size - 1};  // drop '\0'
+}
+
+// specialization for no arguments
+template <>
+inline std::string formatted_log(std::string const& format)
+{
+  return format;
+}
+
+// Stringify a size in bytes to a human-readable value
+inline std::string format_bytes(std::size_t value)
+{
+  static std::array units{"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"};
+
+  int index = 0;
+  auto size = static_cast<double>(value);
+  while (size > 1024) {
+    size /= 1024;
+    index++;
+  }
+
+  return std::to_string(size) + ' ' + units.at(index);
+}
+
+// Stringify a stream ID
+inline std::string format_stream(rmm::cuda_stream_view stream)
+{
+  std::stringstream sstr{};
+  sstr << std::hex << stream.value();
+  return sstr.str();
+}
+
+}  // namespace detail
+}  // namespace RMM_NAMESPACE
diff --git a/include/rmm/detail/logging_assert.hpp b/include/rmm/detail/logging_assert.hpp
index 7eb667211..4d702ee2b 100644
--- a/include/rmm/detail/logging_assert.hpp
+++ b/include/rmm/detail/logging_assert.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@
     if (!success) {                                                                              \
       RMM_LOG_CRITICAL(                                                                          \
         "[" __FILE__ ":" RMM_STRINGIFY(__LINE__) "] Assertion " RMM_STRINGIFY(_expr) " failed."); \
-      rmm::logger().flush();                                                                     \
+      rmm::detail::logger().flush();                                                             \
       /* NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) */                  \
       assert(success);                                                                           \
     }                                                                                            \
diff --git a/include/rmm/detail/runtime_async_alloc.hpp b/include/rmm/detail/runtime_async_alloc.hpp
new file mode 100644
index 000000000..6ddb2228b
--- /dev/null
+++ b/include/rmm/detail/runtime_async_alloc.hpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +#include +#include + +namespace RMM_NAMESPACE { +namespace detail { + +/** + * @brief Determine at runtime if the CUDA driver supports the stream-ordered + * memory allocator functions. + * + * This allows RMM users to compile/link against CUDA 11.2+ and run with + * older drivers. + */ + +struct runtime_async_alloc { + static bool is_supported() + { + static auto driver_supports_pool{[] { + int cuda_pool_supported{}; + auto result = cudaDeviceGetAttribute(&cuda_pool_supported, + cudaDevAttrMemoryPoolsSupported, + rmm::get_current_cuda_device().value()); + return result == cudaSuccess and cuda_pool_supported == 1; + }()}; + return driver_supports_pool; + } + + /** + * @brief Check whether the specified `cudaMemAllocationHandleType` is supported on the present + * CUDA driver/runtime version. + * + * @param handle_type An IPC export handle type to check for support. + * @return true if supported + * @return false if unsupported + */ + static bool is_export_handle_type_supported(cudaMemAllocationHandleType handle_type) + { + int supported_handle_types_bitmask{}; + if (cudaMemHandleTypeNone != handle_type) { + auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask, + cudaDevAttrMemoryPoolSupportedHandleTypes, + rmm::get_current_cuda_device().value()); + + // Don't throw on cudaErrorInvalidValue + auto const unsupported_runtime = (result == cudaErrorInvalidValue); + if (unsupported_runtime) return false; + // throw any other error that may have occurred + RMM_CUDA_TRY(result); + } + return (supported_handle_types_bitmask & handle_type) == handle_type; + } +}; + +} // namespace detail +} // namespace RMM_NAMESPACE diff --git a/include/rmm/logger.hpp b/include/rmm/logger.hpp index 326385f16..2cfd921b1 100644 --- a/include/rmm/logger.hpp +++ b/include/rmm/logger.hpp @@ -16,15 +16,13 @@ #pragma once +#include #include +#include -#include -#include #include #include -#include -#include #include namespace RMM_NAMESPACE { @@ -70,32 +68,11 @@ struct logger_wrapper { } }; -/** - * @brief Represent a size in number of bytes. - */ -struct bytes { - std::size_t value; ///< The size in bytes - - /** - * @brief Construct a new bytes object - * - * @param os The output stream - * @param value The size in bytes - */ - friend std::ostream& operator<<(std::ostream& os, bytes const& value) - { - static std::array units{"B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB"}; - - int index = 0; - auto size = static_cast(value.value); - while (size > 1024) { - size /= 1024; - index++; - } - return os << size << ' ' << units.at(index); - } -}; - +inline spdlog::logger& logger() +{ + static detail::logger_wrapper wrapped{}; + return wrapped.logger_; +} } // namespace detail /** @@ -107,10 +84,12 @@ struct bytes { * * @return spdlog::logger& The logger. 
*/
-RMM_EXPORT inline spdlog::logger& logger()
+[[deprecated(
+  "Support for direct access to spdlog loggers in rmm is planned for "
+  "removal")]] RMM_EXPORT inline spdlog::logger&
+logger()
 {
-  static detail::logger_wrapper wrapped{};
-  return wrapped.logger_;
+  return detail::logger();
 }
 //! @cond Doxygen_Suppress
 // The default is INFO, but it should be used sparingly, so that by default a log file is only
 // output if there is important information, warnings, errors, and critical failures
 // Log messages that require computation should only be used at level TRACE and DEBUG
-#define RMM_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&rmm::logger(), __VA_ARGS__)
-#define RMM_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&rmm::logger(), __VA_ARGS__)
-#define RMM_LOG_INFO(...) SPDLOG_LOGGER_INFO(&rmm::logger(), __VA_ARGS__)
-#define RMM_LOG_WARN(...) SPDLOG_LOGGER_WARN(&rmm::logger(), __VA_ARGS__)
-#define RMM_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&rmm::logger(), __VA_ARGS__)
-#define RMM_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&rmm::logger(), __VA_ARGS__)
+#define RMM_LOG_TRACE(...) \
+  SPDLOG_LOGGER_TRACE(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__))
+#define RMM_LOG_DEBUG(...) \
+  SPDLOG_LOGGER_DEBUG(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__))
+#define RMM_LOG_INFO(...) \
+  SPDLOG_LOGGER_INFO(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__))
+#define RMM_LOG_WARN(...) \
+  SPDLOG_LOGGER_WARN(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__))
+#define RMM_LOG_ERROR(...) \
+  SPDLOG_LOGGER_ERROR(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__))
+#define RMM_LOG_CRITICAL(...) \
+  SPDLOG_LOGGER_CRITICAL(&rmm::detail::logger(), rmm::detail::formatted_log(__VA_ARGS__))
 //! @endcond
 } // namespace RMM_NAMESPACE
-// Doxygen doesn't like this because we're overloading something from fmt
-//! @cond Doxygen_Suppress
-template <>
-struct fmt::formatter<rmm::detail::bytes> : fmt::ostream_formatter {};
-
 //! @endcond
diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp
index 417b7d2b4..d3a4bb09d 100644
--- a/include/rmm/mr/device/arena_memory_resource.hpp
+++ b/include/rmm/mr/device/arena_memory_resource.hpp
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <rmm/detail/format.hpp>
 #include
 #include
 #include
@@ -97,7 +98,10 @@ class arena_memory_resource final : public device_memory_resource {
     : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure}
   {
     if (dump_log_on_failure_) {
-      logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
+      logger_ =
+        std::make_shared<spdlog::logger>("arena_memory_dump",
+                                         std::make_shared<spdlog::sinks::basic_file_sink_mt>(
+                                           "rmm_arena_memory_dump.log", true /*truncate file*/));
       // Set the level to `debug` for more detailed output.
       logger_->set_level(spdlog::level::info);
     }
@@ -120,7 +124,10 @@ class arena_memory_resource final : public device_memory_resource {
       dump_log_on_failure_{dump_log_on_failure}
   {
     if (dump_log_on_failure_) {
-      logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log");
+      logger_ =
+        std::make_shared<spdlog::logger>("arena_memory_dump",
+                                         std::make_shared<spdlog::sinks::basic_file_sink_mt>(
+                                           "rmm_arena_memory_dump.log", true /*truncate file*/));
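// The constructor hunks above replace spdlog::basic_logger_mt(), which registers
// the logger in spdlog's global registry (and throws on a duplicate name), with a
// hand-built, unregistered logger whose file sink truncates on open. The same
// pattern in isolation (assumes spdlog; the function name is hypothetical):
#include <spdlog/logger.h>
#include <spdlog/sinks/basic_file_sink.h>
#include <memory>

std::shared_ptr<spdlog::logger> make_dump_logger()
{
  auto sink = std::make_shared<spdlog::sinks::basic_file_sink_mt>(
    "rmm_arena_memory_dump.log", /*truncate=*/true);
  // Never passed to spdlog::register_logger(), so constructing a second arena
  // resource cannot collide on the logger name.
  return std::make_shared<spdlog::logger>("arena_memory_dump", std::move(sink));
}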
      // Set the level to `debug` for more detailed output.
      logger_->set_level(spdlog::level::info);
    }
@@ -329,7 +336,8 @@ class arena_memory_resource final : public device_memory_resource {
   void dump_memory_log(size_t bytes)
   {
     logger_->info("**************************************************");
-    logger_->info("Ran out of memory trying to allocate {}.", rmm::detail::bytes{bytes});
+    logger_->info(rmm::detail::formatted_log("Ran out of memory trying to allocate %s.",
+                                             rmm::detail::format_bytes(bytes)));
     logger_->info("**************************************************");
     logger_->info("Global arena:");
     global_arena_.dump_memory_log(logger_);
diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp
index 52fd2fe4e..b1fc0b112 100644
--- a/include/rmm/mr/device/cuda_async_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -17,9 +17,9 @@
 #include
 #include
-#include <rmm/detail/dynamic_load_runtime.hpp>
 #include
 #include
+#include <rmm/detail/runtime_async_alloc.hpp>
 #include
 #include
 #include
@@ -31,12 +31,6 @@
 #include
 #include
-#if CUDART_VERSION >= 11020  // 11.2 introduced cudaMallocAsync
-#ifndef RMM_DISABLE_CUDA_MALLOC_ASYNC
-#define RMM_CUDA_MALLOC_ASYNC_SUPPORT
-#endif
-#endif
-
 namespace RMM_NAMESPACE {
 namespace mr {
 /**
@@ -91,9 +85,8 @@ class cuda_async_memory_resource final : public device_memory_resource {
     std::optional<std::size_t> release_threshold = {},
     std::optional<allocation_handle_type> export_handle_type = {})
   {
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
     // Check if cudaMallocAsync Memory pool supported
-    RMM_EXPECTS(rmm::detail::async_alloc::is_supported(),
+    RMM_EXPECTS(rmm::detail::runtime_async_alloc::is_supported(),
                 "cudaMallocAsync not supported with this CUDA driver/runtime version");
     // Construct explicit pool
     cudaMemPoolProps pool_props{};
     pool_props.allocType = cudaMemAllocationTypePinned;
     pool_props.handleTypes = static_cast<cudaMemAllocationHandleType>(
       export_handle_type.value_or(allocation_handle_type::none));
-    RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
-                "Requested IPC memory handle type not supported");
+    RMM_EXPECTS(
+      rmm::detail::runtime_async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
+      "Requested IPC memory handle type not supported");
     pool_props.location.type = cudaMemLocationTypeDevice;
     pool_props.location.id = rmm::get_current_cuda_device().value();
     cudaMemPool_t cuda_pool_handle{};
-    RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
+    RMM_CUDA_TRY(cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
     pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
     // CUDA drivers before 11.5 have known incompatibilities with the async allocator.
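// With the RMM_CUDA_MALLOC_ASYNC_SUPPORT guards gone, cuda_async_memory_resource
// is always compiled in and support is probed at runtime instead. A usage sketch
// (illustrative; requires a CUDA 11.2+ driver at runtime):
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>

void async_mr_roundtrip()
{
  // Throws rmm::logic_error on construction if the driver lacks cudaMallocAsync
  // support, rather than failing at compile time against an older toolkit.
  rmm::mr::cuda_async_memory_resource mr{};
  void* ptr = mr.allocate(1024, rmm::cuda_stream_default);
  mr.deallocate(ptr, 1024, rmm::cuda_stream_default);
}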
@@ -117,41 +111,34 @@ class cuda_async_memory_resource final : public device_memory_resource { constexpr auto min_async_version{11050}; if (driver_version < min_async_version) { int disabled{0}; - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute( - pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled)); + RMM_CUDA_TRY( + cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolReuseAllowOpportunistic, &disabled)); } auto const [free, total] = rmm::available_device_memory(); // Need an l-value to take address to pass to cudaMemPoolSetAttribute uint64_t threshold = release_threshold.value_or(total); - RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolSetAttribute( - pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold)); + RMM_CUDA_TRY( + cudaMemPoolSetAttribute(pool_handle(), cudaMemPoolAttrReleaseThreshold, &threshold)); // Allocate and immediately deallocate the initial_pool_size to prime the pool with the // specified size auto const pool_size = initial_pool_size.value_or(free / 2); auto* ptr = do_allocate(pool_size, cuda_stream_default); do_deallocate(ptr, pool_size, cuda_stream_default); -#else - RMM_FAIL( - "cudaMallocAsync not supported by the version of the CUDA Toolkit used for this build"); -#endif } -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Returns the underlying native handle to the CUDA pool * + * @return cudaMemPool_t Handle to the underlying CUDA pool */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return pool_.pool_handle(); } -#endif ~cuda_async_memory_resource() override { -#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT) - RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaMemPoolDestroy(pool_handle())); -#endif + RMM_ASSERT_CUDA_SUCCESS(cudaMemPoolDestroy(pool_handle())); } cuda_async_memory_resource(cuda_async_memory_resource const&) = delete; cuda_async_memory_resource(cuda_async_memory_resource&&) = delete; @@ -159,9 +146,7 @@ class cuda_async_memory_resource final : public device_memory_resource { cuda_async_memory_resource& operator=(cuda_async_memory_resource&&) = delete; private: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT cuda_async_view_memory_resource pool_{}; -#endif /** * @brief Allocates memory of size at least \p bytes. 
@@ -175,12 +160,7 @@ class cuda_async_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { void* ptr{nullptr}; -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT ptr = pool_.allocate(bytes, stream); -#else - (void)bytes; - (void)stream; -#endif return ptr; } @@ -194,13 +174,7 @@ class cuda_async_memory_resource final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT pool_.deallocate(ptr, bytes, stream); -#else - (void)ptr; - (void)bytes; - (void)stream; -#endif } /** @@ -213,11 +187,7 @@ class cuda_async_memory_resource final : public device_memory_resource { [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override { auto const* async_mr = dynamic_cast<cuda_async_memory_resource const*>(&other); -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT return (async_mr != nullptr) && (this->pool_handle() == async_mr->pool_handle()); -#else - return async_mr != nullptr; -#endif } }; diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp index 3e1900e72..180c412ee 100644 --- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp +++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -28,10 +27,6 @@ #include #include -#if CUDART_VERSION >= 11020 // 11.2 introduced cudaMallocAsync -#define RMM_CUDA_MALLOC_ASYNC_SUPPORT -#endif - namespace RMM_NAMESPACE { namespace mr { /** @@ -46,13 +41,12 @@ namespace mr { */ class cuda_async_view_memory_resource final : public device_memory_resource { public: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Constructs a cuda_async_view_memory_resource which uses an existing CUDA memory pool. * The provided pool is not owned by cuda_async_view_memory_resource and must remain valid * during the lifetime of the memory resource. * - * @throws rmm::runtime_error if the CUDA version does not support `cudaMallocAsync` + * @throws rmm::logic_error if the CUDA version does not support `cudaMallocAsync` * * @param valid_pool_handle Handle to a CUDA memory pool which will be used to * serve allocation requests. @@ -71,15 +65,13 @@ class cuda_async_view_memory_resource final : public device_memory_resource { RMM_EXPECTS(result == cudaSuccess && cuda_pool_supported, "cudaMallocAsync not supported with this CUDA driver/runtime version"); } -#endif -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT /** * @brief Returns the underlying native handle to the CUDA pool * + * @return cudaMemPool_t Handle to the underlying CUDA pool */ [[nodiscard]] cudaMemPool_t pool_handle() const noexcept { return cuda_pool_handle_; } -#endif cuda_async_view_memory_resource() = default; cuda_async_view_memory_resource(cuda_async_view_memory_resource const&) = @@ -92,9 +84,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { default; ///< @default_move_assignment{cuda_async_view_memory_resource} private: -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT cudaMemPool_t cuda_pool_handle_{}; -#endif /** * @brief Allocates memory of size at least \p bytes. 
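With the `RMM_CUDA_MALLOC_ASYNC_SUPPORT` compile-time gate removed, `cudaMallocAsync` support is now verified only at runtime, when the resource is constructed. A minimal sketch of the caller-visible behavior through RMM's public Python wrapper (the 256 MiB pool size is an arbitrary example value):

```python
import rmm

try:
    # Construction performs the driver/runtime support check that the C++
    # constructor above now does unconditionally at runtime.
    mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=256 * 1024 * 1024)
    rmm.mr.set_current_device_resource(mr)
except RuntimeError as err:
    # e.g. "cudaMallocAsync not supported with this CUDA driver/runtime version"
    print(f"Async pool unavailable: {err}")

buf = rmm.DeviceBuffer(size=1024)  # served by the async pool if it was installed
```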
@@ -108,15 +98,9 @@ class cuda_async_view_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { void* ptr{nullptr}; -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT if (bytes > 0) { - RMM_CUDA_TRY_ALLOC(rmm::detail::async_alloc::cudaMallocFromPoolAsync( - &ptr, bytes, pool_handle(), stream.value())); + RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value())); } -#else - (void)bytes; - (void)stream; -#endif return ptr; } @@ -132,15 +116,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource { [[maybe_unused]] std::size_t bytes, rmm::cuda_stream_view stream) override { -#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT - if (ptr != nullptr) { - RMM_ASSERT_CUDA_SUCCESS(rmm::detail::async_alloc::cudaFreeAsync(ptr, stream.value())); - } -#else - (void)ptr; - (void)bytes; - (void)stream; -#endif + if (ptr != nullptr) { RMM_ASSERT_CUDA_SUCCESS(cudaFreeAsync(ptr, stream.value())); } } /** diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 6f8303c83..419c4fcf4 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -21,13 +21,13 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -647,37 +647,42 @@ class global_arena final { * * @param logger the spdlog logger to use */ - void dump_memory_log(std::shared_ptr<spdlog::logger> const& logger) const + RMM_HIDDEN void dump_memory_log(std::shared_ptr<spdlog::logger> const& logger) const { std::lock_guard<std::mutex> lock(mtx_); - logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); - logger->info(" # superblocks: {}", superblocks_.size()); + logger->info(rmm::detail::formatted_log(" Arena size: %s", + rmm::detail::format_bytes(upstream_block_.size()))); + logger->info(rmm::detail::formatted_log(" # superblocks: %zu", superblocks_.size())); if (!superblocks_.empty()) { - logger->debug(" Total size of superblocks: {}", - rmm::detail::bytes{total_memory_size(superblocks_)}); + logger->debug( + rmm::detail::formatted_log(" Total size of superblocks: %s", + rmm::detail::format_bytes(total_memory_size(superblocks_)))); auto const total_free = total_free_size(superblocks_); auto const max_free = max_free_size(superblocks_); auto const fragmentation = (1 - max_free / static_cast<double>(total_free)) * 100; - logger->info(" Total free memory: {}", rmm::detail::bytes{total_free}); - logger->info(" Largest block of free memory: {}", rmm::detail::bytes{max_free}); - logger->info(" Fragmentation: {:.2f}%", fragmentation); + logger->info(rmm::detail::formatted_log(" Total free memory: %s", + rmm::detail::format_bytes(total_free))); + logger->info(rmm::detail::formatted_log(" Largest block of free memory: %s", + rmm::detail::format_bytes(max_free))); + logger->info(rmm::detail::formatted_log(" Fragmentation: %0.2f", fragmentation)); - auto index = 0; + auto index = decltype(superblocks_.size()){0}; char* prev_end{}; for (auto const& sblk : superblocks_) { if (prev_end == nullptr) { prev_end = sblk.pointer(); } - logger->debug( - " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}, " - "gap={}", + logger->debug(rmm::detail::formatted_log( + " Superblock %zu: start=%p, end=%p, size=%s, empty=%s, # free blocks=%zu, max " + "free=%s, " + "gap=%s", index, - fmt::ptr(sblk.pointer()), - fmt::ptr(sblk.end()), - rmm::detail::bytes{sblk.size()}, - sblk.empty(), + sblk.pointer(), + sblk.end(), + 
rmm::detail::format_bytes(sblk.size()), + sblk.empty() ? "T" : "F", sblk.free_blocks(), - rmm::detail::bytes{sblk.max_free_size()}, - rmm::detail::bytes{static_cast<std::size_t>(sblk.pointer() - prev_end)}); + rmm::detail::format_bytes(sblk.max_free_size()), + rmm::detail::format_bytes(static_cast<std::size_t>(sblk.pointer() - prev_end)))); prev_end = sblk.end(); index++; } diff --git a/include/rmm/mr/device/detail/coalescing_free_list.hpp b/include/rmm/mr/device/detail/coalescing_free_list.hpp index 8d5cbf9ed..8b056e6d9 100644 --- a/include/rmm/mr/device/detail/coalescing_free_list.hpp +++ b/include/rmm/mr/device/detail/coalescing_free_list.hpp @@ -20,8 +20,6 @@ #include #include -#include - #include #include #include @@ -131,10 +129,7 @@ struct block : public block_base { /** * @brief Print this block. For debugging. */ - inline void print() const - { - std::cout << fmt::format("{} {} B", fmt::ptr(pointer()), size()) << std::endl; - } + inline void print() const { std::cout << pointer() << " " << size() << " B" << std::endl; } #endif private: @@ -146,7 +141,7 @@ struct block : public block_base { /// Print block on an ostream inline std::ostream& operator<<(std::ostream& out, const block& blk) { - out << fmt::format("{} {} B\n", fmt::ptr(blk.pointer()), blk.size()); + out << blk.pointer() << " " << blk.size() << " B" << std::endl; return out; } #endif diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp index 9cf674d6e..f177504f2 100644 --- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp +++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp @@ -19,13 +19,12 @@ #include #include #include +#include #include #include #include -#include - #include #include #include @@ -201,7 +200,7 @@ class stream_ordered_memory_resource : public crtp, public device_ */ void* do_allocate(std::size_t size, cuda_stream_view stream) override { - RMM_LOG_TRACE("[A][stream {:p}][{}B]", fmt::ptr(stream.value()), size); + RMM_LOG_TRACE("[A][stream %s][%zuB]", rmm::detail::format_stream(stream), size); if (size <= 0) { return nullptr; } @@ -215,10 +214,10 @@ class stream_ordered_memory_resource : public crtp, public device_ rmm::out_of_memory); auto const block = this->underlying().get_block(size, stream_event); - RMM_LOG_TRACE("[A][stream {:p}][{}B][{:p}]", - fmt::ptr(stream_event.stream), + RMM_LOG_TRACE("[A][stream %s][%zuB][%p]", + rmm::detail::format_stream(stream_event.stream), size, - fmt::ptr(block.pointer())); + block.pointer()); log_summary_trace(); @@ -234,7 +233,7 @@ class stream_ordered_memory_resource : public crtp, public device_ */ void do_deallocate(void* ptr, std::size_t size, cuda_stream_view stream) override { - RMM_LOG_TRACE("[D][stream {:p}][{}B][{:p}]", fmt::ptr(stream.value()), size, ptr); + RMM_LOG_TRACE("[D][stream %s][%zuB][%p]", rmm::detail::format_stream(stream), size, ptr); if (size <= 0 || ptr == nullptr) { return; } @@ -384,10 +383,10 @@ class stream_ordered_memory_resource : public crtp, public device_ if (merge_first) { merge_lists(stream_event, blocks, other_event, std::move(other_blocks)); - RMM_LOG_DEBUG("[A][Stream {:p}][{}B][Merged stream {:p}]", - fmt::ptr(stream_event.stream), + RMM_LOG_DEBUG("[A][Stream %s][%zuB][Merged stream %s]", + rmm::detail::format_stream(stream_event.stream), size, - fmt::ptr(iter->first.stream)); + rmm::detail::format_stream(iter->first.stream)); stream_free_blocks_.erase(iter); @@ -414,11 +413,11 @@ class stream_ordered_memory_resource : public crtp, 
public device_ block_type const block = find_block(iter); if (block.is_valid()) { - RMM_LOG_DEBUG((merge_first) ? "[A][Stream {:p}][{}B][Found after merging stream {:p}]" - : "[A][Stream {:p}][{}B][Taken from stream {:p}]", - fmt::ptr(stream_event.stream), + RMM_LOG_DEBUG((merge_first) ? "[A][Stream %s][%zuB][Found after merging stream %s]" + : "[A][Stream %s][%zuB][Taken from stream %s]", + rmm::detail::format_stream(stream_event.stream), size, - fmt::ptr(iter->first.stream)); + rmm::detail::format_stream(iter->first.stream)); return block; } } @@ -471,7 +470,7 @@ class stream_ordered_memory_resource : public crtp, public device_ max_block = std::max(summary.first, max_block); free_mem += summary.second; }); - RMM_LOG_TRACE("[Summary][Free lists: {}][Blocks: {}][Max Block: {}][Total Free: {}]", + RMM_LOG_TRACE("[Summary][Free lists: %zu][Blocks: %zu][Max Block: %zu][Total Free: %zu]", stream_free_blocks_.size(), num_blocks, max_block, diff --git a/include/rmm/mr/device/logging_resource_adaptor.hpp b/include/rmm/mr/device/logging_resource_adaptor.hpp index 595ab2e4e..578543852 100644 --- a/include/rmm/mr/device/logging_resource_adaptor.hpp +++ b/include/rmm/mr/device/logging_resource_adaptor.hpp @@ -18,16 +18,17 @@ #include #include #include +#include #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -297,10 +298,12 @@ class logging_resource_adaptor final : public device_memory_resource { { try { auto const ptr = get_upstream_resource().allocate_async(bytes, stream); - logger_->info("allocate,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); + logger_->info(rmm::detail::formatted_log( + "allocate,%p,%zu,%s", ptr, bytes, rmm::detail::format_stream(stream))); return ptr; } catch (...) { - logger_->info("allocate failure,{},{},{}", nullptr, bytes, fmt::ptr(stream.value())); + logger_->info(rmm::detail::formatted_log( + "allocate failure,%p,%zu,%s", nullptr, bytes, rmm::detail::format_stream(stream))); throw; } } @@ -321,7 +324,8 @@ class logging_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { - logger_->info("free,{},{},{}", ptr, bytes, fmt::ptr(stream.value())); + logger_->info( + rmm::detail::formatted_log("free,%p,%zu,%s", ptr, bytes, rmm::detail::format_stream(stream))); get_upstream_resource().deallocate_async(ptr, bytes, stream); } diff --git a/include/rmm/mr/device/pool_memory_resource.hpp b/include/rmm/mr/device/pool_memory_resource.hpp index f63de21ff..037147de3 100644 --- a/include/rmm/mr/device/pool_memory_resource.hpp +++ b/include/rmm/mr/device/pool_memory_resource.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -34,8 +35,6 @@ #include #include -#include - #include #include #include @@ -271,8 +270,8 @@ class pool_memory_resource final } try_size = std::max(min_size, try_size / 2); } - RMM_LOG_ERROR("[A][Stream {}][Upstream {}B][FAILURE maximum pool size exceeded]", - fmt::ptr(stream.value()), + RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded]", + rmm::detail::format_stream(stream), min_size); RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); } @@ -351,7 +350,7 @@ class pool_memory_resource final */ std::optional block_from_upstream(std::size_t size, cuda_stream_view stream) { - RMM_LOG_DEBUG("[A][Stream {}][Upstream {}B]", fmt::ptr(stream.value()), size); + RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), 
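The `logging_resource_adaptor` hunk above swaps fmt-style `{}` placeholders for printf-style ones, but the CSV event log it emits (action, pointer, bytes, stream) is unchanged. A usage sketch via the public Python wrapper; the log file name is an illustrative choice:

```python
import rmm

upstream = rmm.mr.CudaMemoryResource()
mr = rmm.mr.LoggingResourceAdaptor(upstream, log_file_name="rmm_log.csv")
rmm.mr.set_current_device_resource(mr)

buf = rmm.DeviceBuffer(size=4096)  # logged as "allocate,<ptr>,4096,<stream>"
del buf                            # logged as "free,<ptr>,4096,<stream>"
mr.flush()                         # ensure buffered log lines reach the file
```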
size); if (size == 0) { return {}; } diff --git a/include/rmm/mr/device/tracking_resource_adaptor.hpp b/include/rmm/mr/device/tracking_resource_adaptor.hpp index 6a5916e5c..8131eef4d 100644 --- a/include/rmm/mr/device/tracking_resource_adaptor.hpp +++ b/include/rmm/mr/device/tracking_resource_adaptor.hpp @@ -23,8 +23,6 @@ #include #include -#include - #include #include #include @@ -188,7 +186,7 @@ class tracking_resource_adaptor final : public device_memory_resource { void log_outstanding_allocations() const { #if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG - RMM_LOG_DEBUG("Outstanding Allocations: {}", get_outstanding_allocations_str()); + RMM_LOG_DEBUG("Outstanding Allocations: %s", get_outstanding_allocations_str()); #endif // SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG } @@ -236,12 +234,12 @@ class tracking_resource_adaptor final : public device_memory_resource { // Ensure the allocation is found and the number of bytes match if (found == allocations_.end()) { - // Don't throw but log an error. Throwing in a descructor (or any noexcept) will call + // Don't throw but log an error. Throwing in a destructor (or any noexcept) will call // std::terminate RMM_LOG_ERROR( - "Deallocating a pointer that was not tracked. Ptr: {:p} [{}B], Current Num. Allocations: " - "{}", - fmt::ptr(ptr), + "Deallocating a pointer that was not tracked. Ptr: %p [%zuB], Current Num. Allocations: " + "%zu", + ptr, bytes, this->allocations_.size()); } else { @@ -250,10 +248,10 @@ class tracking_resource_adaptor final : public device_memory_resource { auto allocated_bytes = found->second.allocation_size; if (allocated_bytes != bytes) { - // Don't throw but log an error. Throwing in a descructor (or any noexcept) will call + // Don't throw but log an error. Throwing in a destructor (or any noexcept) will call // std::terminate RMM_LOG_ERROR( - "Alloc bytes ({}) and Dealloc bytes ({}) do not match", allocated_bytes, bytes); + "Alloc bytes (%zu) and Dealloc bytes (%zu) do not match", allocated_bytes, bytes); bytes = allocated_bytes; } diff --git a/python/librmm/pyproject.toml b/python/librmm/pyproject.toml index 0f0b4e397..bae2ef36b 100644 --- a/python/librmm/pyproject.toml +++ b/python/librmm/pyproject.toml @@ -67,3 +67,11 @@ wheel.py-api = "py3" provider = "scikit_build_core.metadata.regex" input = "librmm/VERSION" regex = "(?P.*)" + +[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' diff --git a/python/rmm/CMakeLists.txt b/python/rmm/CMakeLists.txt index 6c2515102..ac8495e14 100644 --- a/python/rmm/CMakeLists.txt +++ b/python/rmm/CMakeLists.txt @@ -30,4 +30,5 @@ rapids_cython_init() add_compile_definitions("SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}") add_subdirectory(rmm/_cuda) -add_subdirectory(rmm/_lib) +add_subdirectory(rmm/librmm) +add_subdirectory(rmm/pylibrmm) diff --git a/python/rmm/docs/conf.py b/python/rmm/docs/conf.py index d48dc2b42..2aad3a82c 100644 --- a/python/rmm/docs/conf.py +++ b/python/rmm/docs/conf.py @@ -12,6 +12,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
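The `tracking_resource_adaptor` changes above only touch how mismatched deallocations are reported (they are logged rather than thrown, since the report can happen on a noexcept path). A short sketch of normal tracking from Python, using the documented adaptor API:

```python
import rmm

mr = rmm.mr.TrackingResourceAdaptor(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)

buf = rmm.DeviceBuffer(size=1 << 20)
print(mr.get_allocated_bytes())  # 1048576 while buf is alive
del buf
print(mr.get_allocated_bytes())  # 0 once the buffer is released
```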
# +import datetime import os import re @@ -22,8 +23,8 @@ # -- Project information ----------------------------------------------------- project = "rmm" -copyright = "2020-2023, NVIDIA" -author = "NVIDIA" +copyright = f"2018-{datetime.datetime.today().year}, NVIDIA Corporation" +author = "NVIDIA Corporation" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -118,19 +119,6 @@ html_theme = "sphinx_rtd_theme" -# on_rtd is whether we are on readthedocs.org -on_rtd = os.environ.get("READTHEDOCS", None) == "True" - -if not on_rtd: - # only import and set the theme if we're building docs locally - # otherwise, readthedocs.org uses their theme by default, - # so no need to specify it - import sphinx_rtd_theme - - html_theme = "sphinx_rtd_theme" - html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - - # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. @@ -209,7 +197,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), "numba": ("https://numba.readthedocs.io/en/stable", None), - "cuda-python": ("https://nvidia.github.io/cuda-python/", None), + "cuda-python": ( + "https://nvidia.github.io/cuda-python/cuda-bindings/", + None, + ), } # Config numpydoc diff --git a/python/rmm/docs/guide.md b/python/rmm/docs/guide.md index 22c0dc023..c7e940497 100644 --- a/python/rmm/docs/guide.md +++ b/python/rmm/docs/guide.md @@ -236,17 +236,17 @@ Common to both usages is that they modify the currently active RMM memory resour >>> # We start with the default cuda memory resource >>> rmm.mr.get_current_device_resource() - + >>> # When using statistics, we get a StatisticsResourceAdaptor with the context >>> with rmm.statistics.statistics(): ... rmm.mr.get_current_device_resource() - + >>> # We can also enable statistics globally >>> rmm.statistics.enable_statistics() >>> print(rmm.mr.get_current_device_resource()) - + ``` With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource: diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index 7577ad961..aaaa15482 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -30,7 +30,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", "numba>=0.57", "numpy>=1.23,<3.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. @@ -128,14 +128,23 @@ dependencies-file = "../../dependencies.yaml" matrix-entry = "cuda_suffixed=true" requires = [ "cmake>=3.26.4,!=3.30.0", - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", "cython>=3.0.0", - "librmm==24.10.*,>=0.0.0a0", + "librmm==24.12.*,>=0.0.0a0", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
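As a companion to the guide.md excerpt above, here is a minimal sketch of enabling statistics globally and reading them back; field names follow the documented `rmm.statistics` API:

```python
import rmm
import rmm.statistics

rmm.statistics.enable_statistics()  # wraps the current device resource
buf = rmm.DeviceBuffer(size=2048)

stats = rmm.statistics.get_statistics()
# 2048 bytes / 1 allocation, plus any other live allocations made since enabling
print(stats.current_bytes, stats.current_count, stats.peak_bytes)
```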
+[tool.pydistcheck] +select = [ + "distro-too-large-compressed", +] + +# PyPI limit is 100 MiB, fail CI before we get too close to that +max_allowed_size_compressed = '75M' + [tool.pytest.ini_options] # treat warnings as errors filterwarnings = [ "error", + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning" ] diff --git a/python/rmm/rmm/__init__.py b/python/rmm/rmm/__init__.py index 1e3b5c8b1..832fec095 100644 --- a/python/rmm/rmm/__init__.py +++ b/python/rmm/rmm/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings + from rmm import mr -from rmm._lib.device_buffer import DeviceBuffer -from rmm._lib.logger import ( +from rmm._version import __git_commit__, __version__ +from rmm.mr import disable_logging, enable_logging, get_log_filenames +from rmm.pylibrmm.device_buffer import DeviceBuffer +from rmm.pylibrmm.logger import ( flush_logger, get_flush_level, get_logging_level, @@ -23,8 +27,6 @@ set_logging_level, should_log, ) -from rmm._version import __git_commit__, __version__ -from rmm.mr import disable_logging, enable_logging, get_log_filenames from rmm.rmm import ( RMMError, is_initialized, @@ -52,3 +54,19 @@ "should_log", "unregister_reinitialize_hook", ] + + +def __getattr__(name): + if name == "_lib": + import importlib + + warnings.warn( + "The `rmm._lib` module is deprecated and will be removed in a future release. Use `rmm.pylibrmm` instead.", + FutureWarning, + stacklevel=2, + ) + + module = importlib.import_module("rmm.pylibrmm") + return module + else: + raise AttributeError(f"Module '{__name__}' has no attribute '{name}'")
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,4 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .device_buffer import DeviceBuffer +import warnings + +from rmm.pylibrmm import * + +warnings.warn( + "The `rmm._lib` module is deprecated and will be removed in a future release. Use `rmm.pylibrmm` instead.", + FutureWarning, + stacklevel=2, +) diff --git a/python/rmm/rmm/_lib/cuda_stream.pxd b/python/rmm/rmm/_lib/cuda_stream.pxd index e224cf9af..afc365fbb 100644 --- a/python/rmm/rmm/_lib/cuda_stream.pxd +++ b/python/rmm/rmm/_lib/cuda_stream.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,26 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -cimport cython -from cuda.ccudart cimport cudaStream_t -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream: - cuda_stream() except + - bool is_valid() except + - cudaStream_t value() except + - cuda_stream_view view() except + - void synchronize() except + - void synchronize_no_throw() - - -@cython.final -cdef class CudaStream: - cdef unique_ptr[cuda_stream] c_obj - cdef cudaStream_t value(self) except * nogil - cdef bool is_valid(self) except * nogil +from rmm.librmm.cuda_stream cimport cuda_stream +from rmm.pylibrmm.cuda_stream cimport CudaStream diff --git a/python/rmm/rmm/_lib/lib.pxd b/python/rmm/rmm/_lib/cuda_stream.py similarity index 70% rename from python/rmm/rmm/_lib/lib.pxd rename to python/rmm/rmm/_lib/cuda_stream.py index e35b672e4..1eb424e12 100644 --- a/python/rmm/rmm/_lib/lib.pxd +++ b/python/rmm/rmm/_lib/cuda_stream.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,9 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from libc.stdint cimport uintptr_t -from libcpp cimport bool -from libcpp.utility cimport pair -from libcpp.vector cimport vector - -ctypedef pair[const char*, unsigned int] caller_pair +from rmm.pylibrmm.cuda_stream import CudaStream # noqa: F401 diff --git a/python/rmm/rmm/_lib/cuda_stream_pool.pxd b/python/rmm/rmm/_lib/cuda_stream_pool.pxd index 0286a9377..4da59cc68 100644 --- a/python/rmm/rmm/_lib/cuda_stream_pool.pxd +++ b/python/rmm/rmm/_lib/cuda_stream_pool.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
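The `__getattr__` shim added above means old `rmm._lib` attribute access keeps working but warns. A sketch of what downstream code observes (the `catch_warnings` scaffolding is only there to make the warning visible in a script):

```python
import warnings

import rmm

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = rmm._lib  # resolved by rmm.__getattr__, returns rmm.pylibrmm
assert any(issubclass(w.category, FutureWarning) for w in caught)

# New code should import from the new location directly:
from rmm.pylibrmm.device_buffer import DeviceBuffer
```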
-cimport cython - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/cuda_stream_pool.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream_pool: - cuda_stream_pool(size_t pool_size) - cuda_stream_view get_stream() - cuda_stream_view get_stream(size_t stream_id) except + - size_t get_pool_size() +from rmm.librmm.cuda_stream_pool cimport cuda_stream_pool diff --git a/python/rmm/rmm/_lib/cuda_stream_view.pxd b/python/rmm/rmm/_lib/cuda_stream_view.pxd index bf0d33c24..c336b0fe8 100644 --- a/python/rmm/rmm/_lib/cuda_stream_view.pxd +++ b/python/rmm/rmm/_lib/cuda_stream_view.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,21 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cuda.ccudart cimport cudaStream_t -from libcpp cimport bool - - -cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: - cdef cppclass cuda_stream_view: - cuda_stream_view() - cuda_stream_view(cudaStream_t) - cudaStream_t value() - bool is_default() - bool is_per_thread_default() - void synchronize() except + - - cdef bool operator==(cuda_stream_view const, cuda_stream_view const) - - const cuda_stream_view cuda_stream_default - const cuda_stream_view cuda_stream_legacy - const cuda_stream_view cuda_stream_per_thread +from rmm.librmm.cuda_stream_view cimport ( + cuda_stream_default, + cuda_stream_legacy, + cuda_stream_per_thread, + cuda_stream_view, +) diff --git a/python/rmm/rmm/_lib/device_buffer.pxd b/python/rmm/rmm/_lib/device_buffer.pxd index 0da9ace0c..22833b1b8 100644 --- a/python/rmm/rmm/_lib/device_buffer.pxd +++ b/python/rmm/rmm/_lib/device_buffer.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,105 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr - -from rmm._cuda.stream cimport Stream -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.memory_resource cimport ( - DeviceMemoryResource, - device_memory_resource, +from rmm.librmm.device_buffer cimport ( + cuda_device_id, + device_buffer, + get_current_cuda_device, + prefetch, +) +from rmm.pylibrmm.device_buffer cimport ( + DeviceBuffer, + copy_device_to_ptr, + copy_host_to_ptr, + copy_ptr_to_host, + to_device, ) - - -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef cppclass cuda_device_id: - ctypedef int value_type - cuda_device_id() - cuda_device_id(value_type id) - value_type value() - - cdef cuda_device_id get_current_cuda_device() - -cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil: - cdef void prefetch(const void* ptr, - size_t bytes, - cuda_device_id device, - cuda_stream_view stream) except + - -cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: - cdef cppclass device_buffer: - device_buffer() - device_buffer( - size_t size, - cuda_stream_view stream, - device_memory_resource * - ) except + - device_buffer( - const void* source_data, - size_t size, - cuda_stream_view stream, - device_memory_resource * - ) except + - device_buffer( - const device_buffer buf, - cuda_stream_view stream, - device_memory_resource * - ) except + - void reserve(size_t new_capacity, cuda_stream_view stream) except + - void resize(size_t new_size, cuda_stream_view stream) except + - void shrink_to_fit(cuda_stream_view stream) except + - void* data() - size_t size() - size_t capacity() - - -cdef class DeviceBuffer: - cdef unique_ptr[device_buffer] c_obj - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr - - # Holds a reference to the stream used by the underlying `device_buffer`. - # Ensures the stream does not get destroyed before this DeviceBuffer - cdef Stream stream - - @staticmethod - cdef DeviceBuffer c_from_unique_ptr( - unique_ptr[device_buffer] ptr, - Stream stream=*, - DeviceMemoryResource mr=*, - ) - - @staticmethod - cdef DeviceBuffer c_to_device(const unsigned char[::1] b, - Stream stream=*) except * - cpdef copy_to_host(self, ary=*, Stream stream=*) - cpdef copy_from_host(self, ary, Stream stream=*) - cpdef copy_from_device(self, cuda_ary, Stream stream=*) - cpdef bytes tobytes(self, Stream stream=*) - - cdef size_t c_size(self) except * - cpdef void reserve(self, size_t new_capacity, Stream stream=*) except * - cpdef void resize(self, size_t new_size, Stream stream=*) except * - cpdef size_t capacity(self) except * - cdef void* c_data(self) except * - - cdef device_buffer c_release(self) except * - -cpdef DeviceBuffer to_device(const unsigned char[::1] b, - Stream stream=*) -cpdef void copy_ptr_to_host(uintptr_t db, - unsigned char[::1] hb, - Stream stream=*) except * - -cpdef void copy_host_to_ptr(const unsigned char[::1] hb, - uintptr_t db, - Stream stream=*) except * - -cpdef void copy_device_to_ptr(uintptr_t d_src, - uintptr_t d_dst, - size_t count, - Stream stream=*) except * diff --git a/python/rmm/rmm/_lib/device_buffer.py b/python/rmm/rmm/_lib/device_buffer.py new file mode 100644 index 000000000..c531bca5f --- /dev/null +++ b/python/rmm/rmm/_lib/device_buffer.py @@ -0,0 +1,21 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.device_buffer import ( # noqa: F401 + DeviceBuffer, + copy_device_to_ptr, + copy_host_to_ptr, + copy_ptr_to_host, + to_device, +) diff --git a/python/rmm/rmm/_lib/device_uvector.pxd b/python/rmm/rmm/_lib/device_uvector.pxd index 29e122bbf..230b0afb3 100644 --- a/python/rmm/rmm/_lib/device_uvector.pxd +++ b/python/rmm/rmm/_lib/device_uvector.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,28 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.device_buffer cimport device_buffer -from rmm._lib.memory_resource cimport device_memory_resource - - -cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: - cdef cppclass device_uvector[T]: - device_uvector(size_t size, cuda_stream_view stream) except + - T* element_ptr(size_t index) - void set_element(size_t element_index, const T& v, cuda_stream_view s) - void set_element_async( - size_t element_index, - const T& v, - cuda_stream_view s - ) except + - T front_element(cuda_stream_view s) except + - T back_element(cuda_stream_view s) except + - void reserve(size_t new_capacity, cuda_stream_view stream) except + - void resize(size_t new_size, cuda_stream_view stream) except + - void shrink_to_fit(cuda_stream_view stream) except + - device_buffer release() - size_t capacity() - T* data() - size_t size() - device_memory_resource* memory_resource() +from rmm.librmm.device_uvector cimport device_uvector diff --git a/python/rmm/rmm/_lib/helper.pxd b/python/rmm/rmm/_lib/helper.pxd index 8ca151c00..4a5159435 100644 --- a/python/rmm/rmm/_lib/helper.pxd +++ b/python/rmm/rmm/_lib/helper.pxd @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. - -cdef object parse_bytes(object s) except * +from rmm.pylibrmm.helper cimport parse_bytes diff --git a/python/rmm/rmm/_lib/logger.pxd b/python/rmm/rmm/_lib/logger.pxd new file mode 100644 index 000000000..bef05c903 --- /dev/null +++ b/python/rmm/rmm/_lib/logger.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from rmm.librmm._logger cimport logger, logging_level, spdlog_logger +from rmm.pylibrmm.logger cimport ( + _validate_level_type, + flush_logger, + get_flush_level, + get_logging_level, + set_flush_level, + set_logging_level, + should_log, +) diff --git a/python/rmm/rmm/_lib/logger.py b/python/rmm/rmm/_lib/logger.py new file mode 100644 index 000000000..1e9b519b8 --- /dev/null +++ b/python/rmm/rmm/_lib/logger.py @@ -0,0 +1,24 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm._logger import logging_level # noqa: F401 +from rmm.pylibrmm.logger import ( # noqa: F401 + _validate_level_type, + flush_logger, + get_flush_level, + get_logging_level, + set_flush_level, + set_logging_level, + should_log, +) diff --git a/python/rmm/rmm/_lib/memory_resource.pxd b/python/rmm/rmm/_lib/memory_resource.pxd index 000a3fe1e..0d11001a4 100644 --- a/python/rmm/rmm/_lib/memory_resource.pxd +++ b/python/rmm/rmm/_lib/memory_resource.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,92 +12,51 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from libc.stdint cimport int8_t -from libcpp.memory cimport shared_ptr -from libcpp.pair cimport pair -from libcpp.string cimport string -from libcpp.vector cimport vector - -from rmm._lib.cuda_stream_view cimport cuda_stream_view - - -cdef extern from "rmm/mr/device/device_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass device_memory_resource: - void* allocate(size_t bytes) except + - void* allocate(size_t bytes, cuda_stream_view stream) except + - void deallocate(void* ptr, size_t bytes) except + - void deallocate( - void* ptr, - size_t bytes, - cuda_stream_view stream - ) except + - -cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil: - size_t percent_of_free_device_memory(int percent) except + - pair[size_t, size_t] available_device_memory() except + - -cdef class DeviceMemoryResource: - cdef shared_ptr[device_memory_resource] c_obj - cdef device_memory_resource* get_mr(self) noexcept nogil - -cdef class UpstreamResourceAdaptor(DeviceMemoryResource): - cdef readonly DeviceMemoryResource upstream_mr - - cpdef DeviceMemoryResource get_upstream(self) - -cdef class CudaMemoryResource(DeviceMemoryResource): - pass - -cdef class ManagedMemoryResource(DeviceMemoryResource): - pass - -cdef class SystemMemoryResource(DeviceMemoryResource): - pass - -cdef class SamHeadroomMemoryResource(DeviceMemoryResource): - pass - -cdef class CudaAsyncMemoryResource(DeviceMemoryResource): - pass - -cdef class PoolMemoryResource(UpstreamResourceAdaptor): - pass - -cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): - pass - -cdef class BinningMemoryResource(UpstreamResourceAdaptor): - - cdef readonly list _bin_mrs - - cpdef add_bin( - self, - size_t allocation_size, - DeviceMemoryResource bin_resource=*) - -cdef class CallbackMemoryResource(DeviceMemoryResource): - cdef object _allocate_func - cdef object _deallocate_func - -cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor): - cdef object _log_file_name - cpdef get_file_name(self) - cpdef flush(self) - -cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): - pass - -cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): - cdef object _callback - -cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): - pass - -cpdef DeviceMemoryResource get_current_device_resource() +from rmm.librmm.memory_resource cimport ( + CppExcept, + allocate_callback_t, + allocation_handle_type, + available_device_memory, + binning_memory_resource, + callback_memory_resource, + cuda_async_memory_resource, + cuda_memory_resource, + deallocate_callback_t, + device_memory_resource, + failure_callback_resource_adaptor, + failure_callback_t, + fixed_size_memory_resource, + limiting_resource_adaptor, + logging_resource_adaptor, + managed_memory_resource, + percent_of_free_device_memory, + pool_memory_resource, + prefetch_resource_adaptor, + sam_headroom_memory_resource, + statistics_resource_adaptor, + system_memory_resource, + throw_cpp_except, + tracking_resource_adaptor, + translate_python_except_to_cpp, +) +from rmm.pylibrmm.memory_resource cimport ( + ArenaMemoryResource, + BinningMemoryResource, + CallbackMemoryResource, + CudaAsyncMemoryResource, + CudaMemoryResource, + DeviceMemoryResource, + FailureCallbackResourceAdaptor, + FixedSizeMemoryResource, + LimitingResourceAdaptor, + LoggingResourceAdaptor, + ManagedMemoryResource, + PoolMemoryResource, + 
PrefetchResourceAdaptor, + SamHeadroomMemoryResource, + StatisticsResourceAdaptor, + SystemMemoryResource, + TrackingResourceAdaptor, + UpstreamResourceAdaptor, + get_current_device_resource, +) diff --git a/python/rmm/rmm/_lib/memory_resource.py b/python/rmm/rmm/_lib/memory_resource.py new file mode 100644 index 000000000..f3a24f635 --- /dev/null +++ b/python/rmm/rmm/_lib/memory_resource.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.pylibrmm.memory_resource import ( # noqa: F401 + ArenaMemoryResource, + BinningMemoryResource, + CallbackMemoryResource, + CudaAsyncMemoryResource, + CudaMemoryResource, + DeviceMemoryResource, + FailureCallbackResourceAdaptor, + FixedSizeMemoryResource, + LimitingResourceAdaptor, + LoggingResourceAdaptor, + ManagedMemoryResource, + PoolMemoryResource, + PrefetchResourceAdaptor, + SamHeadroomMemoryResource, + StatisticsResourceAdaptor, + SystemMemoryResource, + TrackingResourceAdaptor, + UpstreamResourceAdaptor, + _flush_logs, + available_device_memory, + disable_logging, + enable_logging, + get_current_device_resource, + get_current_device_resource_type, + get_log_filenames, + get_per_device_resource_type, + is_initialized, + set_current_device_resource, + set_per_device_resource, +) diff --git a/python/rmm/rmm/_lib/per_device_resource.pxd b/python/rmm/rmm/_lib/per_device_resource.pxd index c33217622..29487f503 100644 --- a/python/rmm/rmm/_lib/per_device_resource.pxd +++ b/python/rmm/rmm/_lib/per_device_resource.pxd @@ -1,23 +1,21 @@ -from rmm._lib.memory_resource cimport device_memory_resource +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
- -cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: - cdef cppclass cuda_device_id: - ctypedef int value_type - - cuda_device_id(value_type id) - - value_type value() - -cdef extern from "rmm/mr/device/per_device_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef device_memory_resource* set_current_device_resource( - device_memory_resource* new_mr - ) - cdef device_memory_resource* get_current_device_resource() - cdef device_memory_resource* set_per_device_resource( - cuda_device_id id, device_memory_resource* new_mr - ) - cdef device_memory_resource* get_per_device_resource ( - cuda_device_id id - ) +from rmm.librmm.per_device_resource cimport ( + cuda_device_id, + get_current_device_resource, + get_per_device_resource, + set_current_device_resource, + set_per_device_resource, +) diff --git a/python/rmm/rmm/allocators/cupy.py b/python/rmm/rmm/allocators/cupy.py index 89947c46b..780ff2abf 100644 --- a/python/rmm/rmm/allocators/cupy.py +++ b/python/rmm/rmm/allocators/cupy.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from rmm import _lib as librmm +from rmm import pylibrmm from rmm._cuda.stream import Stream try: @@ -34,7 +34,7 @@ def rmm_cupy_allocator(nbytes): raise ModuleNotFoundError("No module named 'cupy'") stream = Stream(obj=cupy.cuda.get_current_stream()) - buf = librmm.device_buffer.DeviceBuffer(size=nbytes, stream=stream) + buf = pylibrmm.device_buffer.DeviceBuffer(size=nbytes, stream=stream) dev_id = -1 if buf.ptr else cupy.cuda.device.get_device_id() mem = cupy.cuda.UnownedMemory( ptr=buf.ptr, size=buf.size, owner=buf, device_id=dev_id diff --git a/python/rmm/rmm/allocators/numba.py b/python/rmm/rmm/allocators/numba.py index 5e87b87b6..fd9bacb5a 100644 --- a/python/rmm/rmm/allocators/numba.py +++ b/python/rmm/rmm/allocators/numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ from numba import config, cuda from numba.cuda import HostOnlyCUDAMemoryManager, IpcHandle, MemoryPointer -from rmm import _lib as librmm +from rmm import pylibrmm def _make_emm_plugin_finalizer(handle, allocations): @@ -70,7 +70,7 @@ def memalloc(self, size): """ Allocate an on-device array from the RMM pool. """ - buf = librmm.DeviceBuffer(size=size) + buf = pylibrmm.DeviceBuffer(size=size) ctx = self.context if config.CUDA_USE_NVIDIA_BINDING: diff --git a/python/rmm/rmm/allocators/torch.py b/python/rmm/rmm/allocators/torch.py index 753da66da..eee0e9df9 100644 --- a/python/rmm/rmm/allocators/torch.py +++ b/python/rmm/rmm/allocators/torch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,10 +28,10 @@ # allocator .so relative to the current file because the current file # is pure Python and will therefore be in the source directory. 
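The allocator modules above change only their internal import (`rmm._lib` to `rmm.pylibrmm`); the public entry points are untouched. For instance, the CuPy hookup still works as documented:

```python
import cupy

from rmm.allocators.cupy import rmm_cupy_allocator

cupy.cuda.set_allocator(rmm_cupy_allocator)
arr = cupy.zeros(1_000_000)  # each CuPy allocation is now backed by an RMM DeviceBuffer
```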
# Instead, we search relative to an arbitrary file in the compiled - # package. We use the _lib.lib module because it is small. - from rmm._lib import lib + # package. We use the librmm._logger module because it is small. + from rmm.librmm import _logger - sofile = pathlib.Path(lib.__file__).parent / "_torch_allocator.so" + sofile = pathlib.Path(_logger.__file__).parent / "_torch_allocator.so" rmm_torch_allocator = CUDAPluggableAllocator( str(sofile.absolute()), alloc_fn_name="allocate", diff --git a/python/rmm/rmm/_lib/CMakeLists.txt b/python/rmm/rmm/librmm/CMakeLists.txt similarity index 93% rename from python/rmm/rmm/_lib/CMakeLists.txt rename to python/rmm/rmm/librmm/CMakeLists.txt index 7cdfed971..5da2a1a01 100644 --- a/python/rmm/rmm/_lib/CMakeLists.txt +++ b/python/rmm/rmm/librmm/CMakeLists.txt @@ -12,8 +12,7 @@ # the License. # ============================================================================= -set(cython_sources device_buffer.pyx lib.pyx logger.pyx memory_resource.pyx cuda_stream.pyx - helper.pyx) +set(cython_sources _logger.pyx) set(linked_libraries rmm::rmm) # Build all of the Cython targets diff --git a/python/rmm/rmm/_lib/__init__.pxd b/python/rmm/rmm/librmm/__init__.py similarity index 100% rename from python/rmm/rmm/_lib/__init__.pxd rename to python/rmm/rmm/librmm/__init__.py diff --git a/python/rmm/rmm/librmm/_logger.pxd b/python/rmm/rmm/librmm/_logger.pxd new file mode 100644 index 000000000..fb2126b2f --- /dev/null +++ b/python/rmm/rmm/librmm/_logger.pxd @@ -0,0 +1,66 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from libcpp cimport bool + + +cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil: + cpdef enum logging_level "spdlog::level::level_enum": + """ + The debug logging level for RMM. + + Debug logging prints messages to a log file. See + `Debug Logging `_ + for more information. + + Valid levels, in decreasing order of verbosity, are TRACE, DEBUG, + INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO. 
+ + Examples + -------- + >>> import rmm + >>> rmm.logging_level.DEBUG + <logging_level.DEBUG: 1> + >>> rmm.logging_level.DEBUG.value + 1 + >>> rmm.logging_level.DEBUG.name + 'DEBUG' + + See Also + -------- + set_logging_level : Set the debug logging level + get_logging_level : Get the current debug logging level + """ + TRACE "spdlog::level::trace" + DEBUG "spdlog::level::debug" + INFO "spdlog::level::info" + WARN "spdlog::level::warn" + ERR "spdlog::level::err" + CRITICAL "spdlog::level::critical" + OFF "spdlog::level::off" + + +cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil: + cdef cppclass spdlog_logger "spdlog::logger": + spdlog_logger() except + + void set_level(logging_level level) + logging_level level() + void flush() except + + void flush_on(logging_level level) + logging_level flush_level() + bool should_log(logging_level msg_level) + + +cdef extern from "rmm/logger.hpp" namespace "rmm::detail" nogil: + cdef spdlog_logger& logger() except + diff --git a/python/rmm/rmm/librmm/_logger.pyx b/python/rmm/rmm/librmm/_logger.pyx new file mode 100644 index 000000000..4392cb106 --- /dev/null +++ b/python/rmm/rmm/librmm/_logger.pyx @@ -0,0 +1,15 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm._logger cimport logging_level # no-cython-lint diff --git a/python/rmm/rmm/_lib/_torch_allocator.cpp b/python/rmm/rmm/librmm/_torch_allocator.cpp similarity index 100% rename from python/rmm/rmm/_lib/_torch_allocator.cpp rename to python/rmm/rmm/librmm/_torch_allocator.cpp diff --git a/python/rmm/rmm/librmm/cuda_stream.pxd b/python/rmm/rmm/librmm/cuda_stream.pxd new file mode 100644 index 000000000..3f2ac3361 --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream.pxd @@ -0,0 +1,28 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream: + cuda_stream() except + + bool is_valid() except + + cudaStream_t value() except + + cuda_stream_view view() except + + void synchronize() except + + void synchronize_no_throw() diff --git a/python/rmm/rmm/librmm/cuda_stream_pool.pxd b/python/rmm/rmm/librmm/cuda_stream_pool.pxd new file mode 100644 index 000000000..4f2cbb36d --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream_pool.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
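The `logging_level` docstring above uses the top-level re-exports introduced earlier in this diff. A short sketch of the same API from user code:

```python
import rmm

rmm.set_logging_level(rmm.logging_level.DEBUG)
print(rmm.get_logging_level())                 # logging_level.DEBUG
print(rmm.should_log(rmm.logging_level.WARN))  # True: WARN is at least as severe as DEBUG
```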
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from rmm.librmm.cuda_stream_view cimport cuda_stream_view + + +cdef extern from "rmm/cuda_stream_pool.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_pool: + cuda_stream_pool(size_t pool_size) + cuda_stream_view get_stream() + cuda_stream_view get_stream(size_t stream_id) except + + size_t get_pool_size() diff --git a/python/rmm/rmm/librmm/cuda_stream_view.pxd b/python/rmm/rmm/librmm/cuda_stream_view.pxd new file mode 100644 index 000000000..bf0d33c24 --- /dev/null +++ b/python/rmm/rmm/librmm/cuda_stream_view.pxd @@ -0,0 +1,32 @@ +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool + + +cdef extern from "rmm/cuda_stream_view.hpp" namespace "rmm" nogil: + cdef cppclass cuda_stream_view: + cuda_stream_view() + cuda_stream_view(cudaStream_t) + cudaStream_t value() + bool is_default() + bool is_per_thread_default() + void synchronize() except + + + cdef bool operator==(cuda_stream_view const, cuda_stream_view const) + + const cuda_stream_view cuda_stream_default + const cuda_stream_view cuda_stream_legacy + const cuda_stream_view cuda_stream_per_thread diff --git a/python/rmm/rmm/librmm/device_buffer.pxd b/python/rmm/rmm/librmm/device_buffer.pxd new file mode 100644 index 000000000..1c503ac9a --- /dev/null +++ b/python/rmm/rmm/librmm/device_buffer.pxd @@ -0,0 +1,58 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view
+from rmm.librmm.memory_resource cimport device_memory_resource
+
+
+cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil:
+    cdef cppclass cuda_device_id:
+        ctypedef int value_type
+        cuda_device_id()
+        cuda_device_id(value_type id)
+        value_type value()
+
+    cdef cuda_device_id get_current_cuda_device()
+
+cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil:
+    cdef void prefetch(const void* ptr,
+                       size_t bytes,
+                       cuda_device_id device,
+                       cuda_stream_view stream) except +
+
+cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
+    cdef cppclass device_buffer:
+        device_buffer()
+        device_buffer(
+            size_t size,
+            cuda_stream_view stream,
+            device_memory_resource *
+        ) except +
+        device_buffer(
+            const void* source_data,
+            size_t size,
+            cuda_stream_view stream,
+            device_memory_resource *
+        ) except +
+        device_buffer(
+            const device_buffer buf,
+            cuda_stream_view stream,
+            device_memory_resource *
+        ) except +
+        void reserve(size_t new_capacity, cuda_stream_view stream) except +
+        void resize(size_t new_size, cuda_stream_view stream) except +
+        void shrink_to_fit(cuda_stream_view stream) except +
+        void* data()
+        size_t size()
+        size_t capacity()
diff --git a/python/rmm/rmm/librmm/device_uvector.pxd b/python/rmm/rmm/librmm/device_uvector.pxd
new file mode 100644
index 000000000..f560a9e38
--- /dev/null
+++ b/python/rmm/rmm/librmm/device_uvector.pxd
@@ -0,0 +1,39 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view
+from rmm.librmm.device_buffer cimport device_buffer
+from rmm.librmm.memory_resource cimport device_memory_resource
+
+
+cdef extern from "rmm/device_uvector.hpp" namespace "rmm" nogil:
+    cdef cppclass device_uvector[T]:
+        device_uvector(size_t size, cuda_stream_view stream) except +
+        T* element_ptr(size_t index)
+        void set_element(size_t element_index, const T& v, cuda_stream_view s)
+        void set_element_async(
+            size_t element_index,
+            const T& v,
+            cuda_stream_view s
+        ) except +
+        T front_element(cuda_stream_view s) except +
+        T back_element(cuda_stream_view s) except +
+        void reserve(size_t new_capacity, cuda_stream_view stream) except +
+        void resize(size_t new_size, cuda_stream_view stream) except +
+        void shrink_to_fit(cuda_stream_view stream) except +
+        device_buffer release()
+        size_t capacity()
+        T* data()
+        size_t size()
+        device_memory_resource* memory_resource()
diff --git a/python/rmm/rmm/librmm/memory_resource.pxd b/python/rmm/rmm/librmm/memory_resource.pxd
new file mode 100644
index 000000000..9e7b70c4f
--- /dev/null
+++ b/python/rmm/rmm/librmm/memory_resource.pxd
@@ -0,0 +1,239 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This import is needed for Cython typing in translate_python_except_to_cpp
+# See https://github.com/cython/cython/issues/5589
+from builtins import BaseException
+
+from libc.stddef cimport size_t
+from libc.stdint cimport int8_t, int64_t
+from libcpp cimport bool
+from libcpp.optional cimport optional
+from libcpp.pair cimport pair
+from libcpp.string cimport string
+
+from rmm.librmm.cuda_stream_view cimport cuda_stream_view
+
+
+cdef extern from "rmm/mr/device/device_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass device_memory_resource:
+        void* allocate(size_t bytes) except +
+        void* allocate(size_t bytes, cuda_stream_view stream) except +
+        void deallocate(void* ptr, size_t bytes) except +
+        void deallocate(
+            void* ptr,
+            size_t bytes,
+            cuda_stream_view stream
+        ) except +
+
+cdef extern from "rmm/cuda_device.hpp" namespace "rmm" nogil:
+    size_t percent_of_free_device_memory(int percent) except +
+    pair[size_t, size_t] available_device_memory() except +
+
+
+# Transparent handle of a C++ exception
+ctypedef pair[int, string] CppExcept
+
+cdef inline CppExcept translate_python_except_to_cpp(err: BaseException) noexcept:
+    """Translate a Python exception into a C++ exception handle
+
+    The returned exception handle can then be thrown by `throw_cpp_except()`,
+    which MUST be done without holding the GIL.
+
+    This is useful when C++ calls a Python function and needs to catch or
+    propagate exceptions.
+    """
+    if isinstance(err, MemoryError):
+        return CppExcept(0, str.encode(str(err)))
+    return CppExcept(-1, str.encode(str(err)))
+
+# Implementation of `throw_cpp_except()`, which throws a given `CppExcept`.
+# This function MUST be called without the GIL, otherwise the thrown C++
+# exceptions are translated back into Python exceptions.
+cdef extern from *:
+    """
+    #include <rmm/error.hpp>
+    #include <stdexcept>
+
+    void throw_cpp_except(std::pair<int, std::string> res) {
+        switch(res.first) {
+            case 0:
+                throw rmm::out_of_memory(res.second);
+            default:
+                throw std::runtime_error(res.second);
+        }
+    }
+    """
+    void throw_cpp_except(CppExcept) nogil
+
+
+cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass cuda_memory_resource(device_memory_resource):
+        cuda_memory_resource() except +
+
+cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass managed_memory_resource(device_memory_resource):
+        managed_memory_resource() except +
+
+cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass system_memory_resource(device_memory_resource):
+        system_memory_resource() except +
+
+cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass sam_headroom_memory_resource(device_memory_resource):
+        sam_headroom_memory_resource(size_t headroom) except +
+
+cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+
+    cdef cppclass cuda_async_memory_resource(device_memory_resource):
+        cuda_async_memory_resource(
+            optional[size_t] initial_pool_size,
+            optional[size_t] release_threshold,
+            optional[allocation_handle_type] export_handle_type) except +
+
+# TODO: when we adopt Cython 3.0 use enum class
+cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
+        namespace \
+        "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \
+        nogil:
+    enum allocation_handle_type \
+            "rmm::mr::cuda_async_memory_resource::allocation_handle_type":
+        none
+        posix_file_descriptor
+        win32
+        win32_kmt
+
+
+cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass pool_memory_resource[Upstream](device_memory_resource):
+        pool_memory_resource(
+            Upstream* upstream_mr,
+            size_t initial_pool_size,
+            optional[size_t] maximum_pool_size) except +
+        size_t pool_size()
+
+cdef extern from "rmm/mr/device/arena_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass arena_memory_resource[Upstream](device_memory_resource):
+        arena_memory_resource(
+            Upstream* upstream_mr,
+            optional[size_t] arena_size,
+            bool dump_log_on_failure
+        ) except +
+
+cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource):
+        fixed_size_memory_resource(
+            Upstream* upstream_mr,
+            size_t block_size,
+            size_t block_to_preallocate) except +
+
+cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    ctypedef void* (*allocate_callback_t)(size_t, cuda_stream_view, void*)
+    ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*)
+
+    cdef cppclass callback_memory_resource(device_memory_resource):
+        callback_memory_resource(
+            allocate_callback_t allocate_callback,
+            deallocate_callback_t deallocate_callback,
+            void* allocate_callback_arg,
+            void* deallocate_callback_arg
+        ) except +
+
+cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \
+        namespace "rmm::mr" nogil:
+    cdef cppclass binning_memory_resource[Upstream](device_memory_resource):
+        binning_memory_resource(Upstream* upstream_mr) except +
+        binning_memory_resource(
+            Upstream* upstream_mr,
+            int8_t min_size_exponent,
+            int8_t max_size_exponent) except +
+
+        void add_bin(size_t allocation_size)
except + + void add_bin( + size_t allocation_size, + device_memory_resource* bin_resource) except + + +cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): + limiting_resource_adaptor( + Upstream* upstream_mr, + size_t allocation_limit) except + + + size_t get_allocated_bytes() except + + size_t get_allocation_limit() except + + +cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): + logging_resource_adaptor( + Upstream* upstream_mr, + string filename) except + + + void flush() except + + +cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): + struct counter: + counter() + + int64_t value + int64_t peak + int64_t total + + statistics_resource_adaptor(Upstream* upstream_mr) except + + + counter get_bytes_counter() except + + counter get_allocations_counter() except + + pair[counter, counter] pop_counters() except + + pair[counter, counter] push_counters() except + + +cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): + tracking_resource_adaptor( + Upstream* upstream_mr, + bool capture_stacks) except + + + size_t get_allocated_bytes() except + + string get_outstanding_allocations_str() except + + void log_outstanding_allocations() except + + +cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + ctypedef bool (*failure_callback_t)(size_t, void*) + cdef cppclass failure_callback_resource_adaptor[Upstream]( + device_memory_resource + ): + failure_callback_resource_adaptor( + Upstream* upstream_mr, + failure_callback_t callback, + void* callback_arg + ) except + + +cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ + namespace "rmm::mr" nogil: + cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): + prefetch_resource_adaptor(Upstream* upstream_mr) except + diff --git a/python/rmm/rmm/librmm/per_device_resource.pxd b/python/rmm/rmm/librmm/per_device_resource.pxd new file mode 100644 index 000000000..63ee29056 --- /dev/null +++ b/python/rmm/rmm/librmm/per_device_resource.pxd @@ -0,0 +1,36 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
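Each resource adaptor declared in memory_resource.pxd above wraps an upstream resource and layers one behavior on top of it (allocation limits, logging, statistics, stack tracking, out-of-memory callbacks, prefetching). A hedged sketch of the failure-callback adaptor through the public Python API, mirroring the pattern this PR uses in its tests:

    import rmm

    def on_oom(nbytes: int) -> bool:
        # Returning True asks RMM to retry the allocation; False re-raises
        print(f"allocation of {nbytes} bytes failed")
        return False

    mr = rmm.mr.FailureCallbackResourceAdaptor(rmm.mr.CudaMemoryResource(), on_oom)
    rmm.mr.set_current_device_resource(mr)
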
+from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil: + cdef cppclass cuda_device_id: + ctypedef int value_type + + cuda_device_id(value_type id) + + value_type value() + +cdef extern from "rmm/mr/device/per_device_resource.hpp" \ + namespace "rmm::mr" nogil: + cdef device_memory_resource* set_current_device_resource( + device_memory_resource* new_mr + ) + cdef device_memory_resource* get_current_device_resource() + cdef device_memory_resource* set_per_device_resource( + cuda_device_id id, device_memory_resource* new_mr + ) + cdef device_memory_resource* get_per_device_resource ( + cuda_device_id id + ) diff --git a/python/rmm/rmm/mr.py b/python/rmm/rmm/mr.py index 6eb94da0f..82729271f 100644 --- a/python/rmm/rmm/mr.py +++ b/python/rmm/rmm/mr.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from rmm._lib.memory_resource import ( +from rmm.pylibrmm.memory_resource import ( + ArenaMemoryResource, BinningMemoryResource, CallbackMemoryResource, CudaAsyncMemoryResource, @@ -45,6 +46,7 @@ ) __all__ = [ + "ArenaMemoryResource", "BinningMemoryResource", "CallbackMemoryResource", "CudaAsyncMemoryResource", diff --git a/python/rmm/rmm/pylibrmm/CMakeLists.txt b/python/rmm/rmm/pylibrmm/CMakeLists.txt new file mode 100644 index 000000000..0e88f01bb --- /dev/null +++ b/python/rmm/rmm/pylibrmm/CMakeLists.txt @@ -0,0 +1,27 @@ +# ============================================================================= +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources device_buffer.pyx logger.pyx memory_resource.pyx cuda_stream.pyx helper.pyx) +set(linked_libraries rmm::rmm) + +# Build all of the Cython targets +rapids_cython_create_modules(SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" + CXX) + +# mark all symbols in these Cython targets "hidden" by default, so they won't collide with symbols +# loaded from other DSOs +foreach(_cython_target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_target_properties(${_cython_target} PROPERTIES C_VISIBILITY_PRESET hidden + CXX_VISIBILITY_PRESET hidden) +endforeach() diff --git a/python/rmm/rmm/pylibrmm/__init__.py b/python/rmm/rmm/pylibrmm/__init__.py new file mode 100644 index 000000000..0b8672ef6 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .device_buffer import DeviceBuffer diff --git a/python/rmm/rmm/pylibrmm/cuda_stream.pxd b/python/rmm/rmm/pylibrmm/cuda_stream.pxd new file mode 100644 index 000000000..dd38387c2 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/cuda_stream.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cimport cython +from cuda.ccudart cimport cudaStream_t +from libcpp cimport bool +from libcpp.memory cimport unique_ptr + +from rmm.librmm.cuda_stream cimport cuda_stream + + +@cython.final +cdef class CudaStream: + cdef unique_ptr[cuda_stream] c_obj + cdef cudaStream_t value(self) except * nogil + cdef bool is_valid(self) except * nogil diff --git a/python/rmm/rmm/_lib/cuda_stream.pyx b/python/rmm/rmm/pylibrmm/cuda_stream.pyx similarity index 91% rename from python/rmm/rmm/_lib/cuda_stream.pyx rename to python/rmm/rmm/pylibrmm/cuda_stream.pyx index 0861f0663..d6aa4edc7 100644 --- a/python/rmm/rmm/_lib/cuda_stream.pyx +++ b/python/rmm/rmm/pylibrmm/cuda_stream.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,8 @@ cimport cython from cuda.ccudart cimport cudaStream_t from libcpp cimport bool +from rmm.librmm.cuda_stream cimport cuda_stream + @cython.final cdef class CudaStream: diff --git a/python/rmm/rmm/pylibrmm/device_buffer.pxd b/python/rmm/rmm/pylibrmm/device_buffer.pxd new file mode 100644 index 000000000..a0d287423 --- /dev/null +++ b/python/rmm/rmm/pylibrmm/device_buffer.pxd @@ -0,0 +1,71 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
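pylibrmm's `DeviceBuffer`, declared next, is the user-facing owner of a C++ `device_buffer`. A quick usage sketch consistent with the doctests later in this diff:

    import rmm

    db = rmm.DeviceBuffer.to_device(b"abc")  # copy host bytes to device
    assert bytes(db) == b"abc"
    host = db.copy_to_host()                 # returns a uint8 numpy array
    db.resize(5)                             # existing contents are preserved
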
+ +from libc.stdint cimport uintptr_t +from libcpp.memory cimport unique_ptr + +from rmm._cuda.stream cimport Stream +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource + + +cdef class DeviceBuffer: + cdef unique_ptr[device_buffer] c_obj + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + # Holds a reference to the stream used by the underlying `device_buffer`. + # Ensures the stream does not get destroyed before this DeviceBuffer + cdef Stream stream + + @staticmethod + cdef DeviceBuffer c_from_unique_ptr( + unique_ptr[device_buffer] ptr, + Stream stream=*, + DeviceMemoryResource mr=*, + ) + + @staticmethod + cdef DeviceBuffer c_to_device(const unsigned char[::1] b, + Stream stream=*) except * + cpdef copy_to_host(self, ary=*, Stream stream=*) + cpdef copy_from_host(self, ary, Stream stream=*) + cpdef copy_from_device(self, cuda_ary, Stream stream=*) + cpdef bytes tobytes(self, Stream stream=*) + + cdef size_t c_size(self) except * + cpdef void reserve(self, size_t new_capacity, Stream stream=*) except * + cpdef void resize(self, size_t new_size, Stream stream=*) except * + cpdef size_t capacity(self) except * + cdef void* c_data(self) except * + + cdef device_buffer c_release(self) except * + +cpdef DeviceBuffer to_device(const unsigned char[::1] b, + Stream stream=*) +cpdef void copy_ptr_to_host(uintptr_t db, + unsigned char[::1] hb, + Stream stream=*) except * + +cpdef void copy_host_to_ptr(const unsigned char[::1] hb, + uintptr_t db, + Stream stream=*) except * + +cpdef void copy_device_to_ptr(uintptr_t d_src, + uintptr_t d_dst, + size_t count, + Stream stream=*) except * diff --git a/python/rmm/rmm/_lib/device_buffer.pyx b/python/rmm/rmm/pylibrmm/device_buffer.pyx similarity index 96% rename from python/rmm/rmm/_lib/device_buffer.pyx rename to python/rmm/rmm/pylibrmm/device_buffer.pyx index 94a4dc771..c2e95e845 100644 --- a/python/rmm/rmm/_lib/device_buffer.pyx +++ b/python/rmm/rmm/pylibrmm/device_buffer.pyx @@ -32,9 +32,16 @@ from cuda.ccudart cimport ( cudaStream_t, ) -from rmm._lib.memory_resource cimport ( +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.device_buffer cimport ( + cuda_device_id, + device_buffer, + get_current_cuda_device, + prefetch, +) +from rmm.librmm.memory_resource cimport device_memory_resource +from rmm.pylibrmm.memory_resource cimport ( DeviceMemoryResource, - device_memory_resource, get_current_device_resource, ) @@ -149,7 +156,7 @@ cdef class DeviceBuffer: device : optional The CUDA device to which to prefetch the memory for this buffer. Defaults to the current CUDA device. To prefetch to the CPU, pass - :py:attr:`~cuda.cudart.cudaCpuDeviceId` as the device. + :py:attr:`~cuda.bindings.runtime.cudaCpuDeviceId` as the device. stream : optional CUDA stream to use for prefetching. 
Defaults to self.stream
         """
@@ -394,7 +401,7 @@ cpdef DeviceBuffer to_device(const unsigned char[::1] b,
     Examples
     --------
     >>> import rmm
-    >>> db = rmm._lib.device_buffer.to_device(b"abc")
+    >>> db = rmm.pylibrmm.device_buffer.to_device(b"abc")
     >>> print(bytes(db))
     b'abc'
     """
@@ -460,7 +467,7 @@ cpdef void copy_ptr_to_host(uintptr_t db,
     >>> import rmm
     >>> db = rmm.DeviceBuffer.to_device(b"abc")
     >>> hb = bytearray(db.nbytes)
-    >>> rmm._lib.device_buffer.copy_ptr_to_host(db.ptr, hb)
+    >>> rmm.pylibrmm.device_buffer.copy_ptr_to_host(db.ptr, hb)
     >>> print(hb)
     bytearray(b'abc')
     """
@@ -502,7 +509,7 @@ cpdef void copy_host_to_ptr(const unsigned char[::1] hb,
     >>> import rmm
     >>> db = rmm.DeviceBuffer(size=10)
     >>> hb = b"abc"
-    >>> rmm._lib.device_buffer.copy_host_to_ptr(hb, db.ptr)
+    >>> rmm.pylibrmm.device_buffer.copy_host_to_ptr(hb, db.ptr)
     >>> hb = db.copy_to_host()
     >>> print(hb)
     array([97, 98, 99,  0,  0,  0,  0,  0,  0,  0], dtype=uint8)
@@ -541,7 +548,7 @@ cpdef void copy_device_to_ptr(uintptr_t d_src,
     >>> import rmm
     >>> db = rmm.DeviceBuffer(size=5)
     >>> db2 = rmm.DeviceBuffer.to_device(b"abc")
-    >>> rmm._lib.device_buffer.copy_device_to_ptr(db2.ptr, db.ptr, db2.size)
+    >>> rmm.pylibrmm.device_buffer.copy_device_to_ptr(db2.ptr, db.ptr, db2.size)
     >>> hb = db.copy_to_host()
     >>> hb
     array([97, 98, 99,  0,  0], dtype=uint8)
diff --git a/python/rmm/rmm/_lib/lib.pyx b/python/rmm/rmm/pylibrmm/helper.pxd
similarity index 86%
rename from python/rmm/rmm/_lib/lib.pyx
rename to python/rmm/rmm/pylibrmm/helper.pxd
index 46753baa3..8ca151c00 100644
--- a/python/rmm/rmm/_lib/lib.pyx
+++ b/python/rmm/rmm/pylibrmm/helper.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,3 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+
+cdef object parse_bytes(object s) except *
diff --git a/python/rmm/rmm/_lib/helper.pyx b/python/rmm/rmm/pylibrmm/helper.pyx
similarity index 100%
rename from python/rmm/rmm/_lib/helper.pyx
rename to python/rmm/rmm/pylibrmm/helper.pyx
diff --git a/python/rmm/rmm/_lib/logger.pyx b/python/rmm/rmm/pylibrmm/logger.pyx
similarity index 77%
rename from python/rmm/rmm/_lib/logger.pyx
rename to python/rmm/rmm/pylibrmm/logger.pyx
index 029bbdd79..119e1c92f 100644
--- a/python/rmm/rmm/_lib/logger.pyx
+++ b/python/rmm/rmm/pylibrmm/logger.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,58 +14,9 @@
 
 import warnings
 
-from libcpp cimport bool
-
-
-cdef extern from "spdlog/common.h" namespace "spdlog::level" nogil:
-    cpdef enum logging_level "spdlog::level::level_enum":
-        """
-        The debug logging level for RMM.
-
-        Debug logging prints messages to a log file. See
-        `Debug Logging <https://github.com/rapidsai/rmm#debug-logging>`_
-        for more information.
-
-        Valid levels, in decreasing order of verbosity, are TRACE, DEBUG,
-        INFO, WARN, ERR, CRITICAL, and OFF. Default is INFO.
-
-        Examples
-        --------
-        >>> import rmm
-        >>> rmm.logging_level.DEBUG
-        <logging_level.DEBUG: 1>
-        >>> rmm.logging_level.DEBUG.value
-        1
-        >>> rmm.logging_level.DEBUG.name
-        'DEBUG'
-
-        See Also
-        --------
-        set_logging_level : Set the debug logging level
-        get_logging_level : Get the current debug logging level
-        """
-        TRACE "spdlog::level::trace"
-        DEBUG "spdlog::level::debug"
-        INFO "spdlog::level::info"
-        WARN "spdlog::level::warn"
-        ERR "spdlog::level::err"
-        CRITICAL "spdlog::level::critical"
-        OFF "spdlog::level::off"
-
-
-cdef extern from "spdlog/spdlog.h" namespace "spdlog" nogil:
-    cdef cppclass spdlog_logger "spdlog::logger":
-        spdlog_logger() except +
-        void set_level(logging_level level)
-        logging_level level()
-        void flush() except +
-        void flush_on(logging_level level)
-        logging_level flush_level()
-        bool should_log(logging_level msg_level)
-
-
-cdef extern from "rmm/logger.hpp" namespace "rmm" nogil:
-    cdef spdlog_logger& logger() except +
+from rmm.librmm._logger cimport logger
+
+from rmm.librmm._logger import logging_level
 
 
 def _validate_level_type(level):
diff --git a/python/rmm/rmm/pylibrmm/memory_resource.pxd b/python/rmm/rmm/pylibrmm/memory_resource.pxd
new file mode 100644
index 000000000..d1e5610db
--- /dev/null
+++ b/python/rmm/rmm/pylibrmm/memory_resource.pxd
@@ -0,0 +1,86 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
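The Python-level resource classes declared in this new pxd, including the `ArenaMemoryResource` this PR adds, all follow one pattern: construct with an upstream resource, then install via `set_current_device_resource`. A sketch of the arena resource (the `"512MiB"` string form relies on the same `parse_bytes` helper used by `__cinit__` later in this diff; treat that spelling as an assumption):

    import rmm

    upstream = rmm.mr.CudaMemoryResource()
    # arena_size is optional; it defaults to half the available device memory
    mr = rmm.mr.ArenaMemoryResource(upstream, arena_size="512MiB")
    rmm.mr.set_current_device_resource(mr)
    buf = rmm.DeviceBuffer(size=1 << 20)  # now served from the arena
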
+ +from libcpp.memory cimport shared_ptr + +from rmm.librmm.memory_resource cimport device_memory_resource + + +cdef class DeviceMemoryResource: + cdef shared_ptr[device_memory_resource] c_obj + cdef device_memory_resource* get_mr(self) noexcept nogil + +cdef class UpstreamResourceAdaptor(DeviceMemoryResource): + cdef readonly DeviceMemoryResource upstream_mr + + cpdef DeviceMemoryResource get_upstream(self) + +cdef class ArenaMemoryResource(UpstreamResourceAdaptor): + pass + +cdef class CudaMemoryResource(DeviceMemoryResource): + pass + +cdef class ManagedMemoryResource(DeviceMemoryResource): + pass + +cdef class SystemMemoryResource(DeviceMemoryResource): + pass + +cdef class SamHeadroomMemoryResource(DeviceMemoryResource): + pass + +cdef class CudaAsyncMemoryResource(DeviceMemoryResource): + pass + +cdef class PoolMemoryResource(UpstreamResourceAdaptor): + pass + +cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor): + pass + +cdef class BinningMemoryResource(UpstreamResourceAdaptor): + + cdef readonly list _bin_mrs + + cpdef add_bin( + self, + size_t allocation_size, + DeviceMemoryResource bin_resource=*) + +cdef class CallbackMemoryResource(DeviceMemoryResource): + cdef object _allocate_func + cdef object _deallocate_func + +cdef class LimitingResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class LoggingResourceAdaptor(UpstreamResourceAdaptor): + cdef object _log_file_name + cpdef get_file_name(self) + cpdef flush(self) + +cdef class StatisticsResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class TrackingResourceAdaptor(UpstreamResourceAdaptor): + pass + +cdef class FailureCallbackResourceAdaptor(UpstreamResourceAdaptor): + cdef object _callback + +cdef class PrefetchResourceAdaptor(UpstreamResourceAdaptor): + pass + +cpdef DeviceMemoryResource get_current_device_resource() diff --git a/python/rmm/rmm/_lib/memory_resource.pyx b/python/rmm/rmm/pylibrmm/memory_resource.pyx similarity index 82% rename from python/rmm/rmm/_lib/memory_resource.pyx rename to python/rmm/rmm/pylibrmm/memory_resource.pyx index 231253e3f..b41890fca 100644 --- a/python/rmm/rmm/_lib/memory_resource.pyx +++ b/python/rmm/rmm/pylibrmm/memory_resource.pyx @@ -22,12 +22,11 @@ from collections import defaultdict cimport cython from cython.operator cimport dereference as deref from libc.stddef cimport size_t -from libc.stdint cimport int8_t, int64_t, uintptr_t +from libc.stdint cimport int8_t, uintptr_t from libcpp cimport bool from libcpp.memory cimport make_unique, unique_ptr from libcpp.optional cimport optional from libcpp.pair cimport pair -from libcpp.string cimport string from cuda.cudart import cudaError_t @@ -37,206 +36,44 @@ from rmm._cuda.stream cimport Stream from rmm._cuda.stream import DEFAULT_STREAM -from rmm._lib.cuda_stream_view cimport cuda_stream_view -from rmm._lib.helper cimport parse_bytes -from rmm._lib.memory_resource cimport ( - available_device_memory as c_available_device_memory, - percent_of_free_device_memory as c_percent_of_free_device_memory, -) -from rmm._lib.per_device_resource cimport ( +from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from rmm.librmm.per_device_resource cimport ( cuda_device_id, set_per_device_resource as cpp_set_per_device_resource, ) +from rmm.pylibrmm.helper cimport parse_bytes from rmm.statistics import Statistics -# Transparent handle of a C++ exception -ctypedef pair[int, string] CppExcept - -cdef CppExcept translate_python_except_to_cpp(err: BaseException) noexcept: - """Translate a Python exception into a C++ exception handle 
-
-    The returned exception handle can then be thrown by `throw_cpp_except()`,
-    which MUST be done without holding the GIL.
-
-    This is useful when C++ calls a Python function and needs to catch or
-    propagate exceptions.
-    """
-    if isinstance(err, MemoryError):
-        return CppExcept(0, str.encode(str(err)))
-    return CppExcept(-1, str.encode(str(err)))
-
-# Implementation of `throw_cpp_except()`, which throws a given `CppExcept`.
-# This function MUST be called without the GIL otherwise the thrown C++
-# exception are translated back into a Python exception.
-cdef extern from *:
-    """
-    #include <rmm/error.hpp>
-    #include <stdexcept>
-
-    void throw_cpp_except(std::pair<int, std::string> res) {
-        switch(res.first) {
-            case 0:
-                throw rmm::out_of_memory(res.second);
-            default:
-                throw std::runtime_error(res.second);
-        }
-    }
-    """
-    void throw_cpp_except(CppExcept) nogil
-
-
-# NOTE: Keep extern declarations in .pyx file as much as possible to avoid
-# leaking dependencies when importing RMM Cython .pxd files
-cdef extern from "rmm/mr/device/cuda_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass cuda_memory_resource(device_memory_resource):
-        cuda_memory_resource() except +
-
-cdef extern from "rmm/mr/device/managed_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass managed_memory_resource(device_memory_resource):
-        managed_memory_resource() except +
-
-cdef extern from "rmm/mr/device/system_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass system_memory_resource(device_memory_resource):
-        system_memory_resource() except +
-
-cdef extern from "rmm/mr/device/sam_headroom_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass sam_headroom_memory_resource(device_memory_resource):
-        sam_headroom_memory_resource(size_t headroom) except +
-
-cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-
-    cdef cppclass cuda_async_memory_resource(device_memory_resource):
-        cuda_async_memory_resource(
-            optional[size_t] initial_pool_size,
-            optional[size_t] release_threshold,
-            optional[allocation_handle_type] export_handle_type) except +
-
-# TODO: when we adopt Cython 3.0 use enum class
-cdef extern from "rmm/mr/device/cuda_async_memory_resource.hpp" \
-        namespace \
-        "rmm::mr::cuda_async_memory_resource::allocation_handle_type" \
-        nogil:
-    enum allocation_handle_type \
-            "rmm::mr::cuda_async_memory_resource::allocation_handle_type":
-        none
-        posix_file_descriptor
-        win32
-        win32_kmt
-
-
-cdef extern from "rmm/mr/device/pool_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass pool_memory_resource[Upstream](device_memory_resource):
-        pool_memory_resource(
-            Upstream* upstream_mr,
-            size_t initial_pool_size,
-            optional[size_t] maximum_pool_size) except +
-        size_t pool_size()
-
-cdef extern from "rmm/mr/device/fixed_size_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    cdef cppclass fixed_size_memory_resource[Upstream](device_memory_resource):
-        fixed_size_memory_resource(
-            Upstream* upstream_mr,
-            size_t block_size,
-            size_t block_to_preallocate) except +
-
-cdef extern from "rmm/mr/device/callback_memory_resource.hpp" \
-        namespace "rmm::mr" nogil:
-    ctypedef void* (*allocate_callback_t)(size_t, cuda_stream_view, void*)
-    ctypedef void (*deallocate_callback_t)(void*, size_t, cuda_stream_view, void*)
-
-    cdef cppclass callback_memory_resource(device_memory_resource):
-        callback_memory_resource(
-            allocate_callback_t allocate_callback,
-            deallocate_callback_t deallocate_callback,
-            void* allocate_callback_arg,
-            void*
deallocate_callback_arg - ) except + - -cdef extern from "rmm/mr/device/binning_memory_resource.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass binning_memory_resource[Upstream](device_memory_resource): - binning_memory_resource(Upstream* upstream_mr) except + - binning_memory_resource( - Upstream* upstream_mr, - int8_t min_size_exponent, - int8_t max_size_exponent) except + - - void add_bin(size_t allocation_size) except + - void add_bin( - size_t allocation_size, - device_memory_resource* bin_resource) except + - -cdef extern from "rmm/mr/device/limiting_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass limiting_resource_adaptor[Upstream](device_memory_resource): - limiting_resource_adaptor( - Upstream* upstream_mr, - size_t allocation_limit) except + - - size_t get_allocated_bytes() except + - size_t get_allocation_limit() except + - -cdef extern from "rmm/mr/device/logging_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass logging_resource_adaptor[Upstream](device_memory_resource): - logging_resource_adaptor( - Upstream* upstream_mr, - string filename) except + - - void flush() except + - -cdef extern from "rmm/mr/device/statistics_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass statistics_resource_adaptor[Upstream](device_memory_resource): - struct counter: - counter() - - int64_t value - int64_t peak - int64_t total - - statistics_resource_adaptor(Upstream* upstream_mr) except + - - counter get_bytes_counter() except + - counter get_allocations_counter() except + - pair[counter, counter] pop_counters() except + - pair[counter, counter] push_counters() except + - -cdef extern from "rmm/mr/device/tracking_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass tracking_resource_adaptor[Upstream](device_memory_resource): - tracking_resource_adaptor( - Upstream* upstream_mr, - bool capture_stacks) except + - - size_t get_allocated_bytes() except + - string get_outstanding_allocations_str() except + - void log_outstanding_allocations() except + - -cdef extern from "rmm/mr/device/failure_callback_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - ctypedef bool (*failure_callback_t)(size_t, void*) - cdef cppclass failure_callback_resource_adaptor[Upstream]( - device_memory_resource - ): - failure_callback_resource_adaptor( - Upstream* upstream_mr, - failure_callback_t callback, - void* callback_arg - ) except + - -cdef extern from "rmm/mr/device/prefetch_resource_adaptor.hpp" \ - namespace "rmm::mr" nogil: - cdef cppclass prefetch_resource_adaptor[Upstream](device_memory_resource): - prefetch_resource_adaptor(Upstream* upstream_mr) except + +from rmm.librmm.memory_resource cimport ( + CppExcept, + allocate_callback_t, + allocation_handle_type, + arena_memory_resource, + available_device_memory as c_available_device_memory, + binning_memory_resource, + callback_memory_resource, + cuda_async_memory_resource, + cuda_memory_resource, + deallocate_callback_t, + device_memory_resource, + failure_callback_resource_adaptor, + failure_callback_t, + fixed_size_memory_resource, + limiting_resource_adaptor, + logging_resource_adaptor, + managed_memory_resource, + percent_of_free_device_memory as c_percent_of_free_device_memory, + pool_memory_resource, + posix_file_descriptor, + prefetch_resource_adaptor, + sam_headroom_memory_resource, + statistics_resource_adaptor, + system_memory_resource, + throw_cpp_except, + tracking_resource_adaptor, + translate_python_except_to_cpp, +) cdef class DeviceMemoryResource: @@ -474,6 
+311,48 @@ cdef class PoolMemoryResource(UpstreamResourceAdaptor):
         )
         return c_mr.pool_size()
+
+cdef class ArenaMemoryResource(UpstreamResourceAdaptor):
+    def __cinit__(
+        self, DeviceMemoryResource upstream_mr,
+        arena_size=None,
+        dump_log_on_failure=False
+    ):
+        cdef optional[size_t] c_arena_size = (
+            optional[size_t]() if
+            arena_size is None
+            else optional[size_t](<size_t> parse_bytes(arena_size))
+        )
+        self.c_obj.reset(
+            new arena_memory_resource[device_memory_resource](
+                upstream_mr.get_mr(),
+                c_arena_size,
+                dump_log_on_failure,
+            )
+        )
+
+    def __init__(
+        self,
+        DeviceMemoryResource upstream_mr,
+        object arena_size=None,
+        bool dump_log_on_failure=False
+    ):
+        """
+        A suballocator that emphasizes fragmentation avoidance and scalable concurrency
+        support.
+
+        Parameters
+        ----------
+        upstream_mr : DeviceMemoryResource
+            The DeviceMemoryResource from which to allocate memory for arenas.
+        arena_size : int, optional
+            Size in bytes of the global arena. Defaults to half of the available memory
+            on the current device.
+        dump_log_on_failure : bool, optional
+            Whether to dump the arena on allocation failure.
+        """
+        pass
+
+
 cdef class FixedSizeMemoryResource(UpstreamResourceAdaptor):
     def __cinit__(
         self,
diff --git a/python/rmm/rmm/_lib/tests/__init__.py b/python/rmm/rmm/pylibrmm/tests/__init__.py
similarity index 100%
rename from python/rmm/rmm/_lib/tests/__init__.py
rename to python/rmm/rmm/pylibrmm/tests/__init__.py
diff --git a/python/rmm/rmm/_lib/tests/test_device_buffer.pyx b/python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx
similarity index 83%
rename from python/rmm/rmm/_lib/tests/test_device_buffer.pyx
rename to python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx
index 733383827..ec2ff4def 100644
--- a/python/rmm/rmm/_lib/tests/test_device_buffer.pyx
+++ b/python/rmm/rmm/pylibrmm/tests/test_device_buffer.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,8 +16,9 @@ import numpy as np
 
 from libcpp.memory cimport make_unique
 
-from rmm._lib.cuda_stream_view cimport cuda_stream_default
-from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+from rmm.librmm.cuda_stream_view cimport cuda_stream_default
+from rmm.librmm.device_buffer cimport device_buffer
+from rmm.pylibrmm.device_buffer cimport DeviceBuffer
 
 
 def test_release():
diff --git a/python/rmm/rmm/statistics.py b/python/rmm/rmm/statistics.py
index 279e45dc6..2dabedce6 100644
--- a/python/rmm/rmm/statistics.py
+++ b/python/rmm/rmm/statistics.py
@@ -74,8 +74,8 @@ def enable_statistics() -> None:
 def get_statistics() -> Optional[Statistics]:
     """Get the current allocation statistics.
 
-    Return
-    ------
+    Returns
+    -------
     If enabled, returns the current tracked statistics.
     If disabled, returns None.
     """
@@ -94,8 +94,8 @@ def push_statistics() -> Optional[Statistics]:
     If statistics are disabled (the current memory resource is not an
     instance of StatisticsResourceAdaptor), this function is a no-op.
 
-    Return
-    ------
+    Returns
+    -------
     If enabled, returns the current tracked statistics _before_ the pop.
     If disabled, returns None.
     """
@@ -114,8 +114,8 @@ def pop_statistics() -> Optional[Statistics]:
     If statistics are disabled (the current memory resource is not an
     instance of StatisticsResourceAdaptor), this function is a no-op.
 
-    Return
-    ------
+    Returns
+    -------
     If enabled, returns the popped counters.
If disabled, returns None. """ @@ -232,8 +232,8 @@ def report( ordered_by Sort the statistics by this attribute. - Return - ------ + Returns + ------- The pretty formatted string of the memory statistics """ @@ -279,8 +279,8 @@ def _get_descriptive_name_of_object(obj: object) -> str: obj Object in question - Return - ------ + Returns + ------- A string including filename, line number, and object name. """ diff --git a/python/rmm/rmm/tests/test_cython.py b/python/rmm/rmm/tests/test_cython.py index 82eba2451..5df933435 100644 --- a/python/rmm/rmm/tests/test_cython.py +++ b/python/rmm/rmm/tests/test_cython.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ def wrapped(*args, **kwargs): return wrapped -cython_test_modules = ["rmm._lib.tests.test_device_buffer"] +cython_test_modules = ["rmm.pylibrmm.tests.test_device_buffer"] for mod in cython_test_modules: diff --git a/python/rmm/rmm/tests/test_rmm.py b/python/rmm/rmm/tests/test_rmm.py index c88d21b38..182434dc5 100644 --- a/python/rmm/rmm/tests/test_rmm.py +++ b/python/rmm/rmm/tests/test_rmm.py @@ -32,12 +32,6 @@ cuda.set_memory_manager(RMMNumbaManager) -_driver_version = rmm._cuda.gpu.driverGetVersion() -_runtime_version = rmm._cuda.gpu.runtimeGetVersion() -_CUDAMALLOC_ASYNC_SUPPORTED = (_driver_version >= 11020) and ( - _runtime_version >= 11020 -) - _SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute( cudart.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess, rmm._cuda.gpu.getDevice(), @@ -354,7 +348,7 @@ def test_rmm_pool_numba_stream(stream): rmm.reinitialize(pool_allocator=True) stream = rmm._cuda.stream.Stream(stream) - a = rmm._lib.device_buffer.DeviceBuffer(size=3, stream=stream) + a = rmm.pylibrmm.device_buffer.DeviceBuffer(size=3, stream=stream) assert a.size == 3 assert a.ptr != 0 @@ -505,6 +499,28 @@ def test_binning_memory_resource(dtype, nelem, alloc, upstream_mr): array_tester(dtype, nelem, alloc) +@pytest.mark.parametrize("dtype", _dtypes) +@pytest.mark.parametrize("nelem", _nelems) +@pytest.mark.parametrize("alloc", _allocs) +@pytest.mark.parametrize( + "upstream_mr", + [ + lambda: rmm.mr.CudaMemoryResource(), + lambda: rmm.mr.ManagedMemoryResource(), + lambda: rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), 1 << 20 + ), + ], +) +def test_arena_memory_resource(dtype, nelem, alloc, upstream_mr): + upstream = upstream_mr() + mr = rmm.mr.ArenaMemoryResource(upstream) + + rmm.mr.set_current_device_resource(mr) + assert rmm.mr.get_current_device_resource_type() is type(mr) + array_tester(dtype, nelem, alloc) + + def test_reinitialize_max_pool_size(): rmm.reinitialize( pool_allocator=True, initial_pool_size=0, maximum_pool_size="8MiB" @@ -635,10 +651,6 @@ def test_mr_upstream_lifetime(): del pool_mr -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("dtype", _dtypes) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) @@ -649,15 +661,11 @@ def test_cuda_async_memory_resource(dtype, nelem, alloc): array_tester(dtype, nelem, alloc) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) def test_cuda_async_memory_resource_ipc(): # TODO: We don't have a great way to check if IPC is supported in Python, # without using the C++ function - # 
rmm::detail::async_alloc::is_export_handle_type_supported. We can't - # accurately test driver and runtime versions for this via Python because + # rmm::detail::runtime_async_alloc::is_export_handle_type_supported. + # We can't accurately test this via Python because # cuda-python always has the IPC handle enum defined (which normally # requires a CUDA 11.3 runtime) and the cuda-compat package in Docker # containers prevents us from assuming that the driver we see actually @@ -680,10 +688,6 @@ def test_cuda_async_memory_resource_ipc(): assert rmm.mr.get_current_device_resource_type() is type(mr) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("nelems", _nelems) def test_cuda_async_memory_resource_stream(nelems): # test that using CudaAsyncMemoryResource @@ -697,10 +701,6 @@ def test_cuda_async_memory_resource_stream(nelems): np.testing.assert_equal(expected, result) -@pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", -) @pytest.mark.parametrize("nelem", _nelems) @pytest.mark.parametrize("alloc", _allocs) def test_cuda_async_memory_resource_threshold(nelem, alloc): @@ -717,13 +717,7 @@ def test_cuda_async_memory_resource_threshold(nelem, alloc): "mr", [ rmm.mr.CudaMemoryResource, - pytest.param( - rmm.mr.CudaAsyncMemoryResource, - marks=pytest.mark.skipif( - not _CUDAMALLOC_ASYNC_SUPPORTED, - reason="cudaMallocAsync not supported", - ), - ), + pytest.param(rmm.mr.CudaAsyncMemoryResource), ], ) def test_limiting_resource_adaptor(mr): @@ -801,10 +795,28 @@ def callback(nbytes: int) -> bool: rmm.mr.set_current_device_resource(mr) with pytest.raises(MemoryError): - rmm.DeviceBuffer(size=int(1e11)) + from rmm.mr import available_device_memory + + total_memory = available_device_memory()[1] + rmm.DeviceBuffer(size=total_memory * 2) assert retried[0] +def test_failure_callback_resource_adaptor_error(): + def callback(nbytes: int) -> bool: + raise RuntimeError("MyError") + + cuda_mr = rmm.mr.CudaMemoryResource() + mr = rmm.mr.FailureCallbackResourceAdaptor(cuda_mr, callback) + rmm.mr.set_current_device_resource(mr) + + with pytest.raises(RuntimeError, match="MyError"): + from rmm.mr import available_device_memory + + total_memory = available_device_memory()[1] + rmm.DeviceBuffer(size=total_memory * 2) + + @pytest.mark.parametrize("managed", [True, False]) def test_prefetch_resource_adaptor(managed): if managed: @@ -829,18 +841,6 @@ def test_prefetch_resource_adaptor(managed): assert_prefetched(db, device) -def test_failure_callback_resource_adaptor_error(): - def callback(nbytes: int) -> bool: - raise RuntimeError("MyError") - - cuda_mr = rmm.mr.CudaMemoryResource() - mr = rmm.mr.FailureCallbackResourceAdaptor(cuda_mr, callback) - rmm.mr.set_current_device_resource(mr) - - with pytest.raises(RuntimeError, match="MyError"): - rmm.DeviceBuffer(size=int(1e11)) - - def test_dev_buf_circle_ref_dealloc(): # This test creates a reference cycle containing a `DeviceBuffer` # and ensures that the garbage collector does not clear it, i.e., @@ -1076,3 +1076,9 @@ def test_available_device_memory(): assert initial_memory[1] == final_memory[1] assert initial_memory[0] > 0 assert final_memory[0] > 0 + + +# TODO: Remove test when rmm._lib is removed in 25.02 +def test_deprecate_rmm_lib(): + with pytest.warns(FutureWarning): + rmm._lib.device_buffer.DeviceBuffer(size=100) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ea1af58cd..476028af0 100644 --- a/tests/CMakeLists.txt +++ 
b/tests/CMakeLists.txt
@@ -19,13 +19,16 @@ option(CODE_COVERAGE "Enable generating code coverage with gcov." OFF)
 
 include(rapids-test)
 rapids_test_init()
 
+# Ensure tests are using the conda env, so they have the correct release/debug compile flags
+rapids_cmake_support_conda_env(conda_env)
+
 # This function takes in a test name and test source and handles setting all of the associated
 # properties and linking to build the test
 function(ConfigureTestInternal TEST_NAME)
   add_executable(${TEST_NAME} ${ARGN})
   target_include_directories(${TEST_NAME} PRIVATE "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>")
   target_link_libraries(${TEST_NAME} GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main
-                        pthread rmm)
+                        pthread rmm $<TARGET_NAME_IF_EXISTS:conda_env>)
   set_target_properties(
     ${TEST_NAME}
     PROPERTIES POSITION_INDEPENDENT_CODE ON
@@ -38,9 +41,7 @@ function(ConfigureTestInternal TEST_NAME)
              CUDA_STANDARD_REQUIRED ON)
 
   target_compile_definitions(${TEST_NAME} PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}")
-  target_compile_options(${TEST_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-Wall -Werror
-                                             -Wno-error=deprecated-declarations>)
-  target_compile_options(${TEST_NAME} PUBLIC "$<$<CONFIG:Debug>:-O0>")
+  target_compile_options(${TEST_NAME} PUBLIC $<$<COMPILE_LANGUAGE:CXX>:-Wall -Werror>)
 
   if(DISABLE_DEPRECATION_WARNING)
     target_compile_options(
@@ -83,7 +84,7 @@ endfunction()
 
 function(ConfigureTest TEST_NAME)
   set(options)
-  set(one_value GPUS PERCENT)
+  set(one_value CUDART GPUS PERCENT)
   set(multi_value)
   cmake_parse_arguments(_RMM_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN})
   if(NOT DEFINED _RMM_TEST_GPUS AND NOT DEFINED _RMM_TEST_PERCENT)
@@ -97,13 +98,23 @@ function(ConfigureTest TEST_NAME)
     set(_RMM_TEST_PERCENT 100)
   endif()
 
+  if(_RMM_TEST_CUDART STREQUAL SHARED)
+    set(cudart_link_libs $<COMPILE_ONLY:rmm> CUDA::cudart)
+  elseif(_RMM_TEST_CUDART STREQUAL STATIC)
+    set(cudart_link_libs $<COMPILE_ONLY:rmm> CUDA::cudart_static)
+  else()
+    set(cudart_link_libs rmm)
+  endif()
+
   # Test with legacy default stream.
   ConfigureTestInternal(${TEST_NAME} ${_RMM_TEST_UNPARSED_ARGUMENTS})
+  target_link_libraries(${TEST_NAME} ${cudart_link_libs})
 
   # Test with per-thread default stream.
   string(REGEX REPLACE "_TEST$" "_PTDS_TEST" PTDS_TEST_NAME "${TEST_NAME}")
   ConfigureTestInternal("${PTDS_TEST_NAME}" ${_RMM_TEST_UNPARSED_ARGUMENTS})
   target_compile_definitions("${PTDS_TEST_NAME}" PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM)
+  target_link_libraries(${PTDS_TEST_NAME} ${cudart_link_libs})
 
   foreach(name ${TEST_NAME} ${PTDS_TEST_NAME} ${NS_TEST_NAME})
     rapids_test_add(
@@ -129,7 +140,10 @@ ConfigureTest(ADAPTOR_TEST mr/device/adaptor_tests.cpp)
 ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp GPUS 1 PERCENT 100)
 
 # cuda_async mr tests
-ConfigureTest(CUDA_ASYNC_MR_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60)
+ConfigureTest(CUDA_ASYNC_MR_STATIC_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60
+              CUDART STATIC)
+ConfigureTest(CUDA_ASYNC_MR_SHARED_CUDART_TEST mr/device/cuda_async_mr_tests.cpp GPUS 1 PERCENT 60
+              CUDART SHARED)
 
 # thrust allocator tests
 ConfigureTest(THRUST_ALLOCATOR_TEST mr/device/thrust_allocator_tests.cu GPUS 1 PERCENT 60)
diff --git a/tests/logger_tests.cpp b/tests/logger_tests.cpp
index 643281d91..8a5d37be2 100644
--- a/tests/logger_tests.cpp
+++ b/tests/logger_tests.cpp
@@ -112,7 +112,6 @@ void expect_log_events(std::string const& filename,
     // EXPECT_EQ(expected.thread_id, actual.thread_id);
     // EXPECT_EQ(expected.stream, actual.stream);
     EXPECT_EQ(expected.act, actual.act);
-    // device_memory_resource automatically pads an allocation to a multiple of 8 bytes
     EXPECT_EQ(expected.size, actual.size);
     EXPECT_EQ(expected.pointer, actual.pointer);
     return true;
diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp
index bdc0f2438..67f183a23 100644
--- a/tests/mr/device/arena_mr_tests.cpp
+++ b/tests/mr/device/arena_mr_tests.cpp
@@ -491,9 +491,7 @@ TEST_F(ArenaTest, SizeSmallerThanSuperblockSize)  // NOLINT
 TEST_F(ArenaTest, AllocateNinetyPercent)  // NOLINT
 {
   EXPECT_NO_THROW([]() {  // NOLINT(cppcoreguidelines-avoid-goto)
-    auto const free = rmm::available_device_memory().first;
-    auto const ninety_percent = rmm::align_up(
-      static_cast<std::size_t>(static_cast<double>(free) * 0.9), rmm::CUDA_ALLOCATION_ALIGNMENT);
+    auto const ninety_percent = rmm::percent_of_free_device_memory(90);
     arena_mr mr(rmm::mr::get_current_device_resource_ref(), ninety_percent);
   }());
 }
@@ -576,10 +574,10 @@ TEST_F(ArenaTest, DumpLogOnFailure)  // NOLINT
   std::size_t num_threads{4};
   threads.reserve(num_threads);
   for (std::size_t i = 0; i < num_threads; ++i) {
-    threads.emplace_back(std::thread([&] {
+    threads.emplace_back([&] {
       void* ptr = mr.allocate(32_KiB);
       mr.deallocate(ptr, 32_KiB);
-    }));
+    });
   }
 
   for (auto& thread : threads) {
diff --git a/tests/mr/device/callback_mr_tests.cpp b/tests/mr/device/callback_mr_tests.cpp
index a56efa60c..a7f6ab7be 100644
--- a/tests/mr/device/callback_mr_tests.cpp
+++ b/tests/mr/device/callback_mr_tests.cpp
@@ -23,11 +23,11 @@
 #include
 #include
-#include <fmt/core.h>
 #include
 #include
 #include
+#include <string>
 
 namespace rmm::test {
 namespace {
@@ -78,8 +78,9 @@ TEST(CallbackTest, LoggingTest)
   auto* ptr = mr.allocate(size);
   mr.deallocate(ptr, size);
 
-  std::string output = testing::internal::GetCapturedStdout();
-  std::string expect = fmt::format("Allocating {} bytes\nDeallocating {} bytes\n", size, size);
+  auto output = testing::internal::GetCapturedStdout();
+  auto expect = std::string("Allocating ") + std::to_string(size) + " bytes\nDeallocating " +
+                std::to_string(size) + " bytes\n";
   ASSERT_EQ(expect, output);
 }
diff --git a/tests/mr/device/cuda_async_mr_tests.cpp b/tests/mr/device/cuda_async_mr_tests.cpp
index 90c7b0ff9..a39188548 100644
--- a/tests/mr/device/cuda_async_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,24 +31,13 @@ class AsyncMRTest : public ::testing::Test {
  protected:
  void SetUp() override
  {
-    if (!rmm::detail::async_alloc::is_supported()) {
+    if (!rmm::detail::runtime_async_alloc::is_supported()) {
       GTEST_SKIP() << "Skipping tests since cudaMallocAsync not supported with this CUDA "
                    << "driver/runtime version";
     }
   }
 };
 
-TEST_F(AsyncMRTest, ThrowIfNotSupported)
-{
-  auto construct_mr = []() { cuda_async_mr mr; };
-#ifndef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-  EXPECT_THROW(construct_mr(), rmm::logic_error);
-#else
-  EXPECT_NO_THROW(construct_mr());
-#endif
-}
-
-#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
 TEST_F(AsyncMRTest, ExplicitInitialPoolSize)
 {
   const auto pool_init_size{100};
@@ -77,7 +66,5 @@ TEST_F(AsyncMRTest, DifferentPoolsUnequal)
   EXPECT_FALSE(mr1.is_equal(mr2));
 }
 
-#endif
-
 }  // namespace
 }  // namespace rmm::test
diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp
index fe82431a9..f3a02cbf0 100644
--- a/tests/mr/device/cuda_async_view_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_view_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,13 +29,10 @@ using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource;
 static_assert(cuda::mr::resource_with<cuda_async_view_mr, cuda::mr::device_accessible>);
 static_assert(cuda::mr::async_resource_with<cuda_async_view_mr, cuda::mr::device_accessible>);
 
-#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
-
 TEST(PoolTest, UsePool)
 {
   cudaMemPool_t memPool{};
-  RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
-    &memPool, rmm::get_current_cuda_device().value()));
+  RMM_CUDA_TRY(cudaDeviceGetDefaultMemPool(&memPool, rmm::get_current_cuda_device().value()));
 
   const auto pool_init_size{100};
   cuda_async_view_mr mr{memPool};
@@ -53,7 +50,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
 
   cudaMemPool_t memPool{};
 
-  RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&memPool, &poolProps));
+  RMM_CUDA_TRY(cudaMemPoolCreate(&memPool, &poolProps));
 
   {
     const auto pool_init_size{100};
@@ -64,7 +61,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
   }
 
   auto destroy_valid_pool = [&]() {
-    auto result = rmm::detail::async_alloc::cudaMemPoolDestroy(memPool);
+    auto result = cudaMemPoolDestroy(memPool);
     RMM_EXPECTS(result == cudaSuccess, "Pool wrapper did destroy pool");
   };
 
@@ -81,7 +78,5 @@ TEST(PoolTest, ThrowIfNullptrPool)
   EXPECT_THROW(construct_mr(), rmm::logic_error);
 }
 
-#endif
-
 }  // namespace
 }  // namespace rmm::test
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index 7d749efd1..9e7c8c2e8 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -36,17 +36,11 @@ namespace {
 
 struct mr_ref_test_mt : public mr_ref_test {};
 
-INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests,
-                        mr_ref_test_mt,
-                        ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-                                          "CUDA_Async",
-#endif
-                                          "Managed",
-                                          "Pool",
-                                          "Arena",
-                                          "Binning"),
-                        [](auto const& info) { return info.param; });
+INSTANTIATE_TEST_CASE_P(
+  MultiThreadResourceTests,
diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp
index fe82431a9..f3a02cbf0 100644
--- a/tests/mr/device/cuda_async_view_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_view_mr_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,13 +29,10 @@ using cuda_async_view_mr = rmm::mr::cuda_async_view_memory_resource;
 static_assert(cuda::mr::resource_with<cuda_async_view_mr, cuda::mr::device_accessible>);
 static_assert(cuda::mr::async_resource_with<cuda_async_view_mr, cuda::mr::device_accessible>);
 
-#if defined(RMM_CUDA_MALLOC_ASYNC_SUPPORT)
-
 TEST(PoolTest, UsePool)
 {
   cudaMemPool_t memPool{};
-  RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
-    &memPool, rmm::get_current_cuda_device().value()));
+  RMM_CUDA_TRY(cudaDeviceGetDefaultMemPool(&memPool, rmm::get_current_cuda_device().value()));
 
   const auto pool_init_size{100};
   cuda_async_view_mr mr{memPool};
@@ -53,7 +50,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
 
   cudaMemPool_t memPool{};
-  RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&memPool, &poolProps));
+  RMM_CUDA_TRY(cudaMemPoolCreate(&memPool, &poolProps));
 
   {
     const auto pool_init_size{100};
@@ -64,7 +61,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
   }
 
   auto destroy_valid_pool = [&]() {
-    auto result = rmm::detail::async_alloc::cudaMemPoolDestroy(memPool);
+    auto result = cudaMemPoolDestroy(memPool);
     RMM_EXPECTS(result == cudaSuccess, "Pool wrapper did destroy pool");
   };
 
@@ -81,7 +78,5 @@ TEST(PoolTest, ThrowIfNullptrPool)
   EXPECT_THROW(construct_mr(), rmm::logic_error);
 }
 
-#endif
-
 }  // namespace
 }  // namespace rmm::test
diff --git a/tests/mr/device/mr_ref_multithreaded_tests.cpp b/tests/mr/device/mr_ref_multithreaded_tests.cpp
index 7d749efd1..9e7c8c2e8 100644
--- a/tests/mr/device/mr_ref_multithreaded_tests.cpp
+++ b/tests/mr/device/mr_ref_multithreaded_tests.cpp
@@ -36,17 +36,11 @@ namespace {
 
 struct mr_ref_test_mt : public mr_ref_test {};
 
-INSTANTIATE_TEST_CASE_P(MultiThreadResourceTests,
-                        mr_ref_test_mt,
-                        ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-                                          "CUDA_Async",
-#endif
-                                          "Managed",
-                                          "Pool",
-                                          "Arena",
-                                          "Binning"),
-                        [](auto const& info) { return info.param; });
+INSTANTIATE_TEST_CASE_P(
+  MultiThreadResourceTests,
+  mr_ref_test_mt,
+  ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"),
+  [](auto const& info) { return info.param; });
 
 template <typename Task, typename... Arguments>
 void spawn_n(std::size_t num_threads, Task task, Arguments&&... args)
@@ -109,8 +103,13 @@ TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRef_mt)
 {
   // single thread changes default resource, then multiple threads use it
   auto old = rmm::mr::set_current_device_resource_ref(this->ref);
+  test_get_current_device_resource_ref();
 
-  spawn([mr = this->ref]() {
+  int device;
+  RMM_CUDA_TRY(cudaGetDevice(&device));
+
+  spawn([device, mr = this->ref]() {
+    RMM_CUDA_TRY(cudaSetDevice(device));
     EXPECT_EQ(mr, rmm::mr::get_current_device_resource_ref());
     test_get_current_device_resource_ref();  // test allocating with the new default resource
   });
@@ -156,7 +155,17 @@ TEST_P(mr_ref_test_mt, SetCurrentDeviceResourceRefPerThread_mt)
   }
 }
 
-TEST_P(mr_ref_test_mt, Allocate) { spawn(test_various_allocations, this->ref); }
+TEST_P(mr_ref_test_mt, Allocate)
+{
+  int device;
+  RMM_CUDA_TRY(cudaGetDevice(&device));
+
+  auto mr = this->ref;
+  spawn([device, mr]() {
+    RMM_CUDA_TRY(cudaSetDevice(device));
+    test_various_allocations(mr);
+  });
+}
 
 TEST_P(mr_ref_test_mt, AllocateDefaultStream)
 {
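The `cudaGetDevice`/`cudaSetDevice` pairs added to the spawned lambdas above address a subtlety: the active CUDA device is per-thread state, so a freshly created worker thread starts on device 0 regardless of what the launching thread had selected. A minimal sketch of the pattern with a hypothetical helper (the tests' own `spawn` utility differs):

#include <cuda_runtime_api.h>
#include <thread>

// Run `work` on a fresh thread, first propagating the caller's active device.
// Each host thread carries its own current-device setting, initially device 0.
template <typename Work>
void run_on_same_device(Work work)
{
  int device = 0;
  cudaGetDevice(&device);   // capture the launching thread's device
  std::thread thread([device, work]() {
    cudaSetDevice(device);  // without this, the worker would use device 0
    work();
  });
  thread.join();
}

Without the `cudaSetDevice` call, a multi-GPU machine could end up allocating from a per-device resource while a different device is current, which is the failure mode the updated tests guard against.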
diff --git a/tests/mr/device/mr_ref_test.hpp b/tests/mr/device/mr_ref_test.hpp
index 6e63b3838..2af0eff44 100644
--- a/tests/mr/device/mr_ref_test.hpp
+++ b/tests/mr/device/mr_ref_test.hpp
@@ -347,7 +347,7 @@ inline auto make_host_pinned() { return std::make_shared<rmm::mr::pinned_host_memory_resource>(); }
 
 inline auto make_cuda_async()
 {
-  if (rmm::detail::async_alloc::is_supported()) {
+  if (rmm::detail::runtime_async_alloc::is_supported()) {
     return std::make_shared<rmm::mr::cuda_async_memory_resource>();
   }
   return std::shared_ptr<rmm::mr::cuda_async_memory_resource>{nullptr};
diff --git a/tests/mr/device/mr_ref_tests.cpp b/tests/mr/device/mr_ref_tests.cpp
index 55e91d765..41af050a0 100644
--- a/tests/mr/device/mr_ref_tests.cpp
+++ b/tests/mr/device/mr_ref_tests.cpp
@@ -30,9 +30,7 @@ namespace {
 INSTANTIATE_TEST_SUITE_P(ResourceTests,
                          mr_ref_test,
                          ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
                                            "CUDA_Async",
-#endif
                                            "Managed",
                                            "System",
                                            "Pool",
@@ -46,9 +44,7 @@ INSTANTIATE_TEST_SUITE_P(ResourceAllocationTests,
                          mr_ref_allocation_test,
                          ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
                                            "CUDA_Async",
-#endif
                                            "Managed",
                                            "System"
                                            "Pool",
diff --git a/tests/mr/device/test_utils.hpp b/tests/mr/device/test_utils.hpp
index 2b9513793..5b7ef197b 100644
--- a/tests/mr/device/test_utils.hpp
+++ b/tests/mr/device/test_utils.hpp
@@ -31,17 +31,14 @@ inline bool is_device_accessible_memory(void* ptr)
 {
   cudaPointerAttributes attributes{};
   if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
-  return (attributes.type == cudaMemoryTypeDevice) or (attributes.type == cudaMemoryTypeManaged) or
-         ((attributes.type == cudaMemoryTypeHost) and (attributes.devicePointer != nullptr)) or
-         ((attributes.type == cudaMemoryTypeUnregistered) and
-          (rmm::mr::detail::is_system_memory_supported(rmm::get_current_cuda_device())));
+  return attributes.devicePointer != nullptr;
 }
 
 inline bool is_host_memory(void* ptr)
 {
   cudaPointerAttributes attributes{};
   if (cudaSuccess != cudaPointerGetAttributes(&attributes, ptr)) { return false; }
-  return attributes.type == cudaMemoryTypeHost;
+  return attributes.hostPointer != nullptr || attributes.type == cudaMemoryTypeUnregistered;
 }
 
 inline bool is_properly_aligned(void* ptr)
diff --git a/tests/mr/device/thrust_allocator_tests.cu b/tests/mr/device/thrust_allocator_tests.cu
index 84f599957..46447aa09 100644
--- a/tests/mr/device/thrust_allocator_tests.cu
+++ b/tests/mr/device/thrust_allocator_tests.cu
@@ -69,17 +69,11 @@ TEST_P(allocator_test, multi_device)
   }());
 }
 
-INSTANTIATE_TEST_CASE_P(ThrustAllocatorTests,
-                        allocator_test,
-                        ::testing::Values("CUDA",
-#ifdef RMM_CUDA_MALLOC_ASYNC_SUPPORT
-                                          "CUDA_Async",
-#endif
-                                          "Managed",
-                                          "Pool",
-                                          "Arena",
-                                          "Binning"),
-                        [](auto const& info) { return info.param; });
+INSTANTIATE_TEST_CASE_P(
+  ThrustAllocatorTests,
+  allocator_test,
+  ::testing::Values("CUDA", "CUDA_Async", "Managed", "Pool", "Arena", "Binning"),
+  [](auto const& info) { return info.param; });
 
 }  // namespace
 }  // namespace rmm::test
diff --git a/tests/mr/device/tracking_mr_tests.cpp b/tests/mr/device/tracking_mr_tests.cpp
index acd540ae6..3fce55fb8 100644
--- a/tests/mr/device/tracking_mr_tests.cpp
+++ b/tests/mr/device/tracking_mr_tests.cpp
@@ -204,8 +204,8 @@ TEST(TrackingTest, LogOutstandingAllocations)
 {
   std::ostringstream oss;
   auto oss_sink = std::make_shared<spdlog::sinks::ostream_sink_mt>(oss);
-  rmm::logger().sinks().push_back(oss_sink);
-  auto old_level = rmm::logger().level();
+  rmm::detail::logger().sinks().push_back(oss_sink);
+  auto old_level = rmm::detail::logger().level();
 
   tracking_adaptor mr{rmm::mr::get_current_device_resource_ref()};
   std::vector<void*> allocations;
@@ -213,7 +213,7 @@ TEST(TrackingTest, LogOutstandingAllocations)
     allocations.push_back(mr.allocate(ten_MiB));
   }
 
-  rmm::logger().set_level(spdlog::level::debug);
+  rmm::detail::logger().set_level(spdlog::level::debug);
   EXPECT_NO_THROW(mr.log_outstanding_allocations());
 
 #if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG
@@ -224,8 +224,8 @@ TEST(TrackingTest, LogOutstandingAllocations)
     mr.deallocate(allocation, ten_MiB);
   }
 
-  rmm::logger().set_level(old_level);
-  rmm::logger().sinks().pop_back();
+  rmm::detail::logger().set_level(old_level);
+  rmm::detail::logger().sinks().pop_back();
 }
 
 }  // namespace
diff --git a/tests/prefetch_tests.cpp b/tests/prefetch_tests.cpp
index 6c7bb2dd3..4a2c41a2b 100644
--- a/tests/prefetch_tests.cpp
+++ b/tests/prefetch_tests.cpp
@@ -53,8 +53,8 @@ struct PrefetchTest : public ::testing::Test {
     // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g8048f6ea5ad77917444567656c140c5a
     // specifically for when cudaMemRangeAttribute::cudaMemRangeAttributeLastPrefetchLocation is
     // used.
-    constexpr size_t prefetch_data_size = 4;
     if constexpr (std::is_same_v<MemoryResourceType, rmm::mr::managed_memory_resource>) {
+      constexpr size_t prefetch_data_size = 4;
       int prefetch_location{0};
       RMM_CUDA_TRY(
         cudaMemRangeGetAttribute(&prefetch_location,