diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
index 532bea844..570f1b58f 100644
--- a/.github/scripts/fbgemm_gpu_test.bash
+++ b/.github/scripts/fbgemm_gpu_test.bash
@@ -88,9 +88,10 @@ run_fbgemm_gpu_tests () {
   )
 
   if [ "$fbgemm_variant" == "cpu" ]; then
-    # These are tests that are currently broken in FBGEMM_GPU-CPU
+    # These tests have non-CPU operators referenced in @given
     local ignored_tests=(
-      ./uvm_test.py
+      ./uvm/copy_test.py
+      ./uvm/uvm_test.py
     )
   elif [ "$fbgemm_variant" == "rocm" ]; then
     local ignored_tests=(
diff --git a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp b/fbgemm_gpu/test/uvm/cache_miss_emulate_test.cpp
similarity index 97%
rename from fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp
rename to fbgemm_gpu/test/uvm/cache_miss_emulate_test.cpp
index cd0e9deb0..576c7aa92 100644
--- a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp
+++ b/fbgemm_gpu/test/uvm/cache_miss_emulate_test.cpp
@@ -45,7 +45,7 @@ std::pair<at::Tensor, at::Tensor> run_emulate_cache_miss(
   return {lxu_cache_location_with_cache_misses.cpu(), uvm_cache_stats.cpu()};
 }
 
-TEST(uvm_cache_miss_emulate_test, no_cache_miss) {
+TEST(UvmCacheMissEmulateTest, no_cache_miss) {
   constexpr int64_t num_requests = 10000;
   constexpr int64_t num_sets = 32768;
   constexpr int64_t associativity = 32;
@@ -60,7 +60,7 @@ TEST(uvm_cache_miss_emulate_test, no_cache_miss) {
       at::equal(lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses));
 }
 
-TEST(uvm_cache_miss_emulate_test, enforced_cache_miss) {
+TEST(UvmCacheMissEmulateTest, enforced_cache_miss) {
   constexpr int64_t num_requests = 10000;
   constexpr int64_t num_sets = 32768;
   constexpr int64_t associativity = 32;
diff --git a/fbgemm_gpu/test/uvm/copy_test.py b/fbgemm_gpu/test/uvm/copy_test.py
new file mode 100644
index 000000000..5c719cd96
--- /dev/null
+++ b/fbgemm_gpu/test/uvm/copy_test.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-ignore-all-errors[56]
+
+import unittest
+from typing import List
+
+import fbgemm_gpu
+import hypothesis.strategies as st
+import torch
+from hypothesis import given, settings, Verbosity
+
+# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+open_source: bool = getattr(fbgemm_gpu, "open_source", False)
+
+if open_source:
+    # pyre-ignore[21]
+    from test_utils import gpu_available, gpu_unavailable, skipIfRocm
+else:
+    from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, skipIfRocm
+
+if gpu_available:
+    # pyre-ignore[21]
+    from fbgemm_gpu.uvm import cudaMemAdvise, cudaMemoryAdvise, cudaMemPrefetchAsync
+
+
+MAX_EXAMPLES = 40
+
+
+class CopyTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    @given(
+        sizes=st.lists(st.integers(min_value=1, max_value=8), min_size=1, max_size=4),
+        uvm_op=st.sampled_from(
+            [
+                torch.ops.fbgemm.new_unified_tensor,
+                torch.ops.fbgemm.new_managed_tensor,
+                torch.ops.fbgemm.new_vanilla_managed_tensor,
+            ]
+        ),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
+    # pyre-fixme[2]: Parameter must be annotated.
+    def test_uvm_to_cpu(self, sizes: List[int], uvm_op) -> None:
+        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
+            is_host_mapped = False
+            uvm_t = uvm_op(
+                torch.empty(0, device="cuda:0", dtype=torch.float),
+                sizes,
+                is_host_mapped,
+            )
+        else:
+            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
+
+        cpu_t = torch.ops.fbgemm.uvm_to_cpu(uvm_t)
+        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_t)
+        assert torch.ops.fbgemm.uvm_storage(cpu_t)
+
+        uvm_t.copy_(cpu_t)
+        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
+        assert torch.ops.fbgemm.uvm_storage(uvm_t)
+
+        # Test use of cpu tensor after freeing the uvm tensor
+        del uvm_t
+        cpu_t.mul_(42)
+
+    @skipIfRocm()
+    @unittest.skipIf(
+        not torch.cuda.is_available() or torch.cuda.device_count() < 2,
+        "Skip unless two CUDA devices are detected",
+    )
+    @given(
+        sizes=st.lists(
+            st.integers(min_value=1, max_value=(1024)), min_size=1, max_size=4
+        ),
+        uvm_op=st.sampled_from(
+            [
+                torch.ops.fbgemm.new_unified_tensor,
+                torch.ops.fbgemm.new_managed_tensor,
+                torch.ops.fbgemm.new_vanilla_managed_tensor,
+            ]
+        ),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
+    # pyre-fixme[2]: Parameter must be annotated.
+    def test_uvm_to_device(self, sizes: List[int], uvm_op) -> None:
+        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
+            is_host_mapped = False
+            uvm_t = uvm_op(
+                torch.empty(0, device="cuda:0", dtype=torch.float),
+                sizes,
+                is_host_mapped,
+            )
+        else:
+            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
+
+        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
+        assert torch.ops.fbgemm.uvm_storage(uvm_t)
+
+        # Reference uvm tensor from second cuda device
+        try:
+            device_prototype = torch.empty(0, device="cuda:1")
+        except RuntimeError:
+            # Skip the tests if there is no "cuda:1" device
+            return
+
+        second_t = torch.ops.fbgemm.uvm_to_device(uvm_t, device_prototype)
+
+        assert torch.ops.fbgemm.is_uvm_tensor(second_t)
+        assert torch.ops.fbgemm.uvm_storage(second_t)
+        assert second_t.device == device_prototype.device
+
+    @unittest.skipIf(*gpu_unavailable)
+    @given(
+        sizes=st.lists(
+            st.integers(min_value=1, max_value=(512)), min_size=1, max_size=3
+        ),
+        uvm_op=st.sampled_from(
+            [
+                torch.ops.fbgemm.new_unified_tensor,
+                torch.ops.fbgemm.new_managed_tensor,
+                torch.ops.fbgemm.new_vanilla_managed_tensor,
+            ]
+        ),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
+    # pyre-fixme[2]: Parameter must be annotated.
+    def test_uvm_to_cpu_clone(self, sizes: List[int], uvm_op) -> None:
+        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
+            is_host_mapped = False
+            uvm_t = uvm_op(
+                torch.empty(0, device="cuda:0", dtype=torch.float),
+                sizes,
+                is_host_mapped,
+            )
+        else:
+            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
+
+        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
+        assert torch.ops.fbgemm.uvm_storage(uvm_t)
+
+        cpu_clone = torch.ops.fbgemm.uvm_to_cpu_clone(uvm_t)
+
+        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_clone)
+        assert not torch.ops.fbgemm.uvm_storage(cpu_clone)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fbgemm_gpu/test/uvm_test.py b/fbgemm_gpu/test/uvm/uvm_test.py
similarity index 65%
rename from fbgemm_gpu/test/uvm_test.py
rename to fbgemm_gpu/test/uvm/uvm_test.py
index 89ce026d7..3a879bfb2 100644
--- a/fbgemm_gpu/test/uvm_test.py
+++ b/fbgemm_gpu/test/uvm/uvm_test.py
@@ -61,42 +61,6 @@ def test_is_uvm_tensor(self, sizes: List[int], uvm_op) -> None:
         assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
         assert torch.ops.fbgemm.uvm_storage(uvm_t)
 
-    @unittest.skipIf(*gpu_unavailable)
-    @given(
-        sizes=st.lists(st.integers(min_value=1, max_value=8), min_size=1, max_size=4),
-        uvm_op=st.sampled_from(
-            [
-                torch.ops.fbgemm.new_unified_tensor,
-                torch.ops.fbgemm.new_managed_tensor,
-                torch.ops.fbgemm.new_vanilla_managed_tensor,
-            ]
-        ),
-    )
-    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
-    # pyre-fixme[2]: Parameter must be annotated.
-    def test_uvm_to_cpu(self, sizes: List[int], uvm_op) -> None:
-        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
-            is_host_mapped = False
-            uvm_t = uvm_op(
-                torch.empty(0, device="cuda:0", dtype=torch.float),
-                sizes,
-                is_host_mapped,
-            )
-        else:
-            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
-
-        cpu_t = torch.ops.fbgemm.uvm_to_cpu(uvm_t)
-        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_t)
-        assert torch.ops.fbgemm.uvm_storage(cpu_t)
-
-        uvm_t.copy_(cpu_t)
-        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
-        assert torch.ops.fbgemm.uvm_storage(uvm_t)
-
-        # Test use of cpu tensor after freeing the uvm tensor
-        del uvm_t
-        cpu_t.mul_(42)
-
     @unittest.skipIf(*gpu_unavailable)
     def test_enum(self) -> None:
         # pyre-ignore[16]
@@ -168,52 +132,6 @@ def test_cudaMemPrefetchAsync(self, sizes: List[int], uvm_op) -> None:
 
         torch.cuda.synchronize(torch.device("cuda:0"))
 
-    @skipIfRocm()
-    @unittest.skipIf(
-        not torch.cuda.is_available() or torch.cuda.device_count() < 2,
-        "Skip unless two CUDA devices are detected",
-    )
-    @given(
-        sizes=st.lists(
-            st.integers(min_value=1, max_value=(1024)), min_size=1, max_size=4
-        ),
-        uvm_op=st.sampled_from(
-            [
-                torch.ops.fbgemm.new_unified_tensor,
-                torch.ops.fbgemm.new_managed_tensor,
-                torch.ops.fbgemm.new_vanilla_managed_tensor,
-            ]
-        ),
-    )
-    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
-    # pyre-fixme[2]: Parameter must be annotated.
-    def test_uvm_to_device(self, sizes: List[int], uvm_op) -> None:
-        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
-            is_host_mapped = False
-            uvm_t = uvm_op(
-                torch.empty(0, device="cuda:0", dtype=torch.float),
-                sizes,
-                is_host_mapped,
-            )
-        else:
-            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
-
-        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
-        assert torch.ops.fbgemm.uvm_storage(uvm_t)
-
-        # Reference uvm tensor from second cuda device
-        try:
-            device_prototype = torch.empty(0, device="cuda:1")
-        except RuntimeError:
-            # Skip the tests if there is no "cuda:1" device
-            return
-
-        second_t = torch.ops.fbgemm.uvm_to_device(uvm_t, device_prototype)
-
-        assert torch.ops.fbgemm.is_uvm_tensor(second_t)
-        assert torch.ops.fbgemm.uvm_storage(second_t)
-        assert second_t.device == device_prototype.device
-
     @skipIfRocm()
     @unittest.skipIf(*gpu_unavailable)
     @given(
@@ -289,40 +207,6 @@ def test_uvm_memadviceDontFork(self, sizes: List[int], uvm_op) -> None:
 
         torch.ops.fbgemm.uvm_mem_advice_dont_fork(cpu_t)
 
-    @unittest.skipIf(*gpu_unavailable)
-    @given(
-        sizes=st.lists(
-            st.integers(min_value=1, max_value=(512)), min_size=1, max_size=3
-        ),
-        uvm_op=st.sampled_from(
-            [
-                torch.ops.fbgemm.new_unified_tensor,
-                torch.ops.fbgemm.new_managed_tensor,
-                torch.ops.fbgemm.new_vanilla_managed_tensor,
-            ]
-        ),
-    )
-    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
-    # pyre-fixme[2]: Parameter must be annotated.
-    def test_uvm_to_cpu_clone(self, sizes: List[int], uvm_op) -> None:
-        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
-            is_host_mapped = False
-            uvm_t = uvm_op(
-                torch.empty(0, device="cuda:0", dtype=torch.float),
-                sizes,
-                is_host_mapped,
-            )
-        else:
-            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
-
-        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
-        assert torch.ops.fbgemm.uvm_storage(uvm_t)
-
-        cpu_clone = torch.ops.fbgemm.uvm_to_cpu_clone(uvm_t)
-
-        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_clone)
-        assert not torch.ops.fbgemm.uvm_storage(cpu_clone)
-
     @unittest.skipIf(*gpu_unavailable)
     @given(
         sizes=st.lists(