diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
index 532bea844..570f1b58f 100644
--- a/.github/scripts/fbgemm_gpu_test.bash
+++ b/.github/scripts/fbgemm_gpu_test.bash
@@ -88,9 +88,10 @@ run_fbgemm_gpu_tests () {
   )
 
   if [ "$fbgemm_variant" == "cpu" ]; then
-    # These are tests that are currently broken in FBGEMM_GPU-CPU
+    # These tests have non-CPU operators referenced in @given
     local ignored_tests=(
-      ./uvm_test.py
+      ./uvm/copy_test.py
+      ./uvm/uvm_test.py
     )
   elif [ "$fbgemm_variant" == "rocm" ]; then
     local ignored_tests=(
diff --git a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp b/fbgemm_gpu/test/uvm/cache_miss_emulate_test.cpp
similarity index 97%
rename from fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp
rename to fbgemm_gpu/test/uvm/cache_miss_emulate_test.cpp
index cd0e9deb0..576c7aa92 100644
--- a/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp
+++ b/fbgemm_gpu/test/uvm/cache_miss_emulate_test.cpp
@@ -45,7 +45,7 @@ std::pair<at::Tensor, at::Tensor> run_emulate_cache_miss(
   return {lxu_cache_location_with_cache_misses.cpu(), uvm_cache_stats.cpu()};
 }
 
-TEST(uvm_cache_miss_emulate_test, no_cache_miss) {
+TEST(UvmCacheMissEmulateTest, no_cache_miss) {
   constexpr int64_t num_requests = 10000;
   constexpr int64_t num_sets = 32768;
   constexpr int64_t associativity = 32;
@@ -60,7 +60,7 @@ TEST(uvm_cache_miss_emulate_test, no_cache_miss) {
       at::equal(lxu_cache_locations_cpu, lxu_cache_location_with_cache_misses));
 }
 
-TEST(uvm_cache_miss_emulate_test, enforced_cache_miss) {
+TEST(UvmCacheMissEmulateTest, enforced_cache_miss) {
   constexpr int64_t num_requests = 10000;
   constexpr int64_t num_sets = 32768;
   constexpr int64_t associativity = 32;
diff --git a/fbgemm_gpu/test/uvm/copy_test.py b/fbgemm_gpu/test/uvm/copy_test.py
new file mode 100644
index 000000000..5c719cd96
--- /dev/null
+++ b/fbgemm_gpu/test/uvm/copy_test.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-ignore-all-errors[56]
+
+import unittest
+from typing import List
+
+import fbgemm_gpu
+import hypothesis.strategies as st
+import torch
+from hypothesis import given, settings, Verbosity
+
+# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
+open_source: bool = getattr(fbgemm_gpu, "open_source", False)
+
+if open_source:
+    # pyre-ignore[21]
+    from test_utils import gpu_available, gpu_unavailable, skipIfRocm
+else:
+    from fbgemm_gpu.test.test_utils import gpu_available, gpu_unavailable, skipIfRocm
+
+if gpu_available:
+    # pyre-ignore[21]
+    from fbgemm_gpu.uvm import cudaMemAdvise, cudaMemoryAdvise, cudaMemPrefetchAsync
+
+
+MAX_EXAMPLES = 40
+
+
+class CopyTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    @given(
+        sizes=st.lists(st.integers(min_value=1, max_value=8), min_size=1, max_size=4),
+        uvm_op=st.sampled_from(
+            [
+                torch.ops.fbgemm.new_unified_tensor,
+                torch.ops.fbgemm.new_managed_tensor,
+                torch.ops.fbgemm.new_vanilla_managed_tensor,
+            ]
+        ),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
+    # pyre-fixme[2]: Parameter must be annotated.
+    def test_uvm_to_cpu(self, sizes: List[int], uvm_op) -> None:
+        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
+            is_host_mapped = False
+            uvm_t = uvm_op(
+                torch.empty(0, device="cuda:0", dtype=torch.float),
+                sizes,
+                is_host_mapped,
+            )
+        else:
+            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
+
+        cpu_t = torch.ops.fbgemm.uvm_to_cpu(uvm_t)
+        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_t)
+        assert torch.ops.fbgemm.uvm_storage(cpu_t)
+
+        uvm_t.copy_(cpu_t)
+        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
+        assert torch.ops.fbgemm.uvm_storage(uvm_t)
+
+        # Test use of cpu tensor after freeing the uvm tensor
+        del uvm_t
+        cpu_t.mul_(42)
+
+    @skipIfRocm()
+    @unittest.skipIf(
+        not torch.cuda.is_available() or torch.cuda.device_count() < 2,
+        "Skip unless two CUDA devices are detected",
+    )
+    @given(
+        sizes=st.lists(
+            st.integers(min_value=1, max_value=(1024)), min_size=1, max_size=4
+        ),
+        uvm_op=st.sampled_from(
+            [
+                torch.ops.fbgemm.new_unified_tensor,
+                torch.ops.fbgemm.new_managed_tensor,
+                torch.ops.fbgemm.new_vanilla_managed_tensor,
+            ]
+        ),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
+    # pyre-fixme[2]: Parameter must be annotated.
+    def test_uvm_to_device(self, sizes: List[int], uvm_op) -> None:
+        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
+            is_host_mapped = False
+            uvm_t = uvm_op(
+                torch.empty(0, device="cuda:0", dtype=torch.float),
+                sizes,
+                is_host_mapped,
+            )
+        else:
+            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
+
+        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
+        assert torch.ops.fbgemm.uvm_storage(uvm_t)
+
+        # Reference uvm tensor from second cuda device
+        try:
+            device_prototype = torch.empty(0, device="cuda:1")
+        except RuntimeError:
+            # Skip the tests if there is no "cuda:1" device
+            return
+
+        second_t = torch.ops.fbgemm.uvm_to_device(uvm_t, device_prototype)
+
+        assert torch.ops.fbgemm.is_uvm_tensor(second_t)
+        assert torch.ops.fbgemm.uvm_storage(second_t)
+        assert second_t.device == device_prototype.device
+
+    @unittest.skipIf(*gpu_unavailable)
+    @given(
+        sizes=st.lists(
+            st.integers(min_value=1, max_value=(512)), min_size=1, max_size=3
+        ),
+        uvm_op=st.sampled_from(
+            [
+                torch.ops.fbgemm.new_unified_tensor,
+                torch.ops.fbgemm.new_managed_tensor,
+                torch.ops.fbgemm.new_vanilla_managed_tensor,
+            ]
+        ),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
+    # pyre-fixme[2]: Parameter must be annotated.
+    def test_uvm_to_cpu_clone(self, sizes: List[int], uvm_op) -> None:
+        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
+            is_host_mapped = False
+            uvm_t = uvm_op(
+                torch.empty(0, device="cuda:0", dtype=torch.float),
+                sizes,
+                is_host_mapped,
+            )
+        else:
+            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
+
+        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
+        assert torch.ops.fbgemm.uvm_storage(uvm_t)
+
+        cpu_clone = torch.ops.fbgemm.uvm_to_cpu_clone(uvm_t)
+
+        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_clone)
+        assert not torch.ops.fbgemm.uvm_storage(cpu_clone)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fbgemm_gpu/test/uvm_test.py b/fbgemm_gpu/test/uvm/uvm_test.py
similarity index 65%
rename from fbgemm_gpu/test/uvm_test.py
rename to fbgemm_gpu/test/uvm/uvm_test.py
index 89ce026d7..3a879bfb2 100644
--- a/fbgemm_gpu/test/uvm_test.py
+++ b/fbgemm_gpu/test/uvm/uvm_test.py
@@ -61,42 +61,6 @@ def test_is_uvm_tensor(self, sizes: List[int], uvm_op) -> None:
         assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
         assert torch.ops.fbgemm.uvm_storage(uvm_t)
 
-    @unittest.skipIf(*gpu_unavailable)
-    @given(
-        sizes=st.lists(st.integers(min_value=1, max_value=8), min_size=1, max_size=4),
-        uvm_op=st.sampled_from(
-            [
-                torch.ops.fbgemm.new_unified_tensor,
-                torch.ops.fbgemm.new_managed_tensor,
-                torch.ops.fbgemm.new_vanilla_managed_tensor,
-            ]
-        ),
-    )
-    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
-    # pyre-fixme[2]: Parameter must be annotated.
-    def test_uvm_to_cpu(self, sizes: List[int], uvm_op) -> None:
-        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
-            is_host_mapped = False
-            uvm_t = uvm_op(
-                torch.empty(0, device="cuda:0", dtype=torch.float),
-                sizes,
-                is_host_mapped,
-            )
-        else:
-            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
-
-        cpu_t = torch.ops.fbgemm.uvm_to_cpu(uvm_t)
-        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_t)
-        assert torch.ops.fbgemm.uvm_storage(cpu_t)
-
-        uvm_t.copy_(cpu_t)
-        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
-        assert torch.ops.fbgemm.uvm_storage(uvm_t)
-
-        # Test use of cpu tensor after freeing the uvm tensor
-        del uvm_t
-        cpu_t.mul_(42)
-
     @unittest.skipIf(*gpu_unavailable)
     def test_enum(self) -> None:
         # pyre-ignore[16]
@@ -168,52 +132,6 @@ def test_cudaMemPrefetchAsync(self, sizes: List[int], uvm_op) -> None:
 
         torch.cuda.synchronize(torch.device("cuda:0"))
 
-    @skipIfRocm()
-    @unittest.skipIf(
-        not torch.cuda.is_available() or torch.cuda.device_count() < 2,
-        "Skip unless two CUDA devices are detected",
-    )
-    @given(
-        sizes=st.lists(
-            st.integers(min_value=1, max_value=(1024)), min_size=1, max_size=4
-        ),
-        uvm_op=st.sampled_from(
-            [
-                torch.ops.fbgemm.new_unified_tensor,
-                torch.ops.fbgemm.new_managed_tensor,
-                torch.ops.fbgemm.new_vanilla_managed_tensor,
-            ]
-        ),
-    )
-    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
-    # pyre-fixme[2]: Parameter must be annotated.
-    def test_uvm_to_device(self, sizes: List[int], uvm_op) -> None:
-        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
-            is_host_mapped = False
-            uvm_t = uvm_op(
-                torch.empty(0, device="cuda:0", dtype=torch.float),
-                sizes,
-                is_host_mapped,
-            )
-        else:
-            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
-
-        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
-        assert torch.ops.fbgemm.uvm_storage(uvm_t)
-
-        # Reference uvm tensor from second cuda device
-        try:
-            device_prototype = torch.empty(0, device="cuda:1")
-        except RuntimeError:
-            # Skip the tests if there is no "cuda:1" device
-            return
-
-        second_t = torch.ops.fbgemm.uvm_to_device(uvm_t, device_prototype)
-
-        assert torch.ops.fbgemm.is_uvm_tensor(second_t)
-        assert torch.ops.fbgemm.uvm_storage(second_t)
-        assert second_t.device == device_prototype.device
-
     @skipIfRocm()
     @unittest.skipIf(*gpu_unavailable)
     @given(
@@ -289,40 +207,6 @@ def test_uvm_memadviceDontFork(self, sizes: List[int], uvm_op) -> None:
 
         torch.ops.fbgemm.uvm_mem_advice_dont_fork(cpu_t)
 
-    @unittest.skipIf(*gpu_unavailable)
-    @given(
-        sizes=st.lists(
-            st.integers(min_value=1, max_value=(512)), min_size=1, max_size=3
-        ),
-        uvm_op=st.sampled_from(
-            [
-                torch.ops.fbgemm.new_unified_tensor,
-                torch.ops.fbgemm.new_managed_tensor,
-                torch.ops.fbgemm.new_vanilla_managed_tensor,
-            ]
-        ),
-    )
-    @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
-    # pyre-fixme[2]: Parameter must be annotated.
-    def test_uvm_to_cpu_clone(self, sizes: List[int], uvm_op) -> None:
-        if uvm_op is torch.ops.fbgemm.new_unified_tensor:
-            is_host_mapped = False
-            uvm_t = uvm_op(
-                torch.empty(0, device="cuda:0", dtype=torch.float),
-                sizes,
-                is_host_mapped,
-            )
-        else:
-            uvm_t = uvm_op(torch.empty(0, device="cuda:0", dtype=torch.float), sizes)
-
-        assert torch.ops.fbgemm.is_uvm_tensor(uvm_t)
-        assert torch.ops.fbgemm.uvm_storage(uvm_t)
-
-        cpu_clone = torch.ops.fbgemm.uvm_to_cpu_clone(uvm_t)
-
-        assert not torch.ops.fbgemm.is_uvm_tensor(cpu_clone)
-        assert not torch.ops.fbgemm.uvm_storage(cpu_clone)
-
     @unittest.skipIf(*gpu_unavailable)
     @given(
         sizes=st.lists(