From 2c519bef568e91f2f89373cc79d9ae6c07eb8614 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Mon, 12 Feb 2024 18:53:12 -0800 Subject: [PATCH] More flexible test retries (#2328) Summary: - Allow for re-running tests for only the failed test cases, to avoid OOM errors with large test suites - Re-enable the `test_backward_dense` test Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2328 Reviewed By: sryap Differential Revision: D53688642 Pulled By: q10 fbshipit-source-id: 368698a3a2a5088c796ea188faa655a778b42b6b --- .github/scripts/fbgemm_gpu_test.bash | 108 ++++++++++++------ fbgemm_gpu/test/jagged_tensor_ops_test.py | 6 - .../test/permute_pooled_embedding_test.py | 10 +- fbgemm_gpu/test/sparse/index_select_test.py | 5 +- fbgemm_gpu/test/sparse/pack_segments_test.py | 5 +- .../test/tbe/training/backward_dense_test.py | 10 +- fbgemm_gpu/test/test_utils.py | 46 +++++++- 7 files changed, 124 insertions(+), 66 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 570f1b58f..9a0d9cabf 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -32,13 +32,31 @@ run_python_test () { local env_prefix=$(env_name_or_prefix "${env_name}") # shellcheck disable=SC2086 - if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then + if print_exec conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --cache-clear "${python_test_file}"; then echo "[TEST] Python test suite PASSED: ${python_test_file}" echo "" echo "" echo "" + return 0 + fi + + echo "[TEST] Some tests FAILED. Re-attempting only FAILED tests: ${python_test_file}" + echo "" + echo "" + + # NOTE: Running large test suites may result in OOM error that will cause the + # process to be prematurely killed. To work around this, when we re-run test + # suites, we only run tests that have failed in the previous round. This is + # enabled by using the pytest cache and the --lf flag. + + # shellcheck disable=SC2086 + if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --lf --last-failed-no-failures none "${python_test_file}"; then + echo "[TEST] Python test suite PASSED with retries: ${python_test_file}" + echo "" + echo "" + echo "" else - echo "[TEST] Python test suite FAILED: ${python_test_file}" + echo "[TEST] Python test suite FAILED for some or all tests despite retries: ${python_test_file}" echo "" echo "" echo "" @@ -46,6 +64,45 @@ run_python_test () { fi } +__configure_fbgemm_gpu_test_cpu () { + ignored_tests=( + ./ssd_split_table_batched_embeddings_test.py + # These tests have non-CPU operators referenced in @given + ./uvm/copy_test.py + ./uvm/uvm_test.py + ) +} + +__configure_fbgemm_gpu_test_cuda () { + ignored_tests=( + ./ssd_split_table_batched_embeddings_test.py + ) +} + +__configure_fbgemm_gpu_test_rocm () { + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + echo "[TEST] Set environment variables for ROCm testing ..." + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1 + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1 + + # Starting from MI250 AMD GPUs support per process XNACK mode change + # shellcheck disable=SC2155 + local rocm_version=$(awk -F'[.-]' '{print $1 * 10000 + $2 * 100 + $3}' /opt/rocm/.info/version-dev) + if [ "$rocm_version" -ge 50700 ]; then + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} HSA_XNACK=1 + fi + + ignored_tests=( + ./ssd_split_table_batched_embeddings_test.py + # https://github.com/pytorch/FBGEMM/issues/1559 + ./batched_unary_embeddings_test.py + ) +} ################################################################################ # FBGEMM_GPU Test Functions @@ -73,37 +130,17 @@ run_fbgemm_gpu_tests () { # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") - # Enable ROCM testing if specified - if [ "$fbgemm_variant" == "rocm" ]; then - echo "[TEST] Set environment variables for ROCm testing ..." - # shellcheck disable=SC2086 - print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1 - # shellcheck disable=SC2086 - print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1 - fi - - # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU - local files_to_skip=( - ./ssd_split_table_batched_embeddings_test.py - ) - if [ "$fbgemm_variant" == "cpu" ]; then - # These tests have non-CPU operators referenced in @given - local ignored_tests=( - ./uvm/copy_test.py - ./uvm/uvm_test.py - ) + echo "Configuring for CPU-based testing ..." + __configure_fbgemm_gpu_test_cpu + elif [ "$fbgemm_variant" == "rocm" ]; then - local ignored_tests=( - # https://github.com/pytorch/FBGEMM/issues/1559 - ./batched_unary_embeddings_test.py - ./tbe/backward_adagrad_test.py - ./tbe/backward_dense_test.py - ./tbe/backward_none_test.py - ./tbe/backward_sgd_test.py - ) + echo "Configuring for ROCm-based testing ..." + __configure_fbgemm_gpu_test_rocm + else - local ignored_tests=() + echo "Configuring for CUDA-based testing ..." + __configure_fbgemm_gpu_test_cuda fi echo "[TEST] Installing pytest ..." @@ -114,19 +151,22 @@ run_fbgemm_gpu_tests () { (test_python_import_package "${env_name}" fbgemm_gpu) || return 1 (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 - echo "[TEST] Enumerating test files ..." + echo "[TEST] Enumerating ALL test files ..." # shellcheck disable=SC2155 local all_test_files=$(find . -type f -name '*_test.py' -print | sort) for f in $all_test_files; do echo "$f"; done echo "" + echo "[TEST] Enumerating IGNORED test files ..." + for f in $ignored_tests; do echo "$f"; done + echo "" + # NOTE: Tests running on single CPU core with a less powerful testing GPU in # GHA can take up to 5 hours. for test_file in $all_test_files; do - if echo "${files_to_skip[@]}" | grep "${test_file}"; then - echo "[TEST] Skipping test file known to be broken: ${test_file}" - elif echo "${ignored_tests[@]}" | grep "${test_file}"; then + if echo "${ignored_tests[@]}" | grep "${test_file}"; then echo "[TEST] Skipping test file: ${test_file}" + echo "" elif run_python_test "${env_name}" "${test_file}"; then echo "" else diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py index fee127057..f19fb2355 100644 --- a/fbgemm_gpu/test/jagged_tensor_ops_test.py +++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py @@ -29,7 +29,6 @@ gpu_unavailable, gradcheck, optests, - skipIfRocm, symint_vector_unsupported, use_cpu_strategy, ) @@ -46,7 +45,6 @@ gpu_unavailable, gradcheck, optests, - skipIfRocm, symint_vector_unsupported, use_cpu_strategy, ) @@ -1630,7 +1628,6 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_dynamic_shape( assert output.size() == output_ref.size() - @skipIfRocm() @settings( verbosity=Verbosity.verbose, max_examples=20, @@ -2370,7 +2367,6 @@ def test_jagged_softmax( torch.testing.assert_close(values.grad, values_ref.grad) - @skipIfRocm() @given( B=st.integers(10, 512), M=st.integers(1, 32), @@ -2669,7 +2665,6 @@ def test_jagged_slice_errors( ) @unittest.skipIf(*gpu_unavailable) - @skipIfRocm() @given( B=st.integers(min_value=100, max_value=200), F=st.integers(min_value=50, max_value=100), @@ -2774,7 +2769,6 @@ def test_jagged_unique_indices( self.assertTrue((output_start <= pos) and (pos < output_end)) @unittest.skipIf(*gpu_unavailable) - @skipIfRocm() @given( B=st.integers(min_value=100, max_value=200), F=st.integers(min_value=50, max_value=100), diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py index 48dc3b72f..48c40632c 100644 --- a/fbgemm_gpu/test/permute_pooled_embedding_test.py +++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py @@ -23,20 +23,13 @@ # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`. if getattr(fbgemm_gpu, "open_source", False): # pyre-ignore[21] - from test_utils import ( - cpu_and_maybe_gpu, - gpu_unavailable, - on_arm_platform, - optests, - skipIfRocm, - ) + from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, optests else: from fbgemm_gpu.test.test_utils import ( cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, optests, - skipIfRocm, ) typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable @@ -145,7 +138,6 @@ def test_permutation(self, fwd_only: bool) -> None: [6, 7, 8, 9, 0, 1, 5, 2, 3, 4], ) - @skipIfRocm() @unittest.skipIf(*typed_on_arm_platform) def test_permutation_autograd(self) -> None: net = Net().to(self.device) diff --git a/fbgemm_gpu/test/sparse/index_select_test.py b/fbgemm_gpu/test/sparse/index_select_test.py index 872b255d7..4b9a0cb3e 100644 --- a/fbgemm_gpu/test/sparse/index_select_test.py +++ b/fbgemm_gpu/test/sparse/index_select_test.py @@ -23,14 +23,13 @@ if open_source: # pyre-ignore[21] - from test_utils import gpu_available, skipIfRocm + from test_utils import gpu_available else: import fbgemm_gpu.sparse_ops # noqa: F401, E402 - from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm + from fbgemm_gpu.test.test_utils import gpu_available class IndexSelectTest(unittest.TestCase): - @skipIfRocm() @given( N=st.integers(1, 32), shape=st.one_of( diff --git a/fbgemm_gpu/test/sparse/pack_segments_test.py b/fbgemm_gpu/test/sparse/pack_segments_test.py index fc6a39f65..eafc1a144 100644 --- a/fbgemm_gpu/test/sparse/pack_segments_test.py +++ b/fbgemm_gpu/test/sparse/pack_segments_test.py @@ -20,9 +20,9 @@ if open_source: # pyre-ignore[21] - from test_utils import gpu_available, skipIfRocm + from test_utils import gpu_available else: - from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm + from fbgemm_gpu.test.test_utils import gpu_available def get_n_rand_num_summing_to_k(n: int, k: int) -> np.ndarray: @@ -236,7 +236,6 @@ def test_pack_segments_smaller_max_len( ) self.assertTrue(torch.equal(packed_tensor, packed_cuda.cpu())) - @skipIfRocm() @given( n=st.integers(2, 10), k=st.integers(2, 10), diff --git a/fbgemm_gpu/test/tbe/training/backward_dense_test.py b/fbgemm_gpu/test/tbe/training/backward_dense_test.py index 6f92970ba..f90dd6e86 100644 --- a/fbgemm_gpu/test/tbe/training/backward_dense_test.py +++ b/fbgemm_gpu/test/tbe/training/backward_dense_test.py @@ -31,14 +31,9 @@ if open_source: # pyre-ignore[21] - from test_utils import gradcheck, optests, running_on_github, use_cpu_strategy + from test_utils import gradcheck, optests, use_cpu_strategy else: - from fbgemm_gpu.test.test_utils import ( - gradcheck, - optests, - running_on_github, - use_cpu_strategy, - ) + from fbgemm_gpu.test.test_utils import gradcheck, optests, use_cpu_strategy VERBOSITY: Verbosity = Verbosity.verbose @@ -72,7 +67,6 @@ class BackwardDenseTest(unittest.TestCase): deadline=None, suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large], ) - @unittest.skipIf(*running_on_github) def test_backward_dense( # noqa C901 self, T: int, diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py index adb1c07c9..299e38974 100644 --- a/fbgemm_gpu/test/test_utils.py +++ b/fbgemm_gpu/test/test_utils.py @@ -185,7 +185,7 @@ def use_cpu_strategy() -> st.SearchStrategy[bool]: def skipIfRocm(reason: str = "Test currently doesn't work on the ROCm stack") -> Any: # pyre-fixme[3]: Return annotation cannot be `Any`. # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters. - def skipIfRocmDecorator(fn: Callable) -> Any: + def decorator(fn: Callable) -> Any: @wraps(fn) # pyre-fixme[3]: Return annotation cannot be `Any`. def wrapper(*args: Any, **kwargs: Any) -> Any: @@ -196,7 +196,47 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: return wrapper - return skipIfRocmDecorator + return decorator + + +# pyre-fixme[3]: Return annotation cannot be `Any`. +def skipIfRocmLessThan(min_version: int) -> Any: + # pyre-fixme[3]: Return annotation cannot be `Any`. + # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters. + def decorator(testfn: Callable) -> Any: + @wraps(testfn) + # pyre-fixme[3]: Return annotation cannot be `Any`. + def wrapper(*args: Any, **kwargs: Any) -> Any: + ROCM_VERSION_FILEPATH = "/opt/rocm/.info/version-dev" + if TEST_WITH_ROCM: + # Fail if ROCm version file is missing. + if not os.path.isfile(ROCM_VERSION_FILEPATH): + raise AssertionError( + f"ROCm version file {ROCM_VERSION_FILEPATH} is missing!" + ) + + # Parse the version number from the file. + with open(ROCM_VERSION_FILEPATH, "r") as file: + version = file.read().strip() + version = version.replace("-", "").split(".") + version = ( + int(version[0]) * 10000 + int(version[1]) * 100 + int(version[2]) + ) + + # Fail if ROCm version is less than the minimum version. + if version < min_version: + raise unittest.SkipTest( + f"Skip the test since the ROCm version is less than {min_version}" + ) + else: + testfn(*args, **kwargs) + + else: + testfn(*args, **kwargs) + + return wrapper + + return decorator def symint_vector_unsupported() -> Tuple[bool, str]: @@ -204,7 +244,7 @@ def symint_vector_unsupported() -> Tuple[bool, str]: return ( int(major) < 2 or (int(major) == 2 and int(minor) < 1), """ - dynamic shape support for this op needs to be on PyTorch 2.1 or + Dynamic shape support for this operator needs to be on PyTorch 2.1 or newer with https://github.com/pytorch/pytorch/pull/101056 """, )