From 2c519bef568e91f2f89373cc79d9ae6c07eb8614 Mon Sep 17 00:00:00 2001
From: Benson Ma <bensonma415@meta.com>
Date: Mon, 12 Feb 2024 18:53:12 -0800
Subject: [PATCH] More flexible test retries (#2328)

Summary:
- Allow for re-running tests for only the failed test cases, to avoid OOM errors with large test suites

- Re-enable the `test_backward_dense` test

Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2328

Reviewed By: sryap

Differential Revision: D53688642

Pulled By: q10

fbshipit-source-id: 368698a3a2a5088c796ea188faa655a778b42b6b
---
 .github/scripts/fbgemm_gpu_test.bash          | 108 ++++++++++++------
 fbgemm_gpu/test/jagged_tensor_ops_test.py     |   6 -
 .../test/permute_pooled_embedding_test.py     |  10 +-
 fbgemm_gpu/test/sparse/index_select_test.py   |   5 +-
 fbgemm_gpu/test/sparse/pack_segments_test.py  |   5 +-
 .../test/tbe/training/backward_dense_test.py  |  10 +-
 fbgemm_gpu/test/test_utils.py                 |  46 +++++++-
 7 files changed, 124 insertions(+), 66 deletions(-)

diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
index 570f1b58f..9a0d9cabf 100644
--- a/.github/scripts/fbgemm_gpu_test.bash
+++ b/.github/scripts/fbgemm_gpu_test.bash
@@ -32,13 +32,31 @@ run_python_test () {
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
   # shellcheck disable=SC2086
-  if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning "${python_test_file}"; then
+  if print_exec conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --cache-clear "${python_test_file}"; then
     echo "[TEST] Python test suite PASSED: ${python_test_file}"
     echo ""
     echo ""
     echo ""
+    return 0
+  fi
+
+  echo "[TEST] Some tests FAILED.  Re-attempting only FAILED tests: ${python_test_file}"
+  echo ""
+  echo ""
+
+  # NOTE: Running large test suites may result in OOM error that will cause the
+  # process to be prematurely killed.  To work around this, when we re-run test
+  # suites, we only run tests that have failed in the previous round.  This is
+  # enabled by using the pytest cache and the --lf flag.
+
+  # shellcheck disable=SC2086
+  if exec_with_retries 2 conda run --no-capture-output ${env_prefix} python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning --lf --last-failed-no-failures none "${python_test_file}"; then
+    echo "[TEST] Python test suite PASSED with retries: ${python_test_file}"
+    echo ""
+    echo ""
+    echo ""
   else
-    echo "[TEST] Python test suite FAILED: ${python_test_file}"
+    echo "[TEST] Python test suite FAILED for some or all tests despite retries: ${python_test_file}"
     echo ""
     echo ""
     echo ""
@@ -46,6 +64,45 @@ run_python_test () {
   fi
 }
 
+__configure_fbgemm_gpu_test_cpu () {
+  ignored_tests=(
+    ./ssd_split_table_batched_embeddings_test.py
+    # These tests have non-CPU operators referenced in @given
+    ./uvm/copy_test.py
+    ./uvm/uvm_test.py
+  )
+}
+
+__configure_fbgemm_gpu_test_cuda () {
+  ignored_tests=(
+    ./ssd_split_table_batched_embeddings_test.py
+  )
+}
+
+__configure_fbgemm_gpu_test_rocm () {
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+
+  echo "[TEST] Set environment variables for ROCm testing ..."
+  # shellcheck disable=SC2086
+  print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
+  # shellcheck disable=SC2086
+  print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
+
+  # Starting from MI250 AMD GPUs support per process XNACK mode change
+  # shellcheck disable=SC2155
+  local rocm_version=$(awk -F'[.-]' '{print $1 * 10000 + $2 * 100 + $3}' /opt/rocm/.info/version-dev)
+  if [ "$rocm_version" -ge 50700 ]; then
+    # shellcheck disable=SC2086
+    print_exec conda env config vars set ${env_prefix} HSA_XNACK=1
+  fi
+
+  ignored_tests=(
+    ./ssd_split_table_batched_embeddings_test.py
+    # https://github.com/pytorch/FBGEMM/issues/1559
+    ./batched_unary_embeddings_test.py
+  )
+}
 
 ################################################################################
 # FBGEMM_GPU Test Functions
@@ -73,37 +130,17 @@ run_fbgemm_gpu_tests () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
-  # Enable ROCM testing if specified
-  if [ "$fbgemm_variant" == "rocm" ]; then
-    echo "[TEST] Set environment variables for ROCm testing ..."
-    # shellcheck disable=SC2086
-    print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
-    # shellcheck disable=SC2086
-    print_exec conda env config vars set ${env_prefix} HIP_LAUNCH_BLOCKING=1
-  fi
-
-  # These are either non-tests or currently-broken tests in both FBGEMM_GPU and FBGEMM_GPU-CPU
-  local files_to_skip=(
-    ./ssd_split_table_batched_embeddings_test.py
-  )
-
   if [ "$fbgemm_variant" == "cpu" ]; then
-    # These tests have non-CPU operators referenced in @given
-    local ignored_tests=(
-      ./uvm/copy_test.py
-      ./uvm/uvm_test.py
-    )
+    echo "Configuring for CPU-based testing ..."
+    __configure_fbgemm_gpu_test_cpu
+
   elif [ "$fbgemm_variant" == "rocm" ]; then
-    local ignored_tests=(
-      # https://github.com/pytorch/FBGEMM/issues/1559
-      ./batched_unary_embeddings_test.py
-      ./tbe/backward_adagrad_test.py
-      ./tbe/backward_dense_test.py
-      ./tbe/backward_none_test.py
-      ./tbe/backward_sgd_test.py
-    )
+    echo "Configuring for ROCm-based testing ..."
+    __configure_fbgemm_gpu_test_rocm
+
   else
-    local ignored_tests=()
+    echo "Configuring for CUDA-based testing ..."
+    __configure_fbgemm_gpu_test_cuda
   fi
 
   echo "[TEST] Installing pytest ..."
@@ -114,19 +151,22 @@ run_fbgemm_gpu_tests () {
   (test_python_import_package "${env_name}" fbgemm_gpu) || return 1
   (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
 
-  echo "[TEST] Enumerating test files ..."
+  echo "[TEST] Enumerating ALL test files ..."
   # shellcheck disable=SC2155
   local all_test_files=$(find . -type f -name '*_test.py' -print | sort)
   for f in $all_test_files; do echo "$f"; done
   echo ""
 
+  echo "[TEST] Enumerating IGNORED test files ..."
+  for f in $ignored_tests; do echo "$f"; done
+  echo ""
+
   # NOTE: Tests running on single CPU core with a less powerful testing GPU in
   # GHA can take up to 5 hours.
   for test_file in $all_test_files; do
-    if echo "${files_to_skip[@]}" | grep "${test_file}"; then
-      echo "[TEST] Skipping test file known to be broken: ${test_file}"
-    elif echo "${ignored_tests[@]}" | grep "${test_file}"; then
+    if echo "${ignored_tests[@]}" | grep "${test_file}"; then
       echo "[TEST] Skipping test file: ${test_file}"
+      echo ""
     elif run_python_test "${env_name}" "${test_file}"; then
       echo ""
     else
diff --git a/fbgemm_gpu/test/jagged_tensor_ops_test.py b/fbgemm_gpu/test/jagged_tensor_ops_test.py
index fee127057..f19fb2355 100644
--- a/fbgemm_gpu/test/jagged_tensor_ops_test.py
+++ b/fbgemm_gpu/test/jagged_tensor_ops_test.py
@@ -29,7 +29,6 @@
         gpu_unavailable,
         gradcheck,
         optests,
-        skipIfRocm,
         symint_vector_unsupported,
         use_cpu_strategy,
     )
@@ -46,7 +45,6 @@
         gpu_unavailable,
         gradcheck,
         optests,
-        skipIfRocm,
         symint_vector_unsupported,
         use_cpu_strategy,
     )
@@ -1630,7 +1628,6 @@ def test_jagged_dense_dense_elementwise_add_jagged_output_dynamic_shape(
 
         assert output.size() == output_ref.size()
 
-    @skipIfRocm()
     @settings(
         verbosity=Verbosity.verbose,
         max_examples=20,
@@ -2370,7 +2367,6 @@ def test_jagged_softmax(
 
         torch.testing.assert_close(values.grad, values_ref.grad)
 
-    @skipIfRocm()
     @given(
         B=st.integers(10, 512),
         M=st.integers(1, 32),
@@ -2669,7 +2665,6 @@ def test_jagged_slice_errors(
             )
 
     @unittest.skipIf(*gpu_unavailable)
-    @skipIfRocm()
     @given(
         B=st.integers(min_value=100, max_value=200),
         F=st.integers(min_value=50, max_value=100),
@@ -2774,7 +2769,6 @@ def test_jagged_unique_indices(
                 self.assertTrue((output_start <= pos) and (pos < output_end))
 
     @unittest.skipIf(*gpu_unavailable)
-    @skipIfRocm()
     @given(
         B=st.integers(min_value=100, max_value=200),
         F=st.integers(min_value=50, max_value=100),
diff --git a/fbgemm_gpu/test/permute_pooled_embedding_test.py b/fbgemm_gpu/test/permute_pooled_embedding_test.py
index 48dc3b72f..48c40632c 100644
--- a/fbgemm_gpu/test/permute_pooled_embedding_test.py
+++ b/fbgemm_gpu/test/permute_pooled_embedding_test.py
@@ -23,20 +23,13 @@
 # pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
 if getattr(fbgemm_gpu, "open_source", False):
     # pyre-ignore[21]
-    from test_utils import (
-        cpu_and_maybe_gpu,
-        gpu_unavailable,
-        on_arm_platform,
-        optests,
-        skipIfRocm,
-    )
+    from test_utils import cpu_and_maybe_gpu, gpu_unavailable, on_arm_platform, optests
 else:
     from fbgemm_gpu.test.test_utils import (
         cpu_and_maybe_gpu,
         gpu_unavailable,
         on_arm_platform,
         optests,
-        skipIfRocm,
     )
 
 typed_gpu_unavailable: Tuple[bool, str] = gpu_unavailable
@@ -145,7 +138,6 @@ def test_permutation(self, fwd_only: bool) -> None:
             [6, 7, 8, 9, 0, 1, 5, 2, 3, 4],
         )
 
-    @skipIfRocm()
     @unittest.skipIf(*typed_on_arm_platform)
     def test_permutation_autograd(self) -> None:
         net = Net().to(self.device)
diff --git a/fbgemm_gpu/test/sparse/index_select_test.py b/fbgemm_gpu/test/sparse/index_select_test.py
index 872b255d7..4b9a0cb3e 100644
--- a/fbgemm_gpu/test/sparse/index_select_test.py
+++ b/fbgemm_gpu/test/sparse/index_select_test.py
@@ -23,14 +23,13 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_available, skipIfRocm
+    from test_utils import gpu_available
 else:
     import fbgemm_gpu.sparse_ops  # noqa: F401, E402
-    from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm
+    from fbgemm_gpu.test.test_utils import gpu_available
 
 
 class IndexSelectTest(unittest.TestCase):
-    @skipIfRocm()
     @given(
         N=st.integers(1, 32),
         shape=st.one_of(
diff --git a/fbgemm_gpu/test/sparse/pack_segments_test.py b/fbgemm_gpu/test/sparse/pack_segments_test.py
index fc6a39f65..eafc1a144 100644
--- a/fbgemm_gpu/test/sparse/pack_segments_test.py
+++ b/fbgemm_gpu/test/sparse/pack_segments_test.py
@@ -20,9 +20,9 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gpu_available, skipIfRocm
+    from test_utils import gpu_available
 else:
-    from fbgemm_gpu.test.test_utils import gpu_available, skipIfRocm
+    from fbgemm_gpu.test.test_utils import gpu_available
 
 
 def get_n_rand_num_summing_to_k(n: int, k: int) -> np.ndarray:
@@ -236,7 +236,6 @@ def test_pack_segments_smaller_max_len(
             )
             self.assertTrue(torch.equal(packed_tensor, packed_cuda.cpu()))
 
-    @skipIfRocm()
     @given(
         n=st.integers(2, 10),
         k=st.integers(2, 10),
diff --git a/fbgemm_gpu/test/tbe/training/backward_dense_test.py b/fbgemm_gpu/test/tbe/training/backward_dense_test.py
index 6f92970ba..f90dd6e86 100644
--- a/fbgemm_gpu/test/tbe/training/backward_dense_test.py
+++ b/fbgemm_gpu/test/tbe/training/backward_dense_test.py
@@ -31,14 +31,9 @@
 
 if open_source:
     # pyre-ignore[21]
-    from test_utils import gradcheck, optests, running_on_github, use_cpu_strategy
+    from test_utils import gradcheck, optests, use_cpu_strategy
 else:
-    from fbgemm_gpu.test.test_utils import (
-        gradcheck,
-        optests,
-        running_on_github,
-        use_cpu_strategy,
-    )
+    from fbgemm_gpu.test.test_utils import gradcheck, optests, use_cpu_strategy
 
 
 VERBOSITY: Verbosity = Verbosity.verbose
@@ -72,7 +67,6 @@ class BackwardDenseTest(unittest.TestCase):
         deadline=None,
         suppress_health_check=[HealthCheck.filter_too_much, HealthCheck.data_too_large],
     )
-    @unittest.skipIf(*running_on_github)
     def test_backward_dense(  # noqa C901
         self,
         T: int,
diff --git a/fbgemm_gpu/test/test_utils.py b/fbgemm_gpu/test/test_utils.py
index adb1c07c9..299e38974 100644
--- a/fbgemm_gpu/test/test_utils.py
+++ b/fbgemm_gpu/test/test_utils.py
@@ -185,7 +185,7 @@ def use_cpu_strategy() -> st.SearchStrategy[bool]:
 def skipIfRocm(reason: str = "Test currently doesn't work on the ROCm stack") -> Any:
     # pyre-fixme[3]: Return annotation cannot be `Any`.
     # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
-    def skipIfRocmDecorator(fn: Callable) -> Any:
+    def decorator(fn: Callable) -> Any:
         @wraps(fn)
         # pyre-fixme[3]: Return annotation cannot be `Any`.
         def wrapper(*args: Any, **kwargs: Any) -> Any:
@@ -196,7 +196,47 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
 
         return wrapper
 
-    return skipIfRocmDecorator
+    return decorator
+
+
+# pyre-fixme[3]: Return annotation cannot be `Any`.
+def skipIfRocmLessThan(min_version: int) -> Any:
+    # pyre-fixme[3]: Return annotation cannot be `Any`.
+    # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
+    def decorator(testfn: Callable) -> Any:
+        @wraps(testfn)
+        # pyre-fixme[3]: Return annotation cannot be `Any`.
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            ROCM_VERSION_FILEPATH = "/opt/rocm/.info/version-dev"
+            if TEST_WITH_ROCM:
+                # Fail if ROCm version file is missing.
+                if not os.path.isfile(ROCM_VERSION_FILEPATH):
+                    raise AssertionError(
+                        f"ROCm version file {ROCM_VERSION_FILEPATH} is missing!"
+                    )
+
+                # Parse the version number from the file.
+                with open(ROCM_VERSION_FILEPATH, "r") as file:
+                    version = file.read().strip()
+                version = version.replace("-", "").split(".")
+                version = (
+                    int(version[0]) * 10000 + int(version[1]) * 100 + int(version[2])
+                )
+
+                # Fail if ROCm version is less than the minimum version.
+                if version < min_version:
+                    raise unittest.SkipTest(
+                        f"Skip the test since the ROCm version is less than {min_version}"
+                    )
+                else:
+                    testfn(*args, **kwargs)
+
+            else:
+                testfn(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
 
 
 def symint_vector_unsupported() -> Tuple[bool, str]:
@@ -204,7 +244,7 @@ def symint_vector_unsupported() -> Tuple[bool, str]:
     return (
         int(major) < 2 or (int(major) == 2 and int(minor) < 1),
         """
-        dynamic shape support for this op needs to be on PyTorch 2.1 or
+        Dynamic shape support for this operator needs to be on PyTorch 2.1 or
         newer with https://github.com/pytorch/pytorch/pull/101056
         """,
     )