diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash index 6c93c53d0..87e3fd147 100644 --- a/.github/scripts/fbgemm_gpu_install.bash +++ b/.github/scripts/fbgemm_gpu_install.bash @@ -49,7 +49,7 @@ __install_fetch_version_and_variant_info () { echo "" } -__install_list_subpackages_info () { +__install_check_subpackages () { # shellcheck disable=SC2086,SC2155 local fbgemm_gpu_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(dir(fbgemm_gpu))") @@ -64,6 +64,22 @@ __install_list_subpackages_info () { echo "[CHECK] fbgemm_gpu.experimental: ${experimental_packages}" echo "################################################################################" echo "" + + + echo "[INSTALL] Check for installation of Python sources ..." + local subpackages=( + "fbgemm_gpu.docs" + "fbgemm_gpu.tbe.cache" + "fbgemm_gpu.tbe.ssd" + ) + + for package in "${subpackages[@]}"; do + (test_python_import_package "${env_name}" "${package}") || return 1 + done + + if [ "$installed_fbgemm_gpu_variant" != "genai" ]; then + (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 + fi } __install_check_operator_registrations () { @@ -105,20 +121,16 @@ __fbgemm_gpu_post_install_checks () { local env_prefix=$(env_name_or_prefix "${env_name}") # Print PyTorch and CUDA versions for sanity check - __install_print_dependencies_info + __install_print_dependencies_info || return 1 # Fetch the version and variant info from the package - __install_fetch_version_and_variant_info - - # List out FBGEMM_GPU subpackages - __install_list_subpackages_info + __install_fetch_version_and_variant_info || return 1 - echo "[INSTALL] Check for installation of Python sources ..." 
- if [ "$installed_fbgemm_gpu_variant" != "genai" ]; then - (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1 - fi + # Check FBGEMM_GPU subpackages are installed correctly + __install_check_subpackages || return 1 - __install_check_operator_registrations + # Check operator registrations are working + __install_check_operator_registrations || return 1 } install_fbgemm_gpu_wheel () { diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash index 7e167698d..ca65b76c6 100644 --- a/.github/scripts/fbgemm_gpu_test.bash +++ b/.github/scripts/fbgemm_gpu_test.bash @@ -65,8 +65,15 @@ run_python_test () { } __configure_fbgemm_gpu_test_cpu () { + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[TEST] Set environment variables for CPU-only testing ..." + + # Prevent automatically running CUDA-enabled tests on a GPU-capable machine + # shellcheck disable=SC2086 + print_exec conda env config vars set ${env_prefix} CUDA_VISIBLE_DEVICES=-1 + ignored_tests=( - ./tbe/ssd/ssd_split_table_batched_embeddings_test.py # These tests have non-CPU operators referenced in @given ./uvm/copy_test.py ./uvm/uvm_test.py @@ -74,20 +81,27 @@ __configure_fbgemm_gpu_test_cpu () { } __configure_fbgemm_gpu_test_cuda () { + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + echo "[TEST] Set environment variables for CUDA testing ..." 
+ # Disabled by default; enable for debugging # shellcheck disable=SC2086 # print_exec conda env config vars set ${env_prefix} CUDA_LAUNCH_BLOCKING=1 + # Remove CUDA device specificity when running CUDA tests + # shellcheck disable=SC2086 + print_exec conda env config vars unset ${env_prefix} CUDA_VISIBLE_DEVICES + ignored_tests=( - ./tbe/ssd/ssd_split_table_batched_embeddings_test.py ) } __configure_fbgemm_gpu_test_rocm () { # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") - echo "[TEST] Set environment variables for ROCm testing ..." + # shellcheck disable=SC2086 print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1 # shellcheck disable=SC2086 @@ -102,7 +116,6 @@ __configure_fbgemm_gpu_test_rocm () { fi ignored_tests=( - ./tbe/ssd/ssd_split_table_batched_embeddings_test.py # https://github.com/pytorch/FBGEMM/issues/1559 ./batched_unary_embeddings_test.py ) diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash index bb814617f..b2bda95c3 100644 --- a/.github/scripts/utils_base.bash +++ b/.github/scripts/utils_base.bash @@ -138,9 +138,9 @@ test_python_import_package () { # shellcheck disable=SC2086 if conda run ${env_prefix} python -c "import ${python_import}"; then - echo "[CHECK] Python package '${python_import}' found." + echo "[CHECK] Python (sub-)package '${python_import}' found ..." else - echo "[CHECK] Python package '${python_import}' was not found, or the package is broken!" + echo "[CHECK] Python (sub-)package '${python_import}' was not found! Please check if the Python sources have been packaged correctly." 
return 1 fi } diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml index 45c69064d..f0c07ced3 100644 --- a/.github/workflows/fbgemm_gpu_pip.yml +++ b/.github/workflows/fbgemm_gpu_pip.yml @@ -186,7 +186,7 @@ jobs: ] # ROCm machines are limited, so we only test a subset of Python versions python-version: [ "3.11", "3.12" ] - rocm-version: [ "5.7" ] + rocm-version: [ "6.0.2" ] steps: - name: Setup Build Container diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst index d8d0c7382..d0f9e9b09 100644 --- a/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst @@ -245,8 +245,11 @@ symbols with ``GLIBCXX`` when compiling FBGEMM_CPU: .. code:: sh - # Fix GCC to 10.4.0, to keep compatibility with older versions of GLIBCXX - gcc_version=15.0.7 + # Set GCC to 10.4.0 to keep compatibility with older versions of GLIBCXX + # + # Newer versions of GCC also work, but will need to be accompanied by an + # appropriately updated version of the sysroot_linux package. gcc_version=10.4.0 conda install -n ${env_name} -c conda-forge -y gxx_linux-64=${gcc_version} sysroot_linux-64=2.17 diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst index babb17db4..1017f075b 100644 --- a/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst +++ b/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst @@ -52,6 +52,10 @@ environment: # Enable for debugging failed kernel executions export CUDA_LAUNCH_BLOCKING=1 + # For operators involving NCCL, if the rpath is not set up correctly for + # libnccl.so.2, LD_LIBRARY_PATH will need to be updated. 
+ export LD_LIBRARY_PATH="/path/to/nccl/lib:${LD_LIBRARY_PATH}" + python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning split_table_batched_embeddings_test.py Testing with the ROCm Variant diff --git a/fbgemm_gpu/test/tbe/ssd/__init__.py b/fbgemm_gpu/test/tbe/ssd/__init__.py new file mode 100644 index 000000000..a9fdb3b99 --- /dev/null +++ b/fbgemm_gpu/test/tbe/ssd/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py index f72649dba..da22c95f2 100644 --- a/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py +++ b/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py @@ -34,11 +34,22 @@ from hypothesis import assume, given, settings, Verbosity +from .. import common # noqa E402 +from ..common import open_source + + +if open_source: + # pyre-ignore[21] + from test_utils import gpu_unavailable, running_on_github +else: + from fbgemm_gpu.test.test_utils import gpu_unavailable, running_on_github + MAX_EXAMPLES = 40 -@unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available") +@unittest.skipIf(*running_on_github) +@unittest.skipIf(*gpu_unavailable) class SSDSplitTableBatchedEmbeddingsTest(unittest.TestCase): def get_physical_table_arg_indices_(self, feature_table_map: List[int]): """ @@ -765,7 +776,8 @@ def test_ssd_cache( ) -@unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available") +@unittest.skipIf(*running_on_github) +@unittest.skipIf(*gpu_unavailable) class SSDIntNBitTableBatchedEmbeddingsTest(unittest.TestCase): def test_nbit_ssd(self) -> None: import tempfile