diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash
index 6c93c53d0..87e3fd147 100644
--- a/.github/scripts/fbgemm_gpu_install.bash
+++ b/.github/scripts/fbgemm_gpu_install.bash
@@ -49,7 +49,7 @@ __install_fetch_version_and_variant_info () {
   echo ""
 }
 
-__install_list_subpackages_info () {
+__install_check_subpackages () {
   # shellcheck disable=SC2086,SC2155
   local fbgemm_gpu_packages=$(conda run ${env_prefix} python -c "import fbgemm_gpu; print(dir(fbgemm_gpu))")
 
@@ -64,6 +64,22 @@ __install_list_subpackages_info () {
   echo "[CHECK] fbgemm_gpu.experimental: ${experimental_packages}"
   echo "################################################################################"
   echo ""
+
+
+  echo "[INSTALL] Check for installation of Python sources ..."
+  local subpackages=(
+    "fbgemm_gpu.docs"
+    "fbgemm_gpu.tbe.cache"
+    "fbgemm_gpu.tbe.ssd"
+  )
+
+  for package in "${subpackages[@]}"; do
+    (test_python_import_package "${env_name}" "${package}") || return 1
+  done
+
+  if [ "$installed_fbgemm_gpu_variant" != "genai" ]; then
+    (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
+  fi
 }
 
 __install_check_operator_registrations () {
@@ -105,20 +121,16 @@ __fbgemm_gpu_post_install_checks () {
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
   # Print PyTorch and CUDA versions for sanity check
-  __install_print_dependencies_info
+  __install_print_dependencies_info         || return 1
 
   # Fetch the version and variant info from the package
-  __install_fetch_version_and_variant_info
-
-  # List out FBGEMM_GPU subpackages
-  __install_list_subpackages_info
+  __install_fetch_version_and_variant_info  || return 1
 
-  echo "[INSTALL] Check for installation of Python sources ..."
-  if [ "$installed_fbgemm_gpu_variant" != "genai" ]; then
-    (test_python_import_package "${env_name}" fbgemm_gpu.split_embedding_codegen_lookup_invokers) || return 1
-  fi
+  # Check FBGEMM_GPU subpackages are installed correctly
+  __install_check_subpackages               || return 1
 
-  __install_check_operator_registrations
+  # Check operator registrations are working
+  __install_check_operator_registrations    || return 1
 }
 
 install_fbgemm_gpu_wheel () {
diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
index 7e167698d..ca65b76c6 100644
--- a/.github/scripts/fbgemm_gpu_test.bash
+++ b/.github/scripts/fbgemm_gpu_test.bash
@@ -65,8 +65,15 @@ run_python_test () {
 }
 
 __configure_fbgemm_gpu_test_cpu () {
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+  echo "[TEST] Set environment variables for CPU-only testing ..."
+
+  # Prevent automatically running CUDA-enabled tests on a GPU-capable machine
+  # shellcheck disable=SC2086
+  print_exec conda env config vars set ${env_prefix} CUDA_VISIBLE_DEVICES=-1
+
   ignored_tests=(
-    ./tbe/ssd/ssd_split_table_batched_embeddings_test.py
     # These tests have non-CPU operators referenced in @given
     ./uvm/copy_test.py
     ./uvm/uvm_test.py
@@ -74,20 +81,27 @@ __configure_fbgemm_gpu_test_cpu () {
 }
 
 __configure_fbgemm_gpu_test_cuda () {
+  # shellcheck disable=SC2155
+  local env_prefix=$(env_name_or_prefix "${env_name}")
+  echo "[TEST] Set environment variables for CUDA testing ..."
+
   # Disabled by default; enable for debugging
   # shellcheck disable=SC2086
   # print_exec conda env config vars set ${env_prefix} CUDA_LAUNCH_BLOCKING=1
 
+  # Unset CUDA_VISIBLE_DEVICES (the CPU test config sets it to -1) so GPUs are visible
+  # shellcheck disable=SC2086
+  print_exec conda env config vars unset ${env_prefix} CUDA_VISIBLE_DEVICES
+
   ignored_tests=(
-    ./tbe/ssd/ssd_split_table_batched_embeddings_test.py
   )
 }
 
 __configure_fbgemm_gpu_test_rocm () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
-
   echo "[TEST] Set environment variables for ROCm testing ..."
+
   # shellcheck disable=SC2086
   print_exec conda env config vars set ${env_prefix} FBGEMM_TEST_WITH_ROCM=1
   # shellcheck disable=SC2086
@@ -102,7 +116,6 @@ __configure_fbgemm_gpu_test_rocm () {
   fi
 
   ignored_tests=(
-    ./tbe/ssd/ssd_split_table_batched_embeddings_test.py
     # https://github.com/pytorch/FBGEMM/issues/1559
     ./batched_unary_embeddings_test.py
   )
diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash
index bb814617f..b2bda95c3 100644
--- a/.github/scripts/utils_base.bash
+++ b/.github/scripts/utils_base.bash
@@ -138,9 +138,9 @@ test_python_import_package () {
 
   # shellcheck disable=SC2086
   if conda run ${env_prefix} python -c "import ${python_import}"; then
-    echo "[CHECK] Python package '${python_import}' found."
+    echo "[CHECK] Python (sub-)package '${python_import}' found ..."
   else
-    echo "[CHECK] Python package '${python_import}' was not found, or the package is broken!"
+    echo "[CHECK] Python (sub-)package '${python_import}' was not found!  Please check if the Python sources have been packaged correctly."
     return 1
   fi
 }
diff --git a/.github/workflows/fbgemm_gpu_pip.yml b/.github/workflows/fbgemm_gpu_pip.yml
index 45c69064d..f0c07ced3 100644
--- a/.github/workflows/fbgemm_gpu_pip.yml
+++ b/.github/workflows/fbgemm_gpu_pip.yml
@@ -186,7 +186,7 @@ jobs:
         ]
         # ROCm machines are limited, so we only test a subset of Python versions
         python-version: [ "3.11", "3.12" ]
-        rocm-version: [ "5.7" ]
+        rocm-version: [ "6.0.2" ]
 
     steps:
     - name: Setup Build Container
diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst
index d8d0c7382..d0f9e9b09 100644
--- a/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst
+++ b/fbgemm_gpu/docs/src/fbgemm_gpu-development/BuildInstructions.rst
@@ -245,8 +245,11 @@ symbols with ``GLIBCXX`` when compiling FBGEMM_CPU:
 
 .. code:: sh
 
-  # Fix GCC to 10.4.0, to keep compatibility with older versions of GLIBCXX
-  gcc_version=15.0.7
+  # Set GCC to 10.4.0 to keep compatibility with older versions of GLIBCXX
+  #
+  # Newer versions of GCC also work, but will need to be accompanied by an
+  # appropriately updated version of the sysroot_linux package.
+  gcc_version=10.4.0
 
   conda install -n ${env_name} -c conda-forge -y gxx_linux-64=${gcc_version} sysroot_linux-64=2.17
 
diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst b/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst
index babb17db4..1017f075b 100644
--- a/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst
+++ b/fbgemm_gpu/docs/src/fbgemm_gpu-development/TestInstructions.rst
@@ -52,6 +52,10 @@ environment:
   # Enable for debugging failed kernel executions
   export CUDA_LAUNCH_BLOCKING=1
 
+  # For operators involving NCCL, if the rpath is not set up correctly for
+  # libnccl.so.2, LD_LIBRARY_PATH will need to be updated.
+  export LD_LIBRARY_PATH="/path/to/nccl/lib:${LD_LIBRARY_PATH}"
+
   python -m pytest -v -rsx -s -W ignore::pytest.PytestCollectionWarning split_table_batched_embeddings_test.py
 
 Testing with the ROCm Variant
diff --git a/fbgemm_gpu/test/tbe/ssd/__init__.py b/fbgemm_gpu/test/tbe/ssd/__init__.py
new file mode 100644
index 000000000..a9fdb3b99
--- /dev/null
+++ b/fbgemm_gpu/test/tbe/ssd/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py b/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py
index f72649dba..da22c95f2 100644
--- a/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py
+++ b/fbgemm_gpu/test/tbe/ssd/ssd_split_table_batched_embeddings_test.py
@@ -34,11 +34,22 @@
 
 from hypothesis import assume, given, settings, Verbosity
 
+from .. import common  # noqa E402
+from ..common import open_source
+
+
+if open_source:
+    # pyre-ignore[21]
+    from test_utils import gpu_unavailable, running_on_github
+else:
+    from fbgemm_gpu.test.test_utils import gpu_unavailable, running_on_github
+
 
 MAX_EXAMPLES = 40
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available")
+@unittest.skipIf(*running_on_github)
+@unittest.skipIf(*gpu_unavailable)
 class SSDSplitTableBatchedEmbeddingsTest(unittest.TestCase):
     def get_physical_table_arg_indices_(self, feature_table_map: List[int]):
         """
@@ -765,7 +776,8 @@ def test_ssd_cache(
             )
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "Skip when CUDA is not available")
+@unittest.skipIf(*running_on_github)
+@unittest.skipIf(*gpu_unavailable)
 class SSDIntNBitTableBatchedEmbeddingsTest(unittest.TestCase):
     def test_nbit_ssd(self) -> None:
         import tempfile