Skip to content

Commit

Permalink
Enable NCCL code (pytorch#2631)
Browse files Browse the repository at this point in the history
Summary:
- Enable NCCL code and tests for the multi-GPU CAR (collective all-reduce) feature

Pull Request resolved: pytorch#2631

Reviewed By: jianyuh

Differential Revision: D58147817

Pulled By: q10

fbshipit-source-id: 49f1ec628c6f278bd8069b8f80237ea5d47e1e94
  • Loading branch information
q10 authored and facebook-github-bot committed Jun 7, 2024
1 parent 6e73730 commit c7720e8
Show file tree
Hide file tree
Showing 20 changed files with 318 additions and 181 deletions.
182 changes: 124 additions & 58 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,6 @@ __configure_fbgemm_gpu_build_nvcc () {
# shellcheck disable=SC2206
local cuda_version_arr=(${cuda_version//./ })

echo "[BUILD] Looking up NCCL path ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
# shellcheck disable=SC2155,SC2086
local nccl_lib=$(conda run ${env_prefix} find ${conda_prefix} -name "libnccl.so*")
# shellcheck disable=SC2155,SC2086
local nccl_path=$(dirname "$(dirname ${nccl_lib})")

# Only NVCC 12+ supports C++20
if [[ ${cuda_version_arr[0]} -lt 12 ]]; then
local cppstd_ver=17
Expand Down Expand Up @@ -123,8 +115,6 @@ __configure_fbgemm_gpu_build_nvcc () {
build_args+=(
# Override CMake configuration
-DCMAKE_CXX_STANDARD="${cppstd_ver}"
-DNCCL_INCLUDE_DIR=${nccl_path}/include
-DNCCL_LIB_DIR=${nccl_path}/lib
)
}

Expand Down Expand Up @@ -203,6 +193,7 @@ __configure_fbgemm_gpu_build_cuda () {
local arch_list="${TORCH_CUDA_ARCH_LIST}"

else
# Build only CUDA 7.0, 8.0, and 9.0 (i.e. V100, A100, H100) because of 100 MB binary size limits from PyPI.
echo "[BUILD] Using the default CUDA targets ..."
# For cuda version 12.1, enable sm 9.0
cuda_version_nvcc=$(conda run -n "${env_name}" nvcc --version)
Expand All @@ -213,20 +204,27 @@ __configure_fbgemm_gpu_build_cuda () {
local arch_list="7.0;8.0"
fi
fi
echo "[BUILD] Setting the following CUDA targets: ${arch_list}"

# Unset the environment-supplied TORCH_CUDA_ARCH_LIST because it will take
# precedence over cmake -DTORCH_CUDA_ARCH_LIST
unset TORCH_CUDA_ARCH_LIST

echo "[BUILD] Setting the following CUDA targets: ${arch_list}"

# Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI.
echo "[BUILD] Setting CUDA build args ..."
echo "[BUILD] Looking up NVML filepath ..."
# shellcheck disable=SC2155,SC2086
local nvml_lib_path=$(conda run --no-capture-output ${env_prefix} printenv NVML_LIB_PATH)

echo "[BUILD] Looking up NCCL filepath ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
# shellcheck disable=SC2155,SC2086
local nccl_lib_path=$(conda run ${env_prefix} find ${conda_prefix} -name "libnccl.so*")

echo "[BUILD] Setting CUDA build args ..."
build_args=(
--package_variant=cuda
--nvml_lib_path="${nvml_lib_path}"
--nccl_lib_path="${nccl_lib_path}"
# Pass to PyTorch CMake
-DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
)
Expand All @@ -235,6 +233,17 @@ __configure_fbgemm_gpu_build_cuda () {
__configure_fbgemm_gpu_build_nvcc
}

__configure_fbgemm_gpu_build_genai () {
local fbgemm_variant_targets="$1"

__configure_fbgemm_gpu_build_cuda "$fbgemm_variant_targets" || return 1

# Replace the package_variant flag, since GenAI is also a CUDA-type build
for i in "${!build_args[@]}"; do
build_args[i]="${build_args[i]/--package_variant=cuda/--package_variant=genai}"
done
}

__configure_fbgemm_gpu_build () {
local fbgemm_variant="$1"
local fbgemm_variant_targets="$2"
Expand Down Expand Up @@ -267,6 +276,10 @@ __configure_fbgemm_gpu_build () {
echo "[BUILD] Configuring build as ROCm variant ..."
__configure_fbgemm_gpu_build_rocm "${fbgemm_variant_targets}"

elif [ "$fbgemm_variant" == "genai" ]; then
echo "[BUILD] Configuring build as GenAI variant ..."
__configure_fbgemm_gpu_build_genai "${fbgemm_variant_targets}"

else
echo "[BUILD] Configuring build as CUDA variant (this is the default behavior) ..."
__configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}"
Expand Down Expand Up @@ -347,7 +360,13 @@ __build_fbgemm_gpu_common_pre_steps () {
(test_binpath "${env_name}" g++) || return 1

# Set the default the FBGEMM_GPU variant to be CUDA
if [ "$fbgemm_variant" != "cpu" ] && [ "$fbgemm_variant" != "rocm" ]; then
if [ "$fbgemm_variant" != "cpu" ] &&
[ "$fbgemm_variant" != "rocm" ] &&
[ "$fbgemm_variant" != "genai" ]; then
echo "################################################################################"
echo "[BUILD] Unknown FBGEMM_GPU variant: $fbgemm_variant"
echo "[BUILD] Defaulting to CUDA"
echo "################################################################################"
export fbgemm_variant="cuda"
fi

Expand All @@ -371,66 +390,113 @@ __build_fbgemm_gpu_common_pre_steps () {
print_exec git diff
}

run_fbgemm_gpu_postbuild_checks () {
local fbgemm_variant="$1"
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT"
echo "Example(s):"
echo " ${FUNCNAME[0]} cpu"
echo " ${FUNCNAME[0]} cuda"
echo " ${FUNCNAME[0]} rocm"
return 1
fi

# Find the .SO file
# shellcheck disable=SC2155
local fbgemm_gpu_so_files=$(find . -name fbgemm_gpu_py.so)
__print_library_infos () {
# shellcheck disable=SC2035,SC2061,SC2062,SC2155,SC2178
local fbgemm_gpu_so_files=$(find . -name *.so | grep .*cmake-build/.*)
readarray -t fbgemm_gpu_so_files <<<"$fbgemm_gpu_so_files"
if [ "${#fbgemm_gpu_so_files[@]}" -le 0 ]; then
echo "[CHECK] .SO library fbgemm_gpu_py.so is missing from the build path!"

for library in "${fbgemm_gpu_so_files[@]}"; do
echo "################################################################################"
echo "[CHECK] BUILT LIBRARY: ${library}"

echo "[CHECK] Listing out library size:"
print_exec "du -h --block-size=1M ${library}"

echo "[CHECK] Listing out the GLIBCXX versions referenced:"
print_glibc_info "${library}"

echo "[CHECK] Listing out undefined symbols:"
print_exec "nm -gDCu ${library} | sort"

echo "[CHECK] Listing out external shared libraries linked:"
print_exec ldd "${library}"
echo "################################################################################"
echo ""
echo ""
done
}

__verify_library_symbols () {
__test_one_symbol () {
local symbol="$1"
if [ "$symbol" == "" ]; then
echo "Usage: ${FUNCNAME[0]} SYMBOL"
echo "Example(s):"
echo " ${FUNCNAME[0]} fbgemm_gpu::asynchronous_inclusive_cumsum_cpu"
return 1
fi

# shellcheck disable=SC2035,SC2061,SC2062,SC2155,SC2178
local fbgemm_gpu_so_files=$(find . -name *.so | grep .*cmake-build/.*)
readarray -t fbgemm_gpu_so_files <<<"$fbgemm_gpu_so_files"

# Iterate through the built .SO files to check for the symbol's existence
for library in "${fbgemm_gpu_so_files[@]}"; do
if test_library_symbol "${library}" "${symbol}"; then
return 0
fi
done

return 1
fi
}

# Prepare a sample set of symbols whose existence in the built library should be checked
# This is by no means an exhaustive set, and should be updated accordingly
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
)

# Add more symbols to check for if it's a non-CPU variant
if [ "${fbgemm_variant}" == "cuda" ]; then
lib_symbols_to_check+=(
if [ "${fbgemm_variant}" == "cpu" ]; then
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
)
elif [ "${fbgemm_variant}" == "cuda" ]; then
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
fbgemm_gpu::asynchronous_inclusive_cumsum_gpu
fbgemm_gpu::merge_pooled_embeddings
)
elif [ "${fbgemm_variant}" == "rocm" ]; then
# merge_pooled_embeddings is missing in ROCm builds bc it requires NVML
lib_symbols_to_check+=(
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
fbgemm_gpu::asynchronous_inclusive_cumsum_gpu
fbgemm_gpu::merge_pooled_embeddings
)
elif [ "${fbgemm_variant}" == "genai" ]; then
local lib_symbols_to_check=(
fbgemm_gpu::car_init
fbgemm_gpu::per_tensor_quantize_i8
)
fi

# Print info for only the first instance of the .SO file, since the build makes multiple copies
local library="${fbgemm_gpu_so_files[0]}"

echo "[CHECK] Listing out library size: ${library}"
print_exec "du -h --block-size=1M ${library}"

echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}"
print_glibc_info "${library}"
echo "[CHECK] Verifying sample subset of symbols in the built libraries ..."
for symbol in "${lib_symbols_to_check[@]}"; do
(__test_one_symbol "${symbol}") || return 1
done
}

echo "[CHECK] Listing out undefined symbols in the library: ${library}"
print_exec "nm -gDCu ${library} | sort"
run_fbgemm_gpu_postbuild_checks () {
fbgemm_variant="$1"
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT"
echo "Example(s):"
echo " ${FUNCNAME[0]} cpu"
echo " ${FUNCNAME[0]} cuda"
echo " ${FUNCNAME[0]} rocm"
echo " ${FUNCNAME[0]} genai"
return 1
fi

echo "[CHECK] Listing out external shared libraries required by the library: ${library}"
print_exec ldd "${library}"
# Find the .SO file
# shellcheck disable=SC2035,SC2061,SC2062,SC2155,SC2178
local fbgemm_gpu_so_files=$(find . -name *.so | grep .*cmake-build/.*)
readarray -t fbgemm_gpu_so_files <<<"$fbgemm_gpu_so_files"
if [ "${#fbgemm_gpu_so_files[@]}" -le 0 ]; then
echo "[CHECK] .SO library is missing from the build path!"
return 1
fi

echo "[CHECK] Verifying sample subset of symbols in the library ..."
for symbol in "${lib_symbols_to_check[@]}"; do
(test_library_symbol "${library}" "${symbol}") || return 1
done
__print_library_infos
__verify_library_symbols
}

################################################################################
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/fbgemm_gpu_install.bash
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ install_fbgemm_gpu_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env 0.5.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env release cuda 12.4.1 # Install the CUDA variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env test/0.6.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env test/0.7.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env nightly rocm 5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
Expand Down
17 changes: 9 additions & 8 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,12 @@ __configure_fbgemm_gpu_test_cpu () {
# These tests have non-CPU operators referenced in @given
./uvm/copy_test.py
./uvm/uvm_test.py
# require multiple GPUs
./comm/multi_gpu_car_test.py
)
}

__configure_fbgemm_gpu_test_cuda () {
ignored_tests=(
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
# require multiple GPUs
./comm/multi_gpu_car_test.py
)
}

Expand All @@ -105,8 +101,6 @@ __configure_fbgemm_gpu_test_rocm () {
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
# https://github.com/pytorch/FBGEMM/issues/1559
./batched_unary_embeddings_test.py
# require multiple GPUs
./comm/multi_gpu_car_test.py
)
}

Expand Down Expand Up @@ -250,7 +244,7 @@ test_setup_conda_environment () {
if [ "$pytorch_variant_type" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME COMPILER PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_CHANNEL[/VERSION] PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.6.0 cuda 12.1.0 # Setup environment with pytorch-test 0.6.0 for Clang + Python 3.12 + CUDA 12.1.0"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.7.0 cuda 12.1.0 # Setup environment with pytorch-test 0.7.0 for Clang + Python 3.12 + CUDA 12.1.0"
return 1
else
echo "################################################################################"
Expand Down Expand Up @@ -320,6 +314,13 @@ test_fbgemm_gpu_build_and_install () {

cd ~/FBGEMM/ || return 1
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1
}

test_fbgemm_gpu_build_and_install_and_run () {
local env_name="$1"
local pytorch_variant_type="$2"

test_fbgemm_gpu_build_and_install "${env_name}" "${pytorch_variant_type}" || return 1

cd ~/FBGEMM/ || return 1
test_all_fbgemm_gpu_modules "${env_name}" "${pytorch_variant_type}" || return 1
Expand All @@ -332,7 +333,7 @@ test_fbgemm_gpu_setup_and_pip_install () {
if [ "$fbgemm_gpu_channel_version" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_CHANNEL[/VERSION] FBGEMM_GPU_CHANNEL[/VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.6.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.6.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.7.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cuda test/2.3.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.3.0 and FBGEMM_GPU test/0.7.0 (all CUDA versions)"
return 1
else
Expand Down
4 changes: 2 additions & 2 deletions .github/scripts/utils_conda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ setup_miniconda () {
# https://medium.com/data-tyro/resolving-the-conda-libmamba-issue-and-environment-activation-trouble-9f911a6106a4
# https://www.reddit.com/r/learnpython/comments/160kjz9/how_do_i_get_anaconda_to_work_the_way_i_want_it_to/
echo "[SETUP] Installing libmamba-solver (required since Anaconda 2024.02-1) ..."
(exec_with_retries 3 conda install -n base conda-libmamba-solver --solver classic) || return 1
(exec_with_retries 3 conda install -n base -y conda-libmamba-solver --solver classic) || return 1

# https://stackoverflow.com/questions/77617946/solve-conda-libmamba-solver-libarchive-so-19-error-after-updating-conda-to-23
echo "[SETUP] Installing libarchive ..."
(exec_with_retries 3 conda install -n base -c main libarchive --force-reinstall) || return 1
(exec_with_retries 3 conda install -n base -c main -y libarchive --force-reinstall) || return 1

# Clean up packages
conda_cleanup
Expand Down
6 changes: 3 additions & 3 deletions .github/scripts/utils_pip.bash
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,11 @@ install_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton 1.11.0 # Install specific version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton release # Install latest version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.6.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.7.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton_rocm nightly # Install latest version from nightly channel"
return 1
else
Expand Down Expand Up @@ -233,7 +233,7 @@ download_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Download the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Download the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ on:
required: true
default: "nightly"
fbgemm_gpu_channel_version:
description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.6.0r0')
description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.7.0rc0')
type: string
required: true
default: "nightly"
Expand Down
6 changes: 3 additions & 3 deletions cmake/modules/CudaSetup.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
################################################################################

BLOCK_PRINT(
"NCCL flags"
"NCCL Flags"
""
"NCCL_INCLUDE_DIR=${NCCL_INCLUDE_DIR}"
"NCCL_LIB_DIR=${NCCL_LIB_DIR}"
"NCCL_INCLUDE_DIRS=${NCCL_INCLUDE_DIRS}"
"NCCL_LIBRARIES=${NCCL_LIBRARIES}"
)

# Set NVML_LIB_PATH if provided, or detect the default lib path
Expand Down
Loading

0 comments on commit c7720e8

Please sign in to comment.