Skip to content

Commit

Permalink
Enable NCCL code (pytorch#2631)
Browse files Browse the repository at this point in the history
Summary:
- Enable NCCL code and tests for the multi-GPU CAR (collective all-reduce) feature

Pull Request resolved: pytorch#2631

Reviewed By: jianyuh

Differential Revision: D58147817

Pulled By: q10

fbshipit-source-id: 49f1ec628c6f278bd8069b8f80237ea5d47e1e94
  • Loading branch information
q10 authored and facebook-github-bot committed Jun 7, 2024
1 parent 6e73730 commit c7720e8
Show file tree
Hide file tree
Showing 20 changed files with 318 additions and 181 deletions.
182 changes: 124 additions & 58 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,6 @@ __configure_fbgemm_gpu_build_nvcc () {
# shellcheck disable=SC2206
local cuda_version_arr=(${cuda_version//./ })

echo "[BUILD] Looking up NCCL path ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
# shellcheck disable=SC2155,SC2086
local nccl_lib=$(conda run ${env_prefix} find ${conda_prefix} -name "libnccl.so*")
# shellcheck disable=SC2155,SC2086
local nccl_path=$(dirname "$(dirname ${nccl_lib})")

# Only NVCC 12+ supports C++20
if [[ ${cuda_version_arr[0]} -lt 12 ]]; then
local cppstd_ver=17
Expand Down Expand Up @@ -123,8 +115,6 @@ __configure_fbgemm_gpu_build_nvcc () {
build_args+=(
# Override CMake configuration
-DCMAKE_CXX_STANDARD="${cppstd_ver}"
-DNCCL_INCLUDE_DIR=${nccl_path}/include
-DNCCL_LIB_DIR=${nccl_path}/lib
)
}

Expand Down Expand Up @@ -203,6 +193,7 @@ __configure_fbgemm_gpu_build_cuda () {
local arch_list="${TORCH_CUDA_ARCH_LIST}"

else
# Build only CUDA 7.0, 8.0, and 9.0 (i.e. V100, A100, H100) because of 100 MB binary size limits from PyPI.
echo "[BUILD] Using the default CUDA targets ..."
# For cuda version 12.1, enable sm 9.0
cuda_version_nvcc=$(conda run -n "${env_name}" nvcc --version)
Expand All @@ -213,20 +204,27 @@ __configure_fbgemm_gpu_build_cuda () {
local arch_list="7.0;8.0"
fi
fi
echo "[BUILD] Setting the following CUDA targets: ${arch_list}"

# Unset the environment-supplied TORCH_CUDA_ARCH_LIST because it will take
# precedence over cmake -DTORCH_CUDA_ARCH_LIST
unset TORCH_CUDA_ARCH_LIST

echo "[BUILD] Setting the following CUDA targets: ${arch_list}"

# Build only CUDA 7.0 and 8.0 (i.e. V100 and A100) because of 100 MB binary size limits from PyPI.
echo "[BUILD] Setting CUDA build args ..."
echo "[BUILD] Looking up NVML filepath ..."
# shellcheck disable=SC2155,SC2086
local nvml_lib_path=$(conda run --no-capture-output ${env_prefix} printenv NVML_LIB_PATH)

echo "[BUILD] Looking up NCCL filepath ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
# shellcheck disable=SC2155,SC2086
local nccl_lib_path=$(conda run ${env_prefix} find ${conda_prefix} -name "libnccl.so*")

echo "[BUILD] Setting CUDA build args ..."
build_args=(
--package_variant=cuda
--nvml_lib_path="${nvml_lib_path}"
--nccl_lib_path="${nccl_lib_path}"
# Pass to PyTorch CMake
-DTORCH_CUDA_ARCH_LIST="'${arch_list}'"
)
Expand All @@ -235,6 +233,17 @@ __configure_fbgemm_gpu_build_cuda () {
__configure_fbgemm_gpu_build_nvcc
}

__configure_fbgemm_gpu_build_genai () {
local fbgemm_variant_targets="$1"

__configure_fbgemm_gpu_build_cuda "$fbgemm_variant_targets" || return 1

# Replace the package_variant flag, since GenAI is also a CUDA-type build
for i in "${!build_args[@]}"; do
build_args[i]="${build_args[i]/--package_variant=cuda/--package_variant=genai}"
done
}

__configure_fbgemm_gpu_build () {
local fbgemm_variant="$1"
local fbgemm_variant_targets="$2"
Expand Down Expand Up @@ -267,6 +276,10 @@ __configure_fbgemm_gpu_build () {
echo "[BUILD] Configuring build as ROCm variant ..."
__configure_fbgemm_gpu_build_rocm "${fbgemm_variant_targets}"

elif [ "$fbgemm_variant" == "genai" ]; then
echo "[BUILD] Configuring build as GenAI variant ..."
__configure_fbgemm_gpu_build_genai "${fbgemm_variant_targets}"

else
echo "[BUILD] Configuring build as CUDA variant (this is the default behavior) ..."
__configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}"
Expand Down Expand Up @@ -347,7 +360,13 @@ __build_fbgemm_gpu_common_pre_steps () {
(test_binpath "${env_name}" g++) || return 1

# Set the default the FBGEMM_GPU variant to be CUDA
if [ "$fbgemm_variant" != "cpu" ] && [ "$fbgemm_variant" != "rocm" ]; then
if [ "$fbgemm_variant" != "cpu" ] &&
[ "$fbgemm_variant" != "rocm" ] &&
[ "$fbgemm_variant" != "genai" ]; then
echo "################################################################################"
echo "[BUILD] Unknown FBGEMM_GPU variant: $fbgemm_variant"
echo "[BUILD] Defaulting to CUDA"
echo "################################################################################"
export fbgemm_variant="cuda"
fi

Expand All @@ -371,66 +390,113 @@ __build_fbgemm_gpu_common_pre_steps () {
print_exec git diff
}

run_fbgemm_gpu_postbuild_checks () {
local fbgemm_variant="$1"
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT"
echo "Example(s):"
echo " ${FUNCNAME[0]} cpu"
echo " ${FUNCNAME[0]} cuda"
echo " ${FUNCNAME[0]} rocm"
return 1
fi

# Find the .SO file
# shellcheck disable=SC2155
local fbgemm_gpu_so_files=$(find . -name fbgemm_gpu_py.so)
__print_library_infos () {
# shellcheck disable=SC2035,SC2061,SC2062,SC2155,SC2178
local fbgemm_gpu_so_files=$(find . -name *.so | grep .*cmake-build/.*)
readarray -t fbgemm_gpu_so_files <<<"$fbgemm_gpu_so_files"
if [ "${#fbgemm_gpu_so_files[@]}" -le 0 ]; then
echo "[CHECK] .SO library fbgemm_gpu_py.so is missing from the build path!"

for library in "${fbgemm_gpu_so_files[@]}"; do
echo "################################################################################"
echo "[CHECK] BUILT LIBRARY: ${library}"

echo "[CHECK] Listing out library size:"
print_exec "du -h --block-size=1M ${library}"

echo "[CHECK] Listing out the GLIBCXX versions referenced:"
print_glibc_info "${library}"

echo "[CHECK] Listing out undefined symbols:"
print_exec "nm -gDCu ${library} | sort"

echo "[CHECK] Listing out external shared libraries linked:"
print_exec ldd "${library}"
echo "################################################################################"
echo ""
echo ""
done
}

__verify_library_symbols () {
__test_one_symbol () {
local symbol="$1"
if [ "$symbol" == "" ]; then
echo "Usage: ${FUNCNAME[0]} SYMBOL"
echo "Example(s):"
echo " ${FUNCNAME[0]} fbgemm_gpu::asynchronous_inclusive_cumsum_cpu"
return 1
fi

# shellcheck disable=SC2035,SC2061,SC2062,SC2155,SC2178
local fbgemm_gpu_so_files=$(find . -name *.so | grep .*cmake-build/.*)
readarray -t fbgemm_gpu_so_files <<<"$fbgemm_gpu_so_files"

# Iterate through the built .SO files to check for the symbol's existence
for library in "${fbgemm_gpu_so_files[@]}"; do
if test_library_symbol "${library}" "${symbol}"; then
return 0
fi
done

return 1
fi
}

# Prepare a sample set of symbols whose existence in the built library should be checked
# This is by no means an exhaustive set, and should be updated accordingly
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
)

# Add more symbols to check for if it's a non-CPU variant
if [ "${fbgemm_variant}" == "cuda" ]; then
lib_symbols_to_check+=(
if [ "${fbgemm_variant}" == "cpu" ]; then
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
)
elif [ "${fbgemm_variant}" == "cuda" ]; then
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
fbgemm_gpu::asynchronous_inclusive_cumsum_gpu
fbgemm_gpu::merge_pooled_embeddings
)
elif [ "${fbgemm_variant}" == "rocm" ]; then
# merge_pooled_embeddings is missing in ROCm builds bc it requires NVML
lib_symbols_to_check+=(
local lib_symbols_to_check=(
fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
fbgemm_gpu::jagged_2d_to_dense
fbgemm_gpu::asynchronous_inclusive_cumsum_gpu
fbgemm_gpu::merge_pooled_embeddings
)
elif [ "${fbgemm_variant}" == "genai" ]; then
local lib_symbols_to_check=(
fbgemm_gpu::car_init
fbgemm_gpu::per_tensor_quantize_i8
)
fi

# Print info for only the first instance of the .SO file, since the build makes multiple copies
local library="${fbgemm_gpu_so_files[0]}"

echo "[CHECK] Listing out library size: ${library}"
print_exec "du -h --block-size=1M ${library}"

echo "[CHECK] Listing out the GLIBCXX versions referenced by the library: ${library}"
print_glibc_info "${library}"
echo "[CHECK] Verifying sample subset of symbols in the built libraries ..."
for symbol in "${lib_symbols_to_check[@]}"; do
(__test_one_symbol "${symbol}") || return 1
done
}

echo "[CHECK] Listing out undefined symbols in the library: ${library}"
print_exec "nm -gDCu ${library} | sort"
run_fbgemm_gpu_postbuild_checks () {
fbgemm_variant="$1"
if [ "$fbgemm_variant" == "" ]; then
echo "Usage: ${FUNCNAME[0]} FBGEMM_VARIANT"
echo "Example(s):"
echo " ${FUNCNAME[0]} cpu"
echo " ${FUNCNAME[0]} cuda"
echo " ${FUNCNAME[0]} rocm"
echo " ${FUNCNAME[0]} genai"
return 1
fi

echo "[CHECK] Listing out external shared libraries required by the library: ${library}"
print_exec ldd "${library}"
# Find the .SO file
# shellcheck disable=SC2035,SC2061,SC2062,SC2155,SC2178
local fbgemm_gpu_so_files=$(find . -name *.so | grep .*cmake-build/.*)
readarray -t fbgemm_gpu_so_files <<<"$fbgemm_gpu_so_files"
if [ "${#fbgemm_gpu_so_files[@]}" -le 0 ]; then
echo "[CHECK] .SO library is missing from the build path!"
return 1
fi

echo "[CHECK] Verifying sample subset of symbols in the library ..."
for symbol in "${lib_symbols_to_check[@]}"; do
(test_library_symbol "${library}" "${symbol}") || return 1
done
__print_library_infos
__verify_library_symbols
}

################################################################################
Expand Down
2 changes: 1 addition & 1 deletion .github/scripts/fbgemm_gpu_install.bash
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ install_fbgemm_gpu_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env 0.5.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env release cuda 12.4.1 # Install the CUDA variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env test/0.6.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env test/0.7.0rc0 cuda 12.4.1 # Install the CUDA 12.4 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env nightly rocm 5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
Expand Down
17 changes: 9 additions & 8 deletions .github/scripts/fbgemm_gpu_test.bash
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,12 @@ __configure_fbgemm_gpu_test_cpu () {
# These tests have non-CPU operators referenced in @given
./uvm/copy_test.py
./uvm/uvm_test.py
# require multiple GPUs
./comm/multi_gpu_car_test.py
)
}

__configure_fbgemm_gpu_test_cuda () {
ignored_tests=(
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
# require multiple GPUs
./comm/multi_gpu_car_test.py
)
}

Expand All @@ -105,8 +101,6 @@ __configure_fbgemm_gpu_test_rocm () {
./tbe/ssd/ssd_split_table_batched_embeddings_test.py
# https://github.com/pytorch/FBGEMM/issues/1559
./batched_unary_embeddings_test.py
# require multiple GPUs
./comm/multi_gpu_car_test.py
)
}

Expand Down Expand Up @@ -250,7 +244,7 @@ test_setup_conda_environment () {
if [ "$pytorch_variant_type" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME COMPILER PYTHON_VERSION PYTORCH_INSTALLER PYTORCH_CHANNEL[/VERSION] PYTORCH_VARIANT_TYPE [PYTORCH_VARIANT_VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.6.0 cuda 12.1.0 # Setup environment with pytorch-test 0.6.0 for Clang + Python 3.12 + CUDA 12.1.0"
echo " ${FUNCNAME[0]} build_env clang 3.12 pip test/0.7.0 cuda 12.1.0 # Setup environment with pytorch-test 0.7.0 for Clang + Python 3.12 + CUDA 12.1.0"
return 1
else
echo "################################################################################"
Expand Down Expand Up @@ -320,6 +314,13 @@ test_fbgemm_gpu_build_and_install () {

cd ~/FBGEMM/ || return 1
install_fbgemm_gpu_wheel "${env_name}" fbgemm_gpu/dist/*.whl || return 1
}

test_fbgemm_gpu_build_and_install_and_run () {
local env_name="$1"
local pytorch_variant_type="$2"

test_fbgemm_gpu_build_and_install "${env_name}" "${pytorch_variant_type}" || return 1

cd ~/FBGEMM/ || return 1
test_all_fbgemm_gpu_modules "${env_name}" "${pytorch_variant_type}" || return 1
Expand All @@ -332,7 +333,7 @@ test_fbgemm_gpu_setup_and_pip_install () {
if [ "$fbgemm_gpu_channel_version" == "" ]; then
echo "Usage: ${FUNCNAME[0]} ENV_NAME PYTORCH_CHANNEL[/VERSION] FBGEMM_GPU_CHANNEL[/VERSION]"
echo "Example(s):"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.6.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.6.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cpu test/2.2.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.2.0 and FBGEMM_GPU test/0.7.0 (CPU-only)"
echo " ${FUNCNAME[0]} test_env cuda test/2.3.0 test/0.7.0 # Run tests against all Python versions with PyTorch test/2.3.0 and FBGEMM_GPU test/0.7.0 (all CUDA versions)"
return 1
else
Expand Down
4 changes: 2 additions & 2 deletions .github/scripts/utils_conda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,11 @@ setup_miniconda () {
# https://medium.com/data-tyro/resolving-the-conda-libmamba-issue-and-environment-activation-trouble-9f911a6106a4
# https://www.reddit.com/r/learnpython/comments/160kjz9/how_do_i_get_anaconda_to_work_the_way_i_want_it_to/
echo "[SETUP] Installing libmamba-solver (required since Anaconda 2024.02-1) ..."
(exec_with_retries 3 conda install -n base conda-libmamba-solver --solver classic) || return 1
(exec_with_retries 3 conda install -n base -y conda-libmamba-solver --solver classic) || return 1

# https://stackoverflow.com/questions/77617946/solve-conda-libmamba-solver-libarchive-so-19-error-after-updating-conda-to-23
echo "[SETUP] Installing libarchive ..."
(exec_with_retries 3 conda install -n base -c main libarchive --force-reinstall) || return 1
(exec_with_retries 3 conda install -n base -c main -y libarchive --force-reinstall) || return 1

# Clean up packages
conda_cleanup
Expand Down
6 changes: 3 additions & 3 deletions .github/scripts/utils_pip.bash
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,11 @@ install_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Install the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Install the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Install the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Install the ROCM 5.3 variant, latest version from nightly channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton 1.11.0 # Install specific version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton release # Install latest version from release channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.6.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton test/0.7.0rc0 # Install specific version from test channel"
echo " ${FUNCNAME[0]} build_env pytorch_triton_rocm nightly # Install latest version from nightly channel"
return 1
else
Expand Down Expand Up @@ -233,7 +233,7 @@ download_from_pytorch_pip () {
echo "Example(s):"
echo " ${FUNCNAME[0]} build_env torch 1.11.0 cpu # Download the CPU variant, specific version from release channel"
echo " ${FUNCNAME[0]} build_env torch release cpu # Download the CPU variant, latest version from release channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.6.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu test/0.7.0rc0 cuda/12.1.0 # Download the CUDA 12.1 variant, specific version from test channel"
echo " ${FUNCNAME[0]} build_env fbgemm_gpu nightly rocm/5.3 # Download the ROCM 5.3 variant, latest version from nightly channel"
return 1
else
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ on:
required: true
default: "nightly"
fbgemm_gpu_channel_version:
description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.6.0r0')
description: FBGEMM-GPU Channel + Version (e.g. '0.5.0', 'nightly', 'test/0.7.0rc0')
type: string
required: true
default: "nightly"
Expand Down
6 changes: 3 additions & 3 deletions cmake/modules/CudaSetup.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/modules/Utilities.cmake)
################################################################################

BLOCK_PRINT(
"NCCL flags"
"NCCL Flags"
""
"NCCL_INCLUDE_DIR=${NCCL_INCLUDE_DIR}"
"NCCL_LIB_DIR=${NCCL_LIB_DIR}"
"NCCL_INCLUDE_DIRS=${NCCL_INCLUDE_DIRS}"
"NCCL_LIBRARIES=${NCCL_LIBRARIES}"
)

# Set NVML_LIB_PATH if provided, or detect the default lib path
Expand Down
Loading

0 comments on commit c7720e8

Please sign in to comment.