From 3abe1b3b81cdee43988722e7ad1ac385667edabf Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Fri, 20 Sep 2024 17:14:20 -0700
Subject: [PATCH] [fbgemm_gpu] Add workflow for running only on non-PyTorch
 infrastructure

- Add workflow for running only on non-PyTorch infrastructure
---
 .github/scripts/utils_pytorch.bash            |   7 +-
 .github/scripts/utils_system.bash             |  44 +++-
 .github/workflows/fbgemm_gpu_ci_genai.yml     |   1 -
 .../fbgemm_gpu_ci_genai_generic_infra.yml     | 188 ++++++++++++++++++
 4 files changed, 237 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml

diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash
index 3d07e437c9..42ea6c0f6d 100644
--- a/.github/scripts/utils_pytorch.bash
+++ b/.github/scripts/utils_pytorch.bash
@@ -127,7 +127,12 @@ install_pytorch_pip () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
-  # Install the package from PyTorch PIP (not PyPI)
+  # Install the main dependencies
+  # shellcheck disable=SC2086
+  (exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
+    numpy) || return 1
+
+  # Install the torch package from PyTorch PIP (not PyPI)
   install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1
 
   # Check that PyTorch is importable
diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash
index ba5605a281..3fba4146a3 100644
--- a/.github/scripts/utils_system.bash
+++ b/.github/scripts/utils_system.bash
@@ -73,6 +73,48 @@ free_disk_space () {
   echo "[CLEANUP] Freed up some disk space"
 }
 
+free_disk_space_on_host () {
+  echo "################################################################################"
+  echo "# Free Disk Space On CI Host"
+  echo "################################################################################"
+
+  # Frees up disk space on the ubuntu-latest host machine based on recommendations:
+  # https://github.com/orgs/community/discussions/25678
+  # https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
+
+  # Escape the docker container to run the free disk operation on the host:
+  # https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
+  # https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387
+
+  nsenter -t 1 -m -u -n -i bash -c "
+    echo 'Listing 100 largest packages';
+    dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
+    df -h;
+
+    echo 'Removing large packages';
+    sudo apt-get remove -y '^ghc-8.*';
+    sudo apt-get remove -y '^dotnet-.*';
+    sudo apt-get remove -y '^llvm-.*';
+    sudo apt-get remove -y 'php.*';
+    sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
+    sudo apt-get autoremove -y;
+    sudo apt-get clean;
+    df -h;
+
+    echo 'Removing large directories';
+    rm -rf /usr/local/android;
+    rm -rf /usr/share/dotnet;
+    rm -rf /usr/local/share/boost;
+    rm -rf /opt/ghc;
+    rm -rf /usr/local/share/chrom*;
+    rm -rf /usr/share/swift;
+    rm -rf /usr/local/julia*;
+    rm -rf /usr/local/lib/android;
+    rm -rf /opt/hostedtoolcache;
+    df -h;
+  "
+}
+
 
 ################################################################################
 # Info Functions
@@ -91,7 +133,7 @@ print_gpu_info () {
 
   (lspci -v | grep -e 'controller.*NVIDIA') || true
 
-  if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
+  if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
     # Ensure that nvidia-smi is available and returns GPU entries
     if ! nvidia-smi; then
       echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
diff --git a/.github/workflows/fbgemm_gpu_ci_genai.yml b/.github/workflows/fbgemm_gpu_ci_genai.yml
index 07f8f80595..3ffdf45c0a 100644
--- a/.github/workflows/fbgemm_gpu_ci_genai.yml
+++ b/.github/workflows/fbgemm_gpu_ci_genai.yml
@@ -127,7 +127,6 @@ jobs:
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
-    # runs-on: linux.4xlarge.nvidia.gpu
     # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
     runs-on: ${{ matrix.host-machine.instance }}
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
new file mode 100644
index 0000000000..8f63b2603d
--- /dev/null
+++ b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -0,0 +1,188 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This workflow is used for FBGEMM_GPU-GenAI CI, and is meant to be used for
+# copies of the FBGEMM repos hosted outside of the pytorch org.
+name: FBGEMM_GPU-GenAI CI (Generic Runner)
+
+on:
+  # PR Trigger
+  #
+  pull_request:
+    branches:
+      - main
+
+  # Push Trigger (enable to catch errors coming out of multiple merges)
+  #
+  push:
+    branches:
+      - main
+
+  # Manual Trigger
+  #
+  workflow_dispatch:
+
+concurrency:
+  # Cancel previous runs in the PR if a new commit is pushed
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Build on CPU hosts and upload to GHA
+  build_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: amazonlinux:2023
+      options: --user root --privileged --pid=host
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        compiler: [ "gcc", "clang" ]
+
+    steps:
+      - name: Setup Build Container
+        run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
+
+      - name: Checkout the Repository
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Free Disk Space
+        run: . $PRELUDE; free_disk_space_on_host
+
+      - name: Display System Info
+        run: . $PRELUDE; print_system_info
+
+      - name: Display GPU Info
+        run: . $PRELUDE; print_gpu_info
+
+      - name: Setup Miniconda
+        run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+      - name: Create Conda Environment
+        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+      - name: Install C/C++ Compilers
+        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}
+
+      - name: Install Build Tools
+        run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+      - name: Install CUDA
+        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
+      - name: Install PyTorch Nightly
+        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+      - name: Collect PyTorch Environment Info
+        if: ${{ success() || failure() }}
+        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+      - name: Install cuDNN
+        run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
+
+      - name: Prepare FBGEMM_GPU Build
+        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+      - name: Build FBGEMM_GPU Wheel
+        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai
+
+      - name: Upload Built Wheel as GHA Artifact
+        # Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
+        uses: actions/upload-artifact@v3
+        with:
+          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+          path: fbgemm_gpu/dist/*.whl
+          if-no-files-found: error
+
+  # Download the built artifact from GHA and test it on the CPU-only generic runner (no publishing to PyPI)
+  test_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+      ENFORCE_CUDA_DEVICE: 0
+      CUDA_VISIBLE_DEVICES: -1
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        # Specify exactly ONE CUDA version for artifact publish
+        cuda-version-publish: [ "12.1.1" ]
+        compiler: [ "gcc", "clang" ]
+    needs: build_artifact
+
+    steps:
+      # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
+      - name: Checkout the Repository
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+
+      - name: Download Wheel Artifact from GHA
+        # Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
+        uses: actions/download-artifact@v3
+        with:
+          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+
+      - name: Display System Info
+        run: . $PRELUDE; print_system_info; print_ec2_info
+
+      - name: Display GPU Info
+        run: . $PRELUDE; print_gpu_info
+
+      - name: Setup Miniconda
+        run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+      - name: Create Conda Environment
+        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+      - name: Install C/C++ Compilers for Updated LIBGCC
+        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang
+
+      - name: Install CUDA
+        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+      - name: Install PyTorch Nightly
+        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+      - name: Collect PyTorch Environment Info
+        if: ${{ success() || failure() }}
+        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+      - name: Prepare FBGEMM_GPU Build
+        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+      - name: Install FBGEMM_GPU Wheel
+        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
+
+      - name: Test with PyTest
+        timeout-minutes: 30
+        run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
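
Note (not part of the patch): the guard change in print_gpu_info above matters because bash's `[[ "${ENFORCE_CUDA_DEVICE}" ]]` is true for any non-empty value, including the "0" exported by the test_artifact job. A minimal standalone sketch of the before/after behavior, using the same variable name as the patch:

    # Illustrative sketch only; mirrors the guard in .github/scripts/utils_system.bash
    ENFORCE_CUDA_DEVICE=0   # value exported by the generic-infra test job

    # Old guard: any non-empty string (including "0") passes the test,
    # so the nvidia-smi check would still be enforced on CPU-only runners.
    if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
      echo "old guard fires"
    fi

    # New guard: the nvidia-smi check is enforced only when the value is exactly 1.
    if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
      echo "new guard fires"
    else
      echo "new guard skips the NVIDIA driver check"
    fi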