From 8e25dd5b2a01e4ceab7d5bb3e57084206586f3da Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Fri, 20 Sep 2024 17:14:20 -0700
Subject: [PATCH] [fbgemm_gpu] Add workflow for running only on non-PyTorch infrastructure

- Add workflow for running only on non-PyTorch infrastructure
---
 .github/workflows/fbgemm_gpu_ci_genai.yml     |   1 -
 .../fbgemm_gpu_ci_genai_generic_infra.yml     | 188 ++++++++++++++++++
 2 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml

diff --git a/.github/workflows/fbgemm_gpu_ci_genai.yml b/.github/workflows/fbgemm_gpu_ci_genai.yml
index 07f8f80595..3ffdf45c0a 100644
--- a/.github/workflows/fbgemm_gpu_ci_genai.yml
+++ b/.github/workflows/fbgemm_gpu_ci_genai.yml
@@ -127,7 +127,6 @@ jobs:
 
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
-    # runs-on: linux.4xlarge.nvidia.gpu
     # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
     runs-on: ${{ matrix.host-machine.instance }}
     defaults:

diff --git a/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
new file mode 100644
index 0000000000..bd4d63de6c
--- /dev/null
+++ b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -0,0 +1,188 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This workflow is used for FBGEMM_GPU-GenAI CI, and is meant to be used for
+# copies of the FBGEMM repos hosted outside of the pytorch org.
+name: FBGEMM_GPU-GenAI CI (Generic Runner)
+
+on:
+  # PR Trigger
+  #
+  pull_request:
+    branches:
+      - main
+
+  # Push Trigger (enable to catch errors coming out of multiple merges)
+  #
+  push:
+    branches:
+      - main
+
+  # Manual Trigger
+  #
+  workflow_dispatch:
+
+concurrency:
+  # Cancel previous runs in the PR if a new commit is pushed
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Build on CPU hosts and upload to GHA
+  build_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: amazonlinux:2023
+      options: --user root --privileged --pid=host
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        compiler: [ "gcc", "clang" ]
+
+    steps:
+      - name: Setup Build Container
+        run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
+
+      - name: Checkout the Repository
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Free Disk Space
+        run: . $PRELUDE; nsenter -t 1 -m -u -n -i bash -c "ls -la /"
+
+      - name: Display System Info
+        run: . $PRELUDE; print_system_info
+
+      - name: Display GPU Info
+        run: . $PRELUDE; print_gpu_info
+
+      - name: Setup Miniconda
+        run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+      - name: Create Conda Environment
+        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+      - name: Install C/C++ Compilers
+        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}
+
+      - name: Install Build Tools
+        run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+      - name: Install CUDA
+        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
+      - name: Install PyTorch Nightly
+        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+      - name: Collect PyTorch Environment Info
+        if: ${{ success() || failure() }}
+        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+      - name: Install cuDNN
+        run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
+
+      - name: Prepare FBGEMM_GPU Build
+        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+      - name: Build FBGEMM_GPU Wheel
+        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai
+
+      - name: Upload Built Wheel as GHA Artifact
+        # Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
+        uses: actions/upload-artifact@v3
+        with:
+          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+          path: fbgemm_gpu/dist/*.whl
+          if-no-files-found: error
+
+  # Download the built artifact from GHA and test it on a CPU-only generic instance (no publishing to PyPI)
+  test_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+      ENFORCE_CUDA_DEVICE: 0
+      CUDA_VISIBLE_DEVICES: -1
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        # Specify exactly ONE CUDA version for artifact publish
+        cuda-version-publish: [ "12.1.1" ]
+        compiler: [ "gcc", "clang" ]
+    needs: build_artifact
+
+    steps:
+      # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
+      - name: Checkout the Repository
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+
+      - name: Download Wheel Artifact from GHA
+        # Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
+        uses: actions/download-artifact@v3
+        with:
+          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+
+      - name: Display System Info
+        run: . $PRELUDE; print_system_info; print_ec2_info
+
+      - name: Display GPU Info
+        run: . $PRELUDE; print_gpu_info
+
+      - name: Setup Miniconda
+        run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+      - name: Create Conda Environment
+        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+      - name: Install C/C++ Compilers for Updated LIBGCC
+        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang
+
+      - name: Install CUDA
+        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+      - name: Install PyTorch Nightly
+        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+      - name: Collect PyTorch Environment Info
+        if: ${{ success() || failure() }}
+        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+      - name: Prepare FBGEMM_GPU Build
+        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+      - name: Install FBGEMM_GPU Wheel
+        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
+
+      - name: Test with PyTest
+        timeout-minutes: 30
+        run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
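
Note: because the new workflow declares a workflow_dispatch trigger, it can also be started by hand once it lands on the default branch of the hosting fork. A minimal sketch using the GitHub CLI; the workflow file name comes from the diff above, while the branch name and the use of gh itself are illustrative assumptions:

    # Manually dispatch the generic-infra GenAI CI workflow against main (illustrative)
    gh workflow run fbgemm_gpu_ci_genai_generic_infra.yml --ref main

    # Follow the run that was just dispatched
    gh run watch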