Skip to content

Commit

Permalink
[T162270879][fbgemm_gpu] Add CUDA artifact selection on publish
Browse files Browse the repository at this point in the history
- Add CUDA version selection on artifact publishing
  • Loading branch information
q10 committed Sep 8, 2023
1 parent f664fd9 commit 6952885
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 7 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/fbgemm_gpu_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,10 @@ jobs:
- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU-ROCM Nightly
- name: Build FBGEMM_GPU-ROCm Nightly
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm gfx90a

- name: Test FBGEMM_GPU-ROCM Nightly Installation
- name: Test FBGEMM_GPU-ROCm Nightly Installation
timeout-minutes: 10
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

Expand Down Expand Up @@ -154,10 +154,10 @@ jobs:
- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU-ROCM Nightly
- name: Build FBGEMM_GPU-ROCm Nightly
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_develop $BUILD_ENV rocm

- name: Test FBGEMM_GPU-ROCM Nightly Installation
- name: Test FBGEMM_GPU-ROCm Nightly Installation
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

Expand Down
10 changes: 7 additions & 3 deletions .github/workflows/fbgemm_gpu_cuda_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@ on:
type: boolean
required: false
default: false
cuda_version:
description: CUDA Version to Use for PyPI Publishing
type: choice
required: false
options: [ "11.8.0", "12.1.1" ]
default: "11.8.0"

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
Expand Down Expand Up @@ -124,8 +130,6 @@ jobs:
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.8.0", "12.1.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "11.8.0" ]
needs: build_artifact

steps:
Expand Down Expand Up @@ -171,7 +175,7 @@ jobs:
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV

- name: Push FBGEMM_GPU Binary to PYPI
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == matrix.cuda-version-publish }}
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.publish_to_pypi == 'true' && matrix.cuda-version == github.event.inputs.cuda_version }}
env:
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: . $PRELUDE; publish_to_pypi $BUILD_ENV fbgemm_gpu-*.whl "$PYPI_TOKEN"
189 changes: 189 additions & 0 deletions .github/workflows/fbgemm_gpu_pypi.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Installs a published FBGEMM_GPU package from PyPI for a chosen variant
# (CPU / CUDA / ROCm) and runs the test suite against it.
# FIX: renamed from "FBGEMM_GPU-CPU Nightly Build" — that name was copied
# from the nightly-build workflow and does not describe this workflow.
name: FBGEMM_GPU PyPI Install + Test

on:
  # Manual Trigger only — this workflow validates an already-published
  # package, so it is never run on push/schedule.
  workflow_dispatch:
    inputs:
      fbgemm_gpu_variant:
        description: FBGEMM-GPU Variant
        type: choice
        required: true
        options: [ "cpu", "cuda", "rocm" ]
        default: "cpu"
      fbgemm_gpu_version:
        # Passed through verbatim to pip as the version to install
        description: FBGEMM-GPU Version (e.g. '0.5.0rc1')
        type: string
        required: true


test_pypi_install_cpu:
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant == 'cpu' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: test_install
strategy:
fail-fast: false
matrix:
host-machine: [
{ instance: "linux.4xlarge" },
{ instance: "linux.arm64.2xlarge" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git pciutils sudo wget which

- name: Checkout the Repository
uses: actions/checkout@v3

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install PyTorch-CPU
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV test cpu

- name: Install FBGEMM_GPU-CPU
run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pypi $BUILD_ENV cuda ${{ github.event.inputs.fbgemm_gpu_version }}

- name: Test with PyTest
timeout-minutes: 10
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV cpu


test_pypi_install_cuda:
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant == 'cuda' }}
runs-on: ${{ matrix.host-machine.instance }}
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: test_install
ENFORCE_NVIDIA_GPU: 1
strategy:
fail-fast: false
matrix:
host-machine: [
{ instance: "linux.g5.4xlarge.nvidia.gpu" },
]
python-version: [ "3.8", "3.9", "3.10", "3.11" ]
cuda-version: [ "11.8.0", "12.1.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "11.8.0" ]

steps:
- name: Checkout the Repository
uses: actions/checkout@v3

- name: Install NVIDIA Drivers and NVIDIA-Docker Runtime
uses: pytorch/test-infra/.github/actions/setup-nvidia@main

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

- name: Install PyTorch-CUDA
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda ${{ matrix.cuda-version }}

- name: Install FBGEMM_GPU-CUDA
run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pypi $BUILD_ENV cuda ${{ github.event.inputs.fbgemm_gpu_version }}

- name: Test with PyTest
timeout-minutes: 10
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV


test_pypi_install_rocm:
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant == 'rocm' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
options: --user root --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: test_install
ENFORCE_AMD_GPU: 1
strategy:
fail-fast: false
matrix:
host-machine: [
{ instance: "rocm" },
]
# ROCm machines are limited, so we only test against Python 3.10
python-version: [ "3.10" ]
rocm-version: [ "5.5.1", "5.6" ]

steps:
- name: Setup Build Container
run: |
apt update -y
apt install -y git wget
git config --global --add safe.directory '*'
- name: Checkout the Repository
uses: actions/checkout@v3

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Free Disk Space
run: . $PRELUDE; free_disk_space

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install PyTorch-ROCm
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly rocm ${{ matrix.rocm-version }}

- name: Install FBGEMM_GPU-ROCm
run: . $PRELUDE; cd fbgemm_gpu; install_fbgemm_gpu_pypi $BUILD_ENV rocm ${{ github.event.inputs.fbgemm_gpu_version }}

- name: Test FBGEMM_GPU-ROCm
timeout-minutes: 15
run: . $PRELUDE; cd fbgemm_gpu/test; run_fbgemm_gpu_tests $BUILD_ENV rocm

0 comments on commit 6952885

Please sign in to comment.