From 3abe1b3b81cdee43988722e7ad1ac385667edabf Mon Sep 17 00:00:00 2001
From: Benson Ma
Date: Fri, 20 Sep 2024 17:14:20 -0700
Subject: [PATCH] [fbgemm_gpu] Add workflow for running only on non-PyTorch
 infrastructure

- Add workflow for running only on non-PyTorch infrastructure
---
 .github/scripts/utils_pytorch.bash            |   7 +-
 .github/scripts/utils_system.bash             |  44 +++-
 .github/workflows/fbgemm_gpu_ci_genai.yml     |   1 -
 .../fbgemm_gpu_ci_genai_generic_infra.yml     | 188 ++++++++++++++++++
 4 files changed, 237 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml

diff --git a/.github/scripts/utils_pytorch.bash b/.github/scripts/utils_pytorch.bash
index 3d07e437c9..42ea6c0f6d 100644
--- a/.github/scripts/utils_pytorch.bash
+++ b/.github/scripts/utils_pytorch.bash
@@ -127,7 +127,12 @@ install_pytorch_pip () {
   # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
-  # Install the package from PyTorch PIP (not PyPI)
+  # Install the main dependencies
+  # shellcheck disable=SC2086
+  (exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
+    numpy) || return 1
+
+  # Install the torch package from PyTorch PIP (not PyPI)
   install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1
 
   # Check that PyTorch is importable
diff --git a/.github/scripts/utils_system.bash b/.github/scripts/utils_system.bash
index ba5605a281..3fba4146a3 100644
--- a/.github/scripts/utils_system.bash
+++ b/.github/scripts/utils_system.bash
@@ -73,6 +73,48 @@ free_disk_space () {
   echo "[CLEANUP] Freed up some disk space"
 }
 
+free_disk_space_on_host () {
+  echo "################################################################################"
+  echo "# Free Disk Space On CI Host"
+  echo "################################################################################"
+
+  # Frees up disk space on the ubuntu-latest host machine based on recommendations:
+  # https://github.com/orgs/community/discussions/25678
+  # https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
+
+  # Escape the docker container to run the free disk operation on the host:
+  # https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
+  # https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387
+
+  nsenter -t 1 -m -u -n -i bash -c "
+    echo 'Listing 100 largest packages';
+    dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
+    df -h;
+
+    echo 'Removing large packages';
+    sudo apt-get remove -y '^ghc-8.*';
+    sudo apt-get remove -y '^dotnet-.*';
+    sudo apt-get remove -y '^llvm-.*';
+    sudo apt-get remove -y 'php.*';
+    sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
+    sudo apt-get autoremove -y;
+    sudo apt-get clean;
+    df -h;
+
+    echo 'Removing large directories';
+    rm -rf /usr/local/android;
+    rm -rf /usr/share/dotnet;
+    rm -rf /usr/local/share/boost;
+    rm -rf /opt/ghc;
+    rm -rf /usr/local/share/chrom*;
+    rm -rf /usr/share/swift;
+    rm -rf /usr/local/julia*;
+    rm -rf /usr/local/lib/android;
+    rm -rf /opt/hostedtoolcache;
+    df -h;
+  "
+}
+
 
 ################################################################################
 # Info Functions
@@ -91,7 +133,7 @@ print_gpu_info () {
 
   (lspci -v | grep -e 'controller.*NVIDIA') || true
 
-  if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
+  if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
     # Ensure that nvidia-smi is available and returns GPU entries
     if ! nvidia-smi; then
       echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
diff --git a/.github/workflows/fbgemm_gpu_ci_genai.yml b/.github/workflows/fbgemm_gpu_ci_genai.yml
index 07f8f80595..3ffdf45c0a 100644
--- a/.github/workflows/fbgemm_gpu_ci_genai.yml
+++ b/.github/workflows/fbgemm_gpu_ci_genai.yml
@@ -127,7 +127,6 @@ jobs:
   # Download the built artifact from GHA, test on GPU, and push to PyPI
   test_and_publish_artifact:
-    # runs-on: linux.4xlarge.nvidia.gpu
     # Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
     runs-on: ${{ matrix.host-machine.instance }}
     defaults:
       run:
         shell: bash
diff --git a/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
new file mode 100644
index 0000000000..8f63b2603d
--- /dev/null
+++ b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -0,0 +1,188 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This workflow is used for FBGEMM_GPU-GenAI CI, and is meant to be used for
+# copies of the FBGEMM repos hosted outside of the pytorch org.
+name: FBGEMM_GPU-GenAI CI (Generic Runner)
+
+on:
+  # PR Trigger
+  #
+  pull_request:
+    branches:
+      - main
+
+  # Push Trigger (enable to catch errors coming out of multiple merges)
+  #
+  push:
+    branches:
+      - main
+
+  # Manual Trigger
+  #
+  workflow_dispatch:
+
+concurrency:
+  # Cancel previous runs in the PR if a new commit is pushed
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Build on CPU hosts and upload to GHA
+  build_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    container:
+      image: amazonlinux:2023
+      options: --user root --privileged --pid=host
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+    continue-on-error: true
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        compiler: [ "gcc", "clang" ]
+
+    steps:
+      - name: Setup Build Container
+        run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which
+
+      - name: Checkout the Repository
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Free Disk Space
+        run: . $PRELUDE; free_disk_space_on_host
+
+      - name: Display System Info
+        run: . $PRELUDE; print_system_info
+
+      - name: Display GPU Info
+        run: . $PRELUDE; print_gpu_info
+
+      - name: Setup Miniconda
+        run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+      - name: Create Conda Environment
+        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+      - name: Install C/C++ Compilers
+        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}
+
+      - name: Install Build Tools
+        run: . $PRELUDE; install_build_tools $BUILD_ENV
+
+      - name: Install CUDA
+        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+      # Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
+      - name: Install PyTorch Nightly
+        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+      - name: Collect PyTorch Environment Info
+        if: ${{ success() || failure() }}
+        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+      - name: Install cuDNN
+        run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}
+
+      - name: Prepare FBGEMM_GPU Build
+        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+      - name: Build FBGEMM_GPU Wheel
+        run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai
+
+      - name: Upload Built Wheel as GHA Artifact
+        # Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
+        uses: actions/upload-artifact@v3
+        with:
+          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+          path: fbgemm_gpu/dist/*.whl
+          if-no-files-found: error
+
+  # Download the built artifact from GHA and test it on the CPU-only generic runner (no publishing to PyPI)
+  test_artifact:
+    runs-on: ${{ matrix.host-machine.instance }}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PRELUDE: .github/scripts/setup_env.bash
+      BUILD_ENV: build_binary
+      BUILD_VARIANT: genai
+      ENFORCE_CUDA_DEVICE: 0
+      CUDA_VISIBLE_DEVICES: -1
+    strategy:
+      fail-fast: false
+      matrix:
+        host-machine: [
+          { arch: x86, instance: "ubuntu-latest" },
+        ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
+        cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
+        # Specify exactly ONE CUDA version for artifact publish
+        cuda-version-publish: [ "12.1.1" ]
+        compiler: [ "gcc", "clang" ]
+    needs: build_artifact
+
+    steps:
+      # Cannot upgrade to actions/checkout@v4 yet because GLIBC on the instance is too old
+      - name: Checkout the Repository
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+
+      - name: Download Wheel Artifact from GHA
+        # Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
+        uses: actions/download-artifact@v3
+        with:
+          name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
+
+      - name: Display System Info
+        run: . $PRELUDE; print_system_info; print_ec2_info
+
+      - name: Display GPU Info
+        run: . $PRELUDE; print_gpu_info
+
+      - name: Setup Miniconda
+        run: . $PRELUDE; setup_miniconda $HOME/miniconda
+
+      - name: Create Conda Environment
+        run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}
+
+      - name: Install C/C++ Compilers for Updated LIBGCC
+        run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang
+
+      - name: Install CUDA
+        run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}
+
+      - name: Install PyTorch Nightly
+        run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}
+
+      - name: Collect PyTorch Environment Info
+        if: ${{ success() || failure() }}
+        run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi
+
+      - name: Prepare FBGEMM_GPU Build
+        run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV
+
+      - name: Install FBGEMM_GPU Wheel
+        run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl
+
+      - name: Test with PyTest
+        timeout-minutes: 30
+        run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
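
Note (not part of the patch): the guard change in print_gpu_info above matters because bash's `[[ "${ENFORCE_CUDA_DEVICE}" ]]` is true for any non-empty value, including the "0" exported by the test_artifact job. A minimal standalone sketch of the before/after behavior, using the same variable name as the patch:

    # Illustrative sketch only; mirrors the guard in .github/scripts/utils_system.bash
    ENFORCE_CUDA_DEVICE=0   # value exported by the generic-infra test job

    # Old guard: any non-empty string (including "0") passes the test,
    # so the nvidia-smi check would still be enforced on CPU-only runners.
    if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
      echo "old guard fires"
    fi

    # New guard: the nvidia-smi check is enforced only when the value is exactly 1.
    if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
      echo "new guard fires"
    else
      echo "new guard skips the NVIDIA driver check"
    fi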