2024-09-25 nightly release (dad29b2)
pytorchbot committed Sep 25, 2024
1 parent 0d0aace commit d670aab
Showing 13 changed files with 596 additions and 200 deletions.
6 changes: 6 additions & 0 deletions .github/scripts/utils_cuda.bash
@@ -70,6 +70,12 @@ install_cuda () {
nm -gDC "${libcuda_path}"
append_to_library_path "${env_name}" "$(dirname "$libcuda_path")"

# The symlink appears to be missing when we attempt to run FBGEMM_GPU on the
# `ubuntu-latest` runners on GitHub, so we have to manually add this in.
if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then
print_exec ln "${libcuda_path}" -s "$(dirname "$libcuda_path")/libcuda.so.1"
fi

echo "[INSTALL] Set environment variable NVML_LIB_PATH ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
7 changes: 6 additions & 1 deletion .github/scripts/utils_pytorch.bash
@@ -127,7 +127,12 @@ install_pytorch_pip () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Install the package from PyTorch PIP (not PyPI)
# Install the main dependencies
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
numpy) || return 1

# Install the torch package from PyTorch PIP (not PyPI)
install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1

# Check that PyTorch is importable
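For context, a minimal version of the import check referenced above (an illustrative sketch only, not the repository's actual helper) might look like:

    # Hypothetical sketch: confirm the freshly installed nightly wheel imports
    # and report which CUDA toolkit it was built against.
    import torch

    print("torch version:", torch.__version__)
    print("built with CUDA:", torch.version.cuda)          # None on CPU-only builds
    print("CUDA runtime usable:", torch.cuda.is_available())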
53 changes: 52 additions & 1 deletion .github/scripts/utils_system.bash
@@ -73,6 +73,57 @@ free_disk_space () {
echo "[CLEANUP] Freed up some disk space"
}

free_disk_space_on_host () {
echo "################################################################################"
echo "# Free Disk Space On CI Host"
echo "################################################################################"

# NOTE: This is meant to be run from ** inside ** containers hosted on
# non-PyTorch-infra GitHub runners, where the hosts might be nearly out of disk
# space from serving many CI jobs. When the container is set up properly, we
# can escape the container using nsenter to run commands on the host.
#
# On average, we see roughly 3GB of disk freed when running this cleanup,
# which appears to be sufficient to avoid the somewhat-frequent out-of-disk
# errors that we were previously running into.
#
# Frees up disk space on the ubuntu-latest host machine based on recommendations:
# https://github.com/orgs/community/discussions/25678
# https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
#
# Escape the docker container to run the free disk operation on the host:
# https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
# https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387

nsenter -t 1 -m -u -n -i bash -c "
echo 'Listing 100 largest packages';
dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
df -h;
echo 'Removing large packages';
sudo apt-get remove -y '^ghc-8.*';
sudo apt-get remove -y '^dotnet-.*';
sudo apt-get remove -y '^llvm-.*';
sudo apt-get remove -y 'php.*';
sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
sudo apt-get autoremove -y;
sudo apt-get clean;
df -h;
echo 'Removing large directories';
rm -rf /usr/local/android;
rm -rf /usr/share/dotnet;
rm -rf /usr/local/share/boost;
rm -rf /opt/ghc;
rm -rf /usr/local/share/chrom*;
rm -rf /usr/share/swift;
rm -rf /usr/local/julia*;
rm -rf /usr/local/lib/android;
rm -rf /opt/hostedtoolcache;
df -h;
"
}


################################################################################
# Info Functions
@@ -91,7 +142,7 @@ print_gpu_info () {

(lspci -v | grep -e 'controller.*NVIDIA') || true

if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
# Ensure that nvidia-smi is available and returns GPU entries
if ! nvidia-smi; then
echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
1 change: 0 additions & 1 deletion .github/workflows/fbgemm_gpu_ci_genai.yml
@@ -127,7 +127,6 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
# runs-on: linux.4xlarge.nvidia.gpu
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: ${{ matrix.host-machine.instance }}
defaults:
199 changes: 199 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -0,0 +1,199 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow runs FBGEMM_GPU-GenAI CI, and is meant for copies of the
# FBGEMM repos hosted outside of the pytorch org.
name: FBGEMM_GPU-GenAI CI (Generic Runner)

on:
# PR Trigger
#
pull_request:
branches:
- main

# Push Trigger (enable to catch errors coming out of multiple merges)
#
push:
branches:
- main

# Manual Trigger
#
workflow_dispatch:

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root --privileged --pid=host
volumes:
- /var/run/docker.sock:/var/run/docker.sock
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
BUILD_VARIANT: genai
continue-on-error: true
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
compiler: [ "gcc", "clang" ]

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Free Disk Space on Host
run: . $PRELUDE; free_disk_space_on_host

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install C/C++ Compilers
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

# Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

- name: Install cuDNN
run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU Wheel
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error

# Download the built artifact from GHA and test it on the CPU-only runner
test_artifact:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root --privileged --pid=host
volumes:
- /var/run/docker.sock:/var/run/docker.sock
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
BUILD_VARIANT: genai
ENFORCE_CUDA_DEVICE: 0
CUDA_VISIBLE_DEVICES: -1
ADD_LIBCUDA_SYMLINK: 1
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.1.1" ]
compiler: [ "gcc", "clang" ]
needs: build_artifact

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Free Disk Space on Host
run: . $PRELUDE; free_disk_space_on_host

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
uses: actions/download-artifact@v3
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install C/C++ Compilers for Updated LIBGCC
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Wheel
run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 30
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
31 changes: 13 additions & 18 deletions fbgemm_gpu/codegen/genscript/optimizers.py
@@ -1047,27 +1047,22 @@ def ensemble_rowwise_adagrad() -> Dict[str, Any]:
momentum2[idx] = new_sum_square_grads;
multiplier = learning_rate / (sqrtf(new_sum_square_grads) + eps);
coef_ema = fabs(momentum);
coef_ema = (row_counter[idx] > step_start) ? (momentum*1.0) : 0.0;
if (step_mode == 1) {
// row_counter[idx] records the number of appearances of this row
// row_counter[idx] tracks the number of appearances of this ID
row_counter[idx] += 1.0;
should_ema = floorf(row_counter[idx] / step_ema) - floorf((row_counter[idx]-1.0) / step_ema);
should_swap = floorf(row_counter[idx] / step_swap) - floorf((row_counter[idx]-1.0) / step_swap);
} else if (step_mode == 2) {
// row_counter[idx] records the step of last ema; prev_iter[idx] records the step of last swap
if (momentum > 0) {
should_ema = floorf(iter*1.0 / step_ema) - floorf(row_counter[idx] / step_ema);
should_swap = floorf(iter*1.0 / step_swap) - floorf(prev_iter[idx] / step_swap);
coef_ema = (should_ema > 0.5) ? powf(coef_ema, should_ema) : coef_ema;
} else {
should_ema = floorf((iter*1.0 - row_counter[idx]) / step_ema);
should_swap = floorf((iter*1.0 - prev_iter[idx]) / step_swap);
coef_ema = (should_ema > 0.5) ? powf(coef_ema, (iter*1.0 - row_counter[idx]) / step_ema) : coef_ema;
}
should_ema = floorf((iter*1.0 - row_counter[idx]) / step_ema);
should_swap = floorf((iter*1.0 - prev_iter[idx]) / step_swap);
// row_counter[idx] records the step of last ema
if (should_ema > 0.5) {
coef_ema = powf(coef_ema, (iter*1.0 - row_counter[idx]) / step_ema);
row_counter[idx] = iter*1.0;
}
if (iter*1.0 > step_start && should_swap > 0.5) {
// prev_iter[idx] records the step of last swap
if (should_swap > 0.5) {
prev_iter[idx] = iter*1.0;
}
} else {
@@ -1089,14 +1084,14 @@ def ensemble_rowwise_adagrad() -> Dict[str, Any]:
if (should_ema > 0.5) { // slow table ema
Vec4T<momentum1_ph_t> m_t(&momentum1[idx * D + d]);
m_t.acc.x = (1.0 - coef_ema) * weight_new.acc.x + coef_ema * m_t.acc.x + (fabs(momentum) - coef_ema) * multiplier * grad.acc.x;
m_t.acc.y = (1.0 - coef_ema) * weight_new.acc.y + coef_ema * m_t.acc.y + (fabs(momentum) - coef_ema) * multiplier * grad.acc.y;
m_t.acc.z = (1.0 - coef_ema) * weight_new.acc.z + coef_ema * m_t.acc.z + (fabs(momentum) - coef_ema) * multiplier * grad.acc.z;
m_t.acc.w = (1.0 - coef_ema) * weight_new.acc.w + coef_ema * m_t.acc.w + (fabs(momentum) - coef_ema) * multiplier * grad.acc.w;
m_t.acc.x = (1.0 - coef_ema) * weight_new.acc.x + coef_ema * m_t.acc.x + (momentum - coef_ema) * multiplier * grad.acc.x;
m_t.acc.y = (1.0 - coef_ema) * weight_new.acc.y + coef_ema * m_t.acc.y + (momentum - coef_ema) * multiplier * grad.acc.y;
m_t.acc.z = (1.0 - coef_ema) * weight_new.acc.z + coef_ema * m_t.acc.z + (momentum - coef_ema) * multiplier * grad.acc.z;
m_t.acc.w = (1.0 - coef_ema) * weight_new.acc.w + coef_ema * m_t.acc.w + (momentum - coef_ema) * multiplier * grad.acc.w;
m_t.store(&momentum1[idx * D + d]);
}
if (iter*1.0 > step_start && should_swap > 0.5) { // slow-to-fast swap
if (should_swap > 0.5) { // slow-to-fast swap
Vec4T<momentum1_ph_t> m_t(&momentum1[idx * D + d]);
weight_new.acc.x = m_t.acc.x * 1.0;
weight_new.acc.y = m_t.acc.y * 1.0;
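For readers following the step_mode == 2 branch above: the floor-based arithmetic decides whether an EMA update or a slow-to-fast swap fires on the current step, and raises the EMA coefficient to the power of the elapsed interval count so that rarely seen rows catch up. A rough Python sketch of that scheduling (variable names mirror the template; this is an illustration, not a port of the generated kernel):

    import math

    def ema_swap_schedule(iter_, row_counter, prev_iter, step_ema, step_swap, coef_ema):
        # row_counter: step at which the last EMA update fired for this row
        # prev_iter:   step at which the last slow-to-fast swap fired
        should_ema = math.floor((iter_ - row_counter) / step_ema)
        should_swap = math.floor((iter_ - prev_iter) / step_swap)

        if should_ema > 0.5:
            # Raise the coefficient to the number of elapsed EMA intervals,
            # compensating for updates skipped while the row was not seen.
            coef_ema = coef_ema ** ((iter_ - row_counter) / step_ema)
            row_counter = float(iter_)
        if should_swap > 0.5:
            prev_iter = float(iter_)

        return should_ema, should_swap, coef_ema, row_counter, prev_iter

The per-element update further down then blends the new weights into the slow (momentum1) table using this coefficient, and on a swap step copies the slow table back into the fast weights.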
@@ -1,6 +1,5 @@
Table Batched Embedding (TBE) Operators
=======================================

.. automodule:: fbgemm_gpu

.. autofunction:: fbgemm_gpu.split_table_batched_embeddings_ops.SplitTableBatchedEmbeddingBagsCodegen
.. autoclass:: fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen
:members:
4 changes: 2 additions & 2 deletions fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
@@ -137,7 +137,7 @@ def gqa_reference(

class Int4GQATest(unittest.TestCase):
@unittest.skipIf(
not torch.version.cuda or torch.cuda.get_device_capability()[0] < 8,
not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 8,
"Skip when CUDA is not available or CUDA compute capability is less than 8",
)
@settings(verbosity=VERBOSITY, max_examples=40, deadline=None)
@@ -243,7 +243,7 @@ def test_gqa(
)
# pyre-fixme[56]
@unittest.skipIf(
not torch.version.cuda or not HAS_XFORMERS,
not torch.cuda.is_available() or not HAS_XFORMERS,
"Skip when CUDA is not available or xformers is not available",
)
def test_mqa_main( # noqa C901
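The skipIf changes above replace a build-time check with a runtime one. A small illustration of the difference, using standard PyTorch attributes (shown only as a sketch):

    import torch

    # torch.version.cuda reports the CUDA toolkit the wheel was built against;
    # it is a non-None string even on a machine without any GPU attached.
    built_with_cuda = torch.version.cuda is not None

    # torch.cuda.is_available() additionally requires a usable driver and device
    # at runtime, which is what the test needs before calling
    # torch.cuda.get_device_capability().
    has_usable_gpu = torch.cuda.is_available()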