2024-09-25 nightly release (dad29b2)
pytorchbot committed Sep 25, 2024
1 parent 0d0aace commit d670aab
Showing 13 changed files with 596 additions and 200 deletions.
6 changes: 6 additions & 0 deletions .github/scripts/utils_cuda.bash
@@ -70,6 +70,12 @@ install_cuda () {
nm -gDC "${libcuda_path}"
append_to_library_path "${env_name}" "$(dirname "$libcuda_path")"

# The symlink appears to be missing when we attempt to run FBGEMM_GPU on the
# `ubuntu-latest` runners on GitHub, so we have to manually add this in.
if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then
print_exec ln "${libcuda_path}" -s "$(dirname "$libcuda_path")/libcuda.so.1"
fi

echo "[INSTALL] Set environment variable NVML_LIB_PATH ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
7 changes: 6 additions & 1 deletion .github/scripts/utils_pytorch.bash
@@ -127,7 +127,12 @@ install_pytorch_pip () {
# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Install the package from PyTorch PIP (not PyPI)
# Install the main dependencies
# shellcheck disable=SC2086
(exec_with_retries 3 conda install ${env_prefix} -c conda-forge -y \
numpy) || return 1

# Install the torch package from PyTorch PIP (not PyPI)
install_from_pytorch_pip "${env_name}" torch "${pytorch_channel_version}" "${pytorch_variant_type_version}" || return 1

# Check that PyTorch is importable
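For context, a minimal version of the import check referenced above (an illustrative sketch only, not the repository's actual helper) might look like:

    # Hypothetical sketch: confirm the freshly installed nightly wheel imports
    # and report which CUDA toolkit it was built against.
    import torch

    print("torch version:", torch.__version__)
    print("built with CUDA:", torch.version.cuda)          # None on CPU-only builds
    print("CUDA runtime usable:", torch.cuda.is_available())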
53 changes: 52 additions & 1 deletion .github/scripts/utils_system.bash
@@ -73,6 +73,57 @@ free_disk_space () {
echo "[CLEANUP] Freed up some disk space"
}

free_disk_space_on_host () {
echo "################################################################################"
echo "# Free Disk Space On CI Host"
echo "################################################################################"

# NOTE: This is meant to be run from ** inside ** containers hosted on
# non-PyTorch-infra GitHub runners, where the hosts might be nearly out of disk
# space from serving many CI jobs. When the container is set up properly, we
# can escape the container using nsenter to run commands on the host.
#
# On average, we see roughly 3GB of disk freed when running this cleanup,
# which appears to be sufficient to avoid the somewhat-frequent out-of-disk
# errors that we were previously running into.
#
# Frees up disk space on the ubuntu-latest host machine based on recommendations:
# https://github.com/orgs/community/discussions/25678
# https://github.com/apache/flink/blob/02d30ace69dc18555a5085eccf70ee884e73a16e/tools/azure-pipelines/free_disk_space.sh
#
# Escape the docker container to run the free disk operation on the host:
# https://stackoverflow.com/questions/66160057/how-to-run-a-command-in-host-before-entering-docker-container-in-github-ci
# https://stackoverflow.com/questions/32163955/how-to-run-shell-script-on-host-from-docker-container/63140387#63140387

nsenter -t 1 -m -u -n -i bash -c "
echo 'Listing 100 largest packages';
dpkg-query -Wf '\${Installed-Size}\t\${Package}\n' | sort -n | tail -n 100;
df -h;
echo 'Removing large packages';
sudo apt-get remove -y '^ghc-8.*';
sudo apt-get remove -y '^dotnet-.*';
sudo apt-get remove -y '^llvm-.*';
sudo apt-get remove -y 'php.*';
sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel;
sudo apt-get autoremove -y;
sudo apt-get clean;
df -h;
echo 'Removing large directories';
rm -rf /usr/local/android;
rm -rf /usr/share/dotnet;
rm -rf /usr/local/share/boost;
rm -rf /opt/ghc;
rm -rf /usr/local/share/chrom*;
rm -rf /usr/share/swift;
rm -rf /usr/local/julia*;
rm -rf /usr/local/lib/android;
rm -rf /opt/hostedtoolcache;
df -h;
"
}


################################################################################
# Info Functions
@@ -91,7 +142,7 @@ print_gpu_info () {

(lspci -v | grep -e 'controller.*NVIDIA') || true

if [[ "${ENFORCE_CUDA_DEVICE}" ]]; then
if [[ "${ENFORCE_CUDA_DEVICE}" == '1' ]]; then
# Ensure that nvidia-smi is available and returns GPU entries
if ! nvidia-smi; then
echo "[CHECK] NVIDIA drivers and CUDA device are required for this workflow, but does not appear to be installed or available!"
1 change: 0 additions & 1 deletion .github/workflows/fbgemm_gpu_ci_genai.yml
@@ -127,7 +127,6 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
# runs-on: linux.4xlarge.nvidia.gpu
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: ${{ matrix.host-machine.instance }}
defaults:
199 changes: 199 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -0,0 +1,199 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# This workflow runs FBGEMM_GPU-GenAI CI, and is meant for copies of the
# FBGEMM repos hosted outside of the pytorch org.
name: FBGEMM_GPU-GenAI CI (Generic Runner)

on:
# PR Trigger
#
pull_request:
branches:
- main

# Push Trigger (enable to catch errors coming out of multiple merges)
#
push:
branches:
- main

# Manual Trigger
#
workflow_dispatch:

concurrency:
# Cancel previous runs in the PR if a new commit is pushed
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root --privileged --pid=host
volumes:
- /var/run/docker.sock:/var/run/docker.sock
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
BUILD_VARIANT: genai
continue-on-error: true
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
compiler: [ "gcc", "clang" ]

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Free Disk Space on Host
run: . $PRELUDE; free_disk_space_on_host

- name: Display System Info
run: . $PRELUDE; print_system_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install C/C++ Compilers
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV ${{ matrix.compiler }}

- name: Install Build Tools
run: . $PRELUDE; install_build_tools $BUILD_ENV

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

# Install via PIP to avoid defaulting to the CPU variant if the GPU variant of the day is not ready
- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

- name: Install cuDNN
run: . $PRELUDE; install_cudnn $BUILD_ENV "$(pwd)/build_only/cudnn" ${{ matrix.cuda-version }}

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Build FBGEMM_GPU Wheel
run: . $PRELUDE; cd fbgemm_gpu; build_fbgemm_gpu_package $BUILD_ENV nightly genai

- name: Upload Built Wheel as GHA Artifact
# Cannot upgrade to actions/upload-artifact@v4 yet because GLIBC on the instance is too old
uses: actions/upload-artifact@v3
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl
path: fbgemm_gpu/dist/*.whl
if-no-files-found: error

# Download the built artifact from GHA and test it on the CPU-only runner
test_artifact:
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
options: --user root --privileged --pid=host
volumes:
- /var/run/docker.sock:/var/run/docker.sock
defaults:
run:
shell: bash
env:
PRELUDE: .github/scripts/setup_env.bash
BUILD_ENV: build_binary
BUILD_VARIANT: genai
ENFORCE_CUDA_DEVICE: 0
CUDA_VISIBLE_DEVICES: -1
ADD_LIBCUDA_SYMLINK: 1
strategy:
fail-fast: false
matrix:
host-machine: [
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12" ]
cuda-version: [ "11.8.0", "12.1.1", "12.4.1" ]
# Specify exactly ONE CUDA version for artifact publish
cuda-version-publish: [ "12.1.1" ]
compiler: [ "gcc", "clang" ]
needs: build_artifact

steps:
- name: Setup Build Container
run: yum update -y; yum install -y binutils findutils git pciutils sudo tar wget which

- name: Checkout the Repository
uses: actions/checkout@v4
with:
submodules: true

- name: Free Disk Space on Host
run: . $PRELUDE; free_disk_space_on_host

- name: Download Wheel Artifact from GHA
# Cannot upgrade to actions/download-artifact@v4 yet because GLIBC on the instance is too old
uses: actions/download-artifact@v3
with:
name: fbgemm_gpu_nightly_genai_${{ matrix.host-machine.arch }}_${{ matrix.compiler }}_py${{ matrix.python-version }}_cu${{ matrix.cuda-version }}.whl

- name: Display System Info
run: . $PRELUDE; print_system_info; print_ec2_info

- name: Display GPU Info
run: . $PRELUDE; print_gpu_info

- name: Setup Miniconda
run: . $PRELUDE; setup_miniconda $HOME/miniconda

- name: Create Conda Environment
run: . $PRELUDE; create_conda_environment $BUILD_ENV ${{ matrix.python-version }}

- name: Install C/C++ Compilers for Updated LIBGCC
run: . $PRELUDE; install_cxx_compiler $BUILD_ENV clang

- name: Install CUDA
run: . $PRELUDE; install_cuda $BUILD_ENV ${{ matrix.cuda-version }}

- name: Install PyTorch Nightly
run: . $PRELUDE; install_pytorch_pip $BUILD_ENV nightly cuda/${{ matrix.cuda-version }}

- name: Collect PyTorch Environment Info
if: ${{ success() || failure() }}
run: if . $PRELUDE && which conda; then collect_pytorch_env_info $BUILD_ENV; fi

- name: Prepare FBGEMM_GPU Build
run: . $PRELUDE; cd fbgemm_gpu; prepare_fbgemm_gpu_build $BUILD_ENV

- name: Install FBGEMM_GPU Wheel
run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 30
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
31 changes: 13 additions & 18 deletions fbgemm_gpu/codegen/genscript/optimizers.py
@@ -1047,27 +1047,22 @@ def ensemble_rowwise_adagrad() -> Dict[str, Any]:
momentum2[idx] = new_sum_square_grads;
multiplier = learning_rate / (sqrtf(new_sum_square_grads) + eps);
coef_ema = fabs(momentum);
coef_ema = (row_counter[idx] > step_start) ? (momentum*1.0) : 0.0;
if (step_mode == 1) {
// row_counter[idx] records the number of appearances of this row
// row_counter[idx] tracks the number of appearances of this ID
row_counter[idx] += 1.0;
should_ema = floorf(row_counter[idx] / step_ema) - floorf((row_counter[idx]-1.0) / step_ema);
should_swap = floorf(row_counter[idx] / step_swap) - floorf((row_counter[idx]-1.0) / step_swap);
} else if (step_mode == 2) {
// row_counter[idx] records the step of last ema; prev_iter[idx] records the step of last swap
if (momentum > 0) {
should_ema = floorf(iter*1.0 / step_ema) - floorf(row_counter[idx] / step_ema);
should_swap = floorf(iter*1.0 / step_swap) - floorf(prev_iter[idx] / step_swap);
coef_ema = (should_ema > 0.5) ? powf(coef_ema, should_ema) : coef_ema;
} else {
should_ema = floorf((iter*1.0 - row_counter[idx]) / step_ema);
should_swap = floorf((iter*1.0 - prev_iter[idx]) / step_swap);
coef_ema = (should_ema > 0.5) ? powf(coef_ema, (iter*1.0 - row_counter[idx]) / step_ema) : coef_ema;
}
should_ema = floorf((iter*1.0 - row_counter[idx]) / step_ema);
should_swap = floorf((iter*1.0 - prev_iter[idx]) / step_swap);
// row_counter[idx] records the step of last ema
if (should_ema > 0.5) {
coef_ema = powf(coef_ema, (iter*1.0 - row_counter[idx]) / step_ema);
row_counter[idx] = iter*1.0;
}
if (iter*1.0 > step_start && should_swap > 0.5) {
// prev_iter[idx] records the step of last swap
if (should_swap > 0.5) {
prev_iter[idx] = iter*1.0;
}
} else {
@@ -1089,14 +1084,14 @@ def ensemble_rowwise_adagrad() -> Dict[str, Any]:
if (should_ema > 0.5) { // slow table ema
Vec4T<momentum1_ph_t> m_t(&momentum1[idx * D + d]);
m_t.acc.x = (1.0 - coef_ema) * weight_new.acc.x + coef_ema * m_t.acc.x + (fabs(momentum) - coef_ema) * multiplier * grad.acc.x;
m_t.acc.y = (1.0 - coef_ema) * weight_new.acc.y + coef_ema * m_t.acc.y + (fabs(momentum) - coef_ema) * multiplier * grad.acc.y;
m_t.acc.z = (1.0 - coef_ema) * weight_new.acc.z + coef_ema * m_t.acc.z + (fabs(momentum) - coef_ema) * multiplier * grad.acc.z;
m_t.acc.w = (1.0 - coef_ema) * weight_new.acc.w + coef_ema * m_t.acc.w + (fabs(momentum) - coef_ema) * multiplier * grad.acc.w;
m_t.acc.x = (1.0 - coef_ema) * weight_new.acc.x + coef_ema * m_t.acc.x + (momentum - coef_ema) * multiplier * grad.acc.x;
m_t.acc.y = (1.0 - coef_ema) * weight_new.acc.y + coef_ema * m_t.acc.y + (momentum - coef_ema) * multiplier * grad.acc.y;
m_t.acc.z = (1.0 - coef_ema) * weight_new.acc.z + coef_ema * m_t.acc.z + (momentum - coef_ema) * multiplier * grad.acc.z;
m_t.acc.w = (1.0 - coef_ema) * weight_new.acc.w + coef_ema * m_t.acc.w + (momentum - coef_ema) * multiplier * grad.acc.w;
m_t.store(&momentum1[idx * D + d]);
}
if (iter*1.0 > step_start && should_swap > 0.5) { // slow-to-fast swap
if (should_swap > 0.5) { // slow-to-fast swap
Vec4T<momentum1_ph_t> m_t(&momentum1[idx * D + d]);
weight_new.acc.x = m_t.acc.x * 1.0;
weight_new.acc.y = m_t.acc.y * 1.0;
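For readers following the step_mode == 2 branch above: the floor-based arithmetic decides whether an EMA update or a slow-to-fast swap fires on the current step, and raises the EMA coefficient to the power of the elapsed interval count so that rarely seen rows catch up. A rough Python sketch of that scheduling (variable names mirror the template; this is an illustration, not a port of the generated kernel):

    import math

    def ema_swap_schedule(iter_, row_counter, prev_iter, step_ema, step_swap, coef_ema):
        # row_counter: step at which the last EMA update fired for this row
        # prev_iter:   step at which the last slow-to-fast swap fired
        should_ema = math.floor((iter_ - row_counter) / step_ema)
        should_swap = math.floor((iter_ - prev_iter) / step_swap)

        if should_ema > 0.5:
            # Raise the coefficient to the number of elapsed EMA intervals,
            # compensating for updates skipped while the row was not seen.
            coef_ema = coef_ema ** ((iter_ - row_counter) / step_ema)
            row_counter = float(iter_)
        if should_swap > 0.5:
            prev_iter = float(iter_)

        return should_ema, should_swap, coef_ema, row_counter, prev_iter

The per-element update further down then blends the new weights into the slow (momentum1) table using this coefficient, and on a swap step copies the slow table back into the fast weights.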
@@ -1,6 +1,5 @@
Table Batched Embedding (TBE) Operators
=======================================

.. automodule:: fbgemm_gpu

.. autofunction:: fbgemm_gpu.split_table_batched_embeddings_ops.SplitTableBatchedEmbeddingBagsCodegen
.. autoclass:: fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen
:members:
4 changes: 2 additions & 2 deletions fbgemm_gpu/experimental/gen_ai/test/attention/gqa_test.py
@@ -137,7 +137,7 @@ def gqa_reference(

class Int4GQATest(unittest.TestCase):
@unittest.skipIf(
not torch.version.cuda or torch.cuda.get_device_capability()[0] < 8,
not torch.cuda.is_available() or torch.cuda.get_device_capability()[0] < 8,
"Skip when CUDA is not available or CUDA compute capability is less than 8",
)
@settings(verbosity=VERBOSITY, max_examples=40, deadline=None)
@@ -243,7 +243,7 @@ def test_gqa(
)
# pyre-fixme[56]
@unittest.skipIf(
not torch.version.cuda or not HAS_XFORMERS,
not torch.cuda.is_available() or not HAS_XFORMERS,
"Skip when CUDA is not available or xformers is not available",
)
def test_mqa_main( # noqa C901
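The skipIf changes above replace a build-time check with a runtime one. A small illustration of the difference, using standard PyTorch attributes (shown only as a sketch):

    import torch

    # torch.version.cuda reports the CUDA toolkit the wheel was built against;
    # it is a non-None string even on a machine without any GPU attached.
    built_with_cuda = torch.version.cuda is not None

    # torch.cuda.is_available() additionally requires a usable driver and device
    # at runtime, which is what the test needs before calling
    # torch.cuda.get_device_capability().
    has_usable_gpu = torch.cuda.is_available()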