2024-09-26 nightly release (b152339)
pytorchbot committed Sep 26, 2024
1 parent d670aab commit 47175dc
Showing 26 changed files with 379 additions and 105 deletions.
3 changes: 3 additions & 0 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -416,6 +416,9 @@ __print_library_infos () {

echo "[CHECK] Listing out external shared libraries linked:"
print_exec ldd "${library}"

echo "[CHECK] Displaying ELF information:"
print_exec readelf -d "${library}"
echo "################################################################################"
echo ""
echo ""
23 changes: 23 additions & 0 deletions .github/scripts/fbgemm_gpu_postbuild.bash
@@ -0,0 +1,23 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

echo "################################################################################"
echo "[CMAKE] Running post-build script ..."

# Print directory
pwd

# List all generated .SO files
find . -name '*.so'

# Remove errant RPATHs from the .SO
# https://github.com/pytorch/FBGEMM/issues/3098
# https://github.com/NixOS/patchelf/issues/453
find . -name '*.so' -print0 | xargs -0 patchelf --remove-rpath

echo "[CMAKE] Removed errant RPATHs"
echo "################################################################################"
1 change: 1 addition & 0 deletions .github/scripts/utils_build.bash
@@ -297,6 +297,7 @@ install_build_tools () {
ncurses \
ninja \
openblas \
patchelf \
scikit-build \
wheel) || return 1

2 changes: 2 additions & 0 deletions .github/workflows/build_wheels_linux_aarch64.yml
@@ -24,6 +24,7 @@ permissions:

jobs:
generate-matrix:
if: ${{ github.repository_owner == 'pytorch' }}
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
@@ -32,6 +33,7 @@ jobs:
test-infra-ref: main
with-cuda: disable
build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: generate-matrix
strategy:
fail-fast: false
2 changes: 2 additions & 0 deletions .github/workflows/build_wheels_linux_x86.yml
@@ -24,6 +24,7 @@ permissions:

jobs:
generate-matrix:
if: ${{ github.repository_owner == 'pytorch' }}
uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@main
with:
package-type: wheel
@@ -34,6 +35,7 @@ jobs:
with-rocm: enable
with-cpu: enable
build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: generate-matrix
name: pytorch/FBGEMM
uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@main
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_ci.yml
@@ -20,7 +20,7 @@ concurrency:

jobs:
build-linux:
runs-on: ${{ matrix.host-machine.instance }}
runs-on: ${{ github.repository_owner == 'pytorch' && matrix.host-machine.instance || 'ubuntu-latest' }}
container:
image: amazonlinux:2023
options: --user root
@@ -105,7 +105,7 @@ jobs:
build-bazel:
runs-on: linux.12xlarge
runs-on: ${{ github.repository_owner == 'pytorch' && matrix.host-machine.instance || 'ubuntu-latest' }}
container:
image: amazonlinux:2023
options: --user root
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_cpu.yml
@@ -47,6 +47,7 @@ concurrency:
jobs:
# Build on CPU hosts, run tests, and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -118,6 +119,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -46,6 +46,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -127,6 +128,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
# runs-on: linux.4xlarge.nvidia.gpu
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: ${{ matrix.host-machine.instance }}
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_genai.yml
@@ -46,6 +46,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -127,6 +128,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
# Use available instance types - https://github.com/pytorch/test-infra/blob/main/.github/scale-config.yml
runs-on: ${{ matrix.host-machine.instance }}
defaults:
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
@@ -32,6 +32,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner != 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -116,6 +117,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_artifact:
if: ${{ github.repository_owner != 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_ci_rocm.yml
@@ -46,6 +46,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: ${{ matrix.container-image }}
@@ -125,6 +126,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
1 change: 1 addition & 0 deletions .github/workflows/fbgemm_gpu_docs.yml
@@ -24,6 +24,7 @@ on:

jobs:
build-docs:
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
# Grant write permission here so that the generated docs can be pushed to `gh-pages` branch
contents: write
6 changes: 3 additions & 3 deletions .github/workflows/fbgemm_gpu_pip.yml
@@ -45,7 +45,7 @@ on:

jobs:
test_pypi_install_cpu:
if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cpu') }}
if: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cpu')) }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -104,7 +104,7 @@ jobs:


test_pypi_install_cuda:
if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cuda') }}
if: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'cuda')) }}
runs-on: ${{ matrix.host-machine.instance }}
defaults:
run:
@@ -165,7 +165,7 @@ jobs:


test_pypi_install_rocm:
if: ${{ github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'rocm') }}
if: ${{ github.repository_owner == 'pytorch' && (github.event_name == 'schedule' || (github.event_name == 'workflow_dispatch' && github.event.inputs.fbgemm_gpu_variant_type == 'rocm')) }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: "rocm/dev-ubuntu-20.04:${{ matrix.rocm-version }}-complete"
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_release_cpu.yml
@@ -44,6 +44,7 @@ concurrency:
jobs:
# Build on CPU hosts, run tests, and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -114,6 +115,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_release_cuda.yml
@@ -50,6 +50,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -126,6 +127,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
defaults:
run:
2 changes: 2 additions & 0 deletions .github/workflows/fbgemm_gpu_release_genai.yml
@@ -50,6 +50,7 @@ concurrency:
jobs:
# Build on CPU hosts and upload to GHA
build_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
container:
image: amazonlinux:2023
@@ -126,6 +127,7 @@ jobs:

# Download the built artifact from GHA, test on GPU, and push to PyPI
test_and_publish_artifact:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ${{ matrix.host-machine.instance }}
defaults:
run:
6 changes: 6 additions & 0 deletions fbgemm_gpu/FbgemmGpu.cmake
@@ -725,3 +725,9 @@ install(FILES ${gen_python_source_files}

install(FILES ${gen_defused_optim_py_files}
DESTINATION fbgemm_gpu/split_embedding_optimizer_codegen)

add_custom_target(fbgemm_gpu_py_clean_rpath ALL
WORKING_DIRECTORY ${OUTPUT_DIR}
COMMAND bash ${FBGEMM}/.github/scripts/fbgemm_gpu_postbuild.bash)

add_dependencies(fbgemm_gpu_py_clean_rpath fbgemm_gpu_py)
fbgemm_gpu/docs/src/fbgemm_gpu-python-api/jagged_tensor_ops.rst
@@ -1,6 +1,8 @@
Jagged Tensor Operators
=======================

.. automodule:: fbgemm_gpu

.. autofunction:: torch.ops.fbgemm.jagged_2d_to_dense

.. autofunction:: torch.ops.fbgemm.jagged_1d_to_dense
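A rough usage sketch for the jagged operators documented here (shapes are made up; it assumes FBGEMM-GPU is installed so that `import fbgemm_gpu` registers the `torch.ops.fbgemm` namespace, and uses the `(values, offsets, max_sequence_length)` calling convention):

```python
import torch
import fbgemm_gpu  # noqa: F401  # registers the torch.ops.fbgemm operators

# Two "bags" of row vectors, lengths 2 and 3, embedding dimension 4.
values = torch.randn(5, 4)
offsets = torch.tensor([0, 2, 5])

# Pad each bag out to 3 rows; result is (num_bags, max_sequence_length, dim).
dense = torch.ops.fbgemm.jagged_2d_to_dense(values, offsets, 3)
print(dense.shape)  # expected: torch.Size([2, 3, 4])
```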
fbgemm_gpu/docs/src/fbgemm_gpu-python-api/pooled_embedding_ops.rst
@@ -0,0 +1,6 @@
Pooled Embedding Operators
==========================

.. automodule:: fbgemm_gpu

.. autofunction:: torch.ops.fbgemm.merge_pooled_embeddings
fbgemm_gpu/docs/src/fbgemm_gpu-python-api/table_batched_embedding_ops.rst
@@ -1,5 +1,10 @@
Table Batched Embedding (TBE) Operators
=======================================
Table Batched Embedding (TBE) Training Module
=============================================

.. autoclass:: fbgemm_gpu.split_table_batched_embeddings_ops_training.SplitTableBatchedEmbeddingBagsCodegen
:members:
:members: forward,
split_embedding_weights,
split_optimizer_states,
set_learning_rate,
update_hyper_parameters,
set_optimizer_step
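A minimal construction sketch for the class documented above, exercising `forward` and `split_embedding_weights`. It assumes a CUDA build; the import locations of `EmbeddingLocation` and `ComputeDevice` are an assumption rather than something stated in this commit:

```python
import torch
from fbgemm_gpu.split_table_batched_embeddings_ops_common import EmbeddingLocation
from fbgemm_gpu.split_table_batched_embeddings_ops_training import (
    ComputeDevice,
    SplitTableBatchedEmbeddingBagsCodegen,
)

# Two tables batched into one module: (rows, embedding dim, placement, compute device).
emb = SplitTableBatchedEmbeddingBagsCodegen(
    embedding_specs=[
        (1000, 64, EmbeddingLocation.DEVICE, ComputeDevice.CUDA),
        (2000, 64, EmbeddingLocation.DEVICE, ComputeDevice.CUDA),
    ],
)

# One lookup per table for a batch of size 1; offsets has T * B + 1 entries.
indices = torch.tensor([3, 7], dtype=torch.long, device="cuda")
offsets = torch.tensor([0, 1, 2], dtype=torch.long, device="cuda")
out = emb(indices, offsets)              # the documented forward()
print(out.shape)                         # expected: torch.Size([1, 128])
weights = emb.split_embedding_weights()  # one weight tensor per table
```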
1 change: 1 addition & 0 deletions fbgemm_gpu/docs/src/index.rst
@@ -91,3 +91,4 @@ Table of Contents

fbgemm_gpu-python-api/table_batched_embedding_ops.rst
fbgemm_gpu-python-api/jagged_tensor_ops.rst
fbgemm_gpu-python-api/pooled_embedding_ops.rst
7 changes: 7 additions & 0 deletions fbgemm_gpu/experimental/example/CMakeLists.txt
@@ -70,3 +70,10 @@ install(TARGETS fbgemm_gpu_experimental_example_py

install(FILES ${experimental_example_python_source_files}
DESTINATION fbgemm_gpu/experimental/example)

add_custom_target(fbgemm_gpu_experimental_example_py_clean_rpath ALL
WORKING_DIRECTORY ${OUTPUT_DIR}
COMMAND bash ${FBGEMM}/.github/scripts/fbgemm_gpu_postbuild.bash)

add_dependencies(fbgemm_gpu_experimental_example_py_clean_rpath
fbgemm_gpu_experimental_example_py)
7 changes: 7 additions & 0 deletions fbgemm_gpu/experimental/gen_ai/CMakeLists.txt
@@ -154,3 +154,10 @@ install(TARGETS fbgemm_gpu_experimental_gen_ai_py

install(FILES ${experimental_gen_ai_python_source_files}
DESTINATION fbgemm_gpu/experimental/gen_ai)

add_custom_target(fbgemm_gpu_experimental_gen_ai_py_clean_rpath ALL
WORKING_DIRECTORY ${OUTPUT_DIR}
COMMAND bash ${FBGEMM}/.github/scripts/fbgemm_gpu_postbuild.bash)

add_dependencies(fbgemm_gpu_experimental_gen_ai_py_clean_rpath
fbgemm_gpu_experimental_gen_ai_py)
2 changes: 1 addition & 1 deletion fbgemm_gpu/fbgemm_gpu/docs/__init__.py
@@ -7,6 +7,6 @@

# Trigger the manual addition of docstrings to pybind11-generated operators
try:
from . import jagged_tensor_ops, table_batched_embedding_ops # noqa: F401
from . import jagged_tensor_ops, merge_pooled_embedding_ops # noqa: F401
except Exception:
pass
36 changes: 36 additions & 0 deletions fbgemm_gpu/fbgemm_gpu/docs/merge_pooled_embedding_ops.py
@@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import torch

from .common import add_docs

add_docs(
torch.ops.fbgemm.merge_pooled_embeddings,
"""
merge_pooled_embeddings(pooled_embeddings, uncat_dim_size, target_device, cat_dim=1) -> Tensor
Concatenate embedding outputs from different devices (on the same host)
onto the target device.
Args:
pooled_embeddings (List[Tensor]): A list of embedding outputs from
different devices on the same host. Each output has 2
dimensions.
uncat_dim_size (int): The size of the dimension that is not
concatenated, i.e., if `cat_dim=0`, `uncat_dim_size` is the size
of dim 1 and vice versa.
target_device (torch.device): The target device that aggregates all
the embedding outputs.
cat_dim (int = 1): The dimension along which the tensors are concatenated.
Returns:
The concatenated embedding output (2D) on the target device
""",
)
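A short usage sketch matching the signature documented above (hypothetical shapes; assumes at least two visible CUDA devices, which is the multi-device setting this operator targets):

```python
import torch
import fbgemm_gpu  # noqa: F401  # registers torch.ops.fbgemm.merge_pooled_embeddings

# One 2D pooled-embedding output per device; dim 0 (batch) is the un-concatenated dim.
pooled = [
    torch.randn(8, 16, device="cuda:0"),
    torch.randn(8, 32, device="cuda:1"),
]

merged = torch.ops.fbgemm.merge_pooled_embeddings(
    pooled,                  # pooled_embeddings
    8,                       # uncat_dim_size: size of dim 0, since cat_dim=1
    torch.device("cuda:0"),  # target_device
    1,                       # cat_dim
)
print(merged.shape)  # expected: torch.Size([8, 48])
```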