From 2ab9b619a028657f7dba17ca3de89a3f6c5ee95f Mon Sep 17 00:00:00 2001
From: pytorchbot <pytorchbot@pytorch.com>
Date: Sun, 19 May 2024 11:34:47 +0000
Subject: [PATCH] 2024-05-19 nightly release
 (37c283ccb0ed51555994f743fa2a3a188f5a479f)

---
 .../gen_ai/src/quantize/cutlass_extensions.cu |  8 +++++++
 .../gen_ai/test/quantize/quantize_test.py     |  4 ++++
 fbgemm_gpu/fbgemm_gpu/__init__.py             | 21 +++++++++++++++----
 fbgemm_gpu/fbgemm_gpu/docs/__init__.py        |  5 ++++-
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu
index d973d28ac..ef44aedb1 100644
--- a/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu
+++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions.cu
@@ -1896,6 +1896,14 @@ at::Tensor f8f8bf16_cublas(
       "CUDA version is older than 12.0"); // requires CUDA>=12
 }
 at::Tensor f8f8bf16(
+    at::Tensor XQ, // FP8
+    at::Tensor WQ, // FP8
+    at::Tensor scale,
+    bool use_fast_accum) {
+  throw std::runtime_error(
+      "CUDA version is older than 12.0"); // requires CUDA>=12
+}
+at::Tensor f8f8bf16_tensorwise(
     at::Tensor XQ, // FP8
     at::Tensor WQ, // FP8
     double scale,
diff --git a/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py b/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py
index 627454336..8f0235f5d 100644
--- a/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py
+++ b/fbgemm_gpu/experimental/gen_ai/test/quantize/quantize_test.py
@@ -259,3 +259,7 @@ def test_quantize_fp8_per_tensor_with_ub(
 
         zq_ref = (x @ w.T).to(torch.bfloat16)
         torch.testing.assert_close(zq, zq_ref, atol=1.0e-3, rtol=1.0e-3)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py
index 555c8574e..1d913d28a 100644
--- a/fbgemm_gpu/fbgemm_gpu/__init__.py
+++ b/fbgemm_gpu/fbgemm_gpu/__init__.py
@@ -11,8 +11,18 @@
 
 try:
     torch.ops.load_library(os.path.join(os.path.dirname(__file__), "fbgemm_gpu_py.so"))
-except Exception as e:
-    print(e)
+except Exception as error_ranking:
+    try:
+        torch.ops.load_library(
+            os.path.join(
+                os.path.dirname(__file__),
+                "experimental/gen_ai/fbgemm_gpu_experimental_gen_ai_py.so",
+            )
+        )
+    except Exception as error_gen_ai:
+        # Neither the ranking nor the gen_ai .so could be loaded; print both errors
+        print(error_ranking)
+        print(error_gen_ai)
 
 # Since __init__.py is only used in OSS context, we define `open_source` here
 # and use its existence to determine whether or not we are in OSS context
@@ -24,5 +34,8 @@
 # Export the version string from the version file auto-generated by setup.py
 from fbgemm_gpu.docs.version import __version__  # noqa: F401, E402
 
-# Trigger meta operator registrations
-from . import sparse_ops  # noqa: F401, E402
+try:
+    # Trigger meta operator registrations (best-effort: skipped if the import fails)
+    from . import sparse_ops  # noqa: F401, E402
+except Exception:
+    pass
diff --git a/fbgemm_gpu/fbgemm_gpu/docs/__init__.py b/fbgemm_gpu/fbgemm_gpu/docs/__init__.py
index 05551ca6d..250f9d58e 100644
--- a/fbgemm_gpu/fbgemm_gpu/docs/__init__.py
+++ b/fbgemm_gpu/fbgemm_gpu/docs/__init__.py
@@ -6,4 +6,7 @@
 # LICENSE file in the root directory of this source tree.
 
 # Trigger the manual addition of docstrings to pybind11-generated operators
-from . import jagged_tensor_ops, table_batched_embedding_ops  # noqa: F401
+try:
+    from . import jagged_tensor_ops, table_batched_embedding_ops  # noqa: F401
+except Exception:
+    pass