diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu-python-api/quantize_ops.rst b/fbgemm_gpu/docs/src/fbgemm_gpu-python-api/quantize_ops.rst new file mode 100644 index 000000000..df2a6c2d7 --- /dev/null +++ b/fbgemm_gpu/docs/src/fbgemm_gpu-python-api/quantize_ops.rst @@ -0,0 +1,6 @@ +Quantization Operators +====================== + +.. automodule:: fbgemm_gpu + +.. autofunction:: torch.ops.fbgemm.FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf diff --git a/fbgemm_gpu/docs/src/index.rst b/fbgemm_gpu/docs/src/index.rst index ba0d8ba6b..1669bf22f 100644 --- a/fbgemm_gpu/docs/src/index.rst +++ b/fbgemm_gpu/docs/src/index.rst @@ -91,6 +91,7 @@ Table of Contents fbgemm_gpu-python-api/jagged_tensor_ops.rst fbgemm_gpu-python-api/pooled_embedding_ops.rst + fbgemm_gpu-python-api/quantize_ops.rst .. _fbgemm-gpu.toc.api.python.modules: diff --git a/fbgemm_gpu/fbgemm_gpu/docs/__init__.py b/fbgemm_gpu/fbgemm_gpu/docs/__init__.py index 4b621cbe3..e531e1254 100644 --- a/fbgemm_gpu/fbgemm_gpu/docs/__init__.py +++ b/fbgemm_gpu/fbgemm_gpu/docs/__init__.py @@ -11,6 +11,7 @@ jagged_tensor_ops, merge_pooled_embedding_ops, permute_pooled_embedding_ops, + quantize_ops, ) except Exception: pass diff --git a/fbgemm_gpu/fbgemm_gpu/docs/quantize_ops.py b/fbgemm_gpu/fbgemm_gpu/docs/quantize_ops.py new file mode 100644 index 000000000..3662b12c7 --- /dev/null +++ b/fbgemm_gpu/fbgemm_gpu/docs/quantize_ops.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from .common import add_docs + +add_docs( + torch.ops.fbgemm.FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf, + """ +FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf(input, bit_rate) -> Tensor + +Convert FP32/16 to INT8/4/2 using rowwise quantization. + +Args: + input (Tensor): An input tensor. 
Must be either FP32 (`torch.float`)
    or FP16 (`torch.half`) and must have 2 dimensions.
+
+    bit_rate (int): Quantized bit rate (2 for INT2, 4 for INT4, or 8 for
+        INT8).
+
+Returns:
+    Quantized output (Tensor). Data type is `torch.uint8` (byte type).
+
+**Example:**
+
+    >>> # Randomize input
+    >>> input = torch.randn(2, 4, dtype=torch.float32, device="cuda")
+    >>> print(input)
+    tensor([[ 0.8247, 0.0031, -1.0068, -1.2081],
+    [ 0.5427, 1.5772, 1.0291, -0.7626]], device='cuda:0')
+    >>> # Quantize
+    >>> output = torch.ops.fbgemm.FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf(input, bit_rate=4)
+    >>> print(output)
+    tensor([[159, 1, 86, 48, 213, 188],
+    [248, 11, 254, 48, 26, 186]], device='cuda:0', dtype=torch.uint8)
+    """,
+)