From 55f4c781ba7f8b665b0f23e600f98a656c8fa1eb Mon Sep 17 00:00:00 2001 From: Josh Fromm Date: Thu, 19 Sep 2024 16:02:37 -0700 Subject: [PATCH] Switch default MX4 rounding mode to Even (#3111) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3111 X-link: https://github.com/facebookresearch/FBGEMM/pull/198 Even rounding is a bit faster and more accurate than ceil rounding. This diff switches the default to it. Reviewed By: jspark1105 Differential Revision: D62466094 fbshipit-source-id: 9e80c49f536332ae65c665df7b325cecdbfef92b --- fbgemm_gpu/fbgemm_gpu/quantize/quantize_ops.py | 2 +- fbgemm_gpu/fbgemm_gpu/quantize_comm.py | 4 ++-- fbgemm_gpu/fbgemm_gpu/quantize_utils.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fbgemm_gpu/fbgemm_gpu/quantize/quantize_ops.py b/fbgemm_gpu/fbgemm_gpu/quantize/quantize_ops.py index 890f9d331..3da107b61 100644 --- a/fbgemm_gpu/fbgemm_gpu/quantize/quantize_ops.py +++ b/fbgemm_gpu/fbgemm_gpu/quantize/quantize_ops.py @@ -19,7 +19,7 @@ def quantize_mx( elem_mbits: int = 3, elem_max_norm: float = 6.0, mx_group_size: int = 32, - rounding_mode: Union[RoundingMode, int] = RoundingMode.ceil, + rounding_mode: Union[RoundingMode, int] = RoundingMode.even, ) -> torch.Tensor: """ Registered quantize_mx ops for E2E comm. 
diff --git a/fbgemm_gpu/fbgemm_gpu/quantize_comm.py b/fbgemm_gpu/fbgemm_gpu/quantize_comm.py index ca2ebd02d..e699a4f15 100644 --- a/fbgemm_gpu/fbgemm_gpu/quantize_comm.py +++ b/fbgemm_gpu/fbgemm_gpu/quantize_comm.py @@ -66,7 +66,7 @@ class QuantizationContext: row_dim: int = ROW_DIM_DEFAULT row_dim_quant: int = -1 mx_group_size: int = MX_GROUP_SIZE_DEFAULT - rounding_mode: RoundingMode = RoundingMode.ceil + rounding_mode: RoundingMode = RoundingMode.even padded_dim_sum_per_rank: Optional[List[int]] = None @@ -110,7 +110,7 @@ def _quantize_tensor( return input_quant_all2all elif comm_precision == SparseType.MX4: mx_group_size = ctx.mx_group_size if ctx is not None else MX_GROUP_SIZE_DEFAULT - rounding_mode = ctx.rounding_mode if ctx is not None else RoundingMode.ceil + rounding_mode = ctx.rounding_mode if ctx is not None else RoundingMode.even return fp32_to_mx4( input_tensor, mx_group_size, rounding_mode=rounding_mode ).view(-1) diff --git a/fbgemm_gpu/fbgemm_gpu/quantize_utils.py b/fbgemm_gpu/fbgemm_gpu/quantize_utils.py index a0020f8b2..7646edf4e 100644 --- a/fbgemm_gpu/fbgemm_gpu/quantize_utils.py +++ b/fbgemm_gpu/fbgemm_gpu/quantize_utils.py @@ -36,7 +36,7 @@ def fp32_to_mx4( group_size: int = 32, ebits: int = 2, mbits: int = 1, - rounding_mode: Optional[Union[RoundingMode, int]] = RoundingMode.ceil, + rounding_mode: Optional[Union[RoundingMode, int]] = RoundingMode.even, stochastic_casting: bool = False, use_triton: bool = True, ) -> torch.Tensor: @@ -58,7 +58,7 @@ def fp32_to_mx4( # Accelerated MX4 is only available on cuda, if input is on cpu, use python. # Operate on flattened input. if rounding_mode is None: - rounding_mode = RoundingMode.ceil + rounding_mode = RoundingMode.even if not tensor.is_cuda: return py_quantize_mx4(