From ea1f66c4e4dea207b297b83926083e34f647b359 Mon Sep 17 00:00:00 2001
From: Sarunya Pumma <sarunya@meta.com>
Date: Mon, 4 Mar 2024 21:11:43 -0800
Subject: [PATCH] Add BF16 output in TBE training (#2382)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2382

As title

Reviewed By: xing-liu

Differential Revision: D54389565

fbshipit-source-id: ab32f9d6a3e8faa597d08d6c9d1da3452e0829e6
---
 .../embedding_backward_split_grad_template.cu |  2 +-
 ...ding_backward_split_kernel_cta_template.cu |  2 +-
 ...ing_backward_split_kernel_warp_template.cu |  2 +-
 ...rward_split_kernel_nobag_small_template.cu |  2 +-
 ...embedding_forward_split_kernel_template.cu |  2 +-
 ...edding_forward_split_kernel_v2_template.cu |  7 ++-
 .../include/fbgemm_gpu/dispatch_macros.h      | 14 ++++++
 .../tbe/training/backward_adagrad_test.py     | 33 +++++++++++---
 .../test/tbe/training/backward_dense_test.py  | 44 ++++++++-----------
 .../test/tbe/training/backward_none_test.py   | 15 ++++++-
 .../test/tbe/training/backward_sgd_test.py    |  5 +++
 .../test/tbe/training/failures_dict_fast.json |  8 ++++
 fbgemm_gpu/test/tbe/training/forward_test.py  |  3 ++
 13 files changed, 99 insertions(+), 40 deletions(-)

diff --git a/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu
index 729063627..40247e43f 100644
--- a/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu
+++ b/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu
@@ -211,7 +211,7 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel(
 // Explicitly instantiate the template based on DISPATCH_EMB_GRAD_CACHE_TYPES
 ////////////////////////////////////////////////////////////////////////////////
 
-{% for grad_type in ['at::Half', 'float'] %}
+{% for grad_type in ['at::Half', 'float', 'at::BFloat16'] %}
 template __global__ __launch_bounds__(kMaxThreads)
 void grad_mean{{ vdesc }}_kernel
 <{{ grad_type }}> (
diff --git a/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu
index f7d617190..72e445d78 100644
--- a/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu
+++ b/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu
@@ -504,7 +504,7 @@ split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vdesc
 {%- endmacro %}
 
 {%- macro bulk_template_instantiations(kMaxVecsPerThread, kThreadGroupSize) %}
-    {%- for grad_type in ['float', 'at::Half'] %}
+    {%- for grad_type in ['float', 'at::Half', 'at::BFloat16'] %}
     {%- for emb_type in ['float', 'at::Half'] %}
     {%- for cache_type in ['float', 'at::Half'] %}
         {{ template_instantiation(emb_type, grad_type, cache_type, kMaxVecsPerThread, kThreadGroupSize) }}
diff --git a/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu b/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu
index 6f625459b..90a785e00 100644
--- a/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu
+++ b/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu
@@ -340,7 +340,7 @@ split_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vdesc
 {%- endmacro %}
 
 {%- macro bulk_template_instantiations(kMaxVecsPerThread, kThreadGroupSize) %}
-    {%- for grad_type in ['float', 'at::Half'] %}
+    {%- for grad_type in ['float', 'at::Half', 'at::BFloat16'] %}
     {%- for emb_type in ['float', 'at::Half'] %}
     {%- for cache_type in ['float', 'at::Half'] %}
         {{ template_instantiation(emb_type, grad_type, cache_type, kMaxVecsPerThread, kThreadGroupSize) }}
diff --git a/fbgemm_gpu/codegen/embedding_forward_split_kernel_nobag_small_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_kernel_nobag_small_template.cu
index 0b8b46e1d..a60fe16ae 100644
--- a/fbgemm_gpu/codegen/embedding_forward_split_kernel_nobag_small_template.cu
+++ b/fbgemm_gpu/codegen/embedding_forward_split_kernel_nobag_small_template.cu
@@ -193,7 +193,7 @@ batch_index_select_dim0_codegen_forward_small_kernel(
     embedding_forward_split_template.cu
 */
 
-{%- for output_type in ['float', 'at::Half'] %}
+{%- for output_type in ['float', 'at::Half', 'at::BFloat16'] %}
 {%- for emb_type in ['float', 'at::Half'] %}
 {%- for cache_type in ['float', 'at::Half'] %}
 {%- for kEmbeddingSize in [4, 8, 16, 32] %}
diff --git a/fbgemm_gpu/codegen/embedding_forward_split_kernel_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_kernel_template.cu
index e1097b179..9ae35bd9f 100644
--- a/fbgemm_gpu/codegen/embedding_forward_split_kernel_template.cu
+++ b/fbgemm_gpu/codegen/embedding_forward_split_kernel_template.cu
@@ -550,7 +550,7 @@ batch_index_select_dim0_codegen_forward_kernel
 {%- macro bulk_template_instantiations(use_cache, kMaxVecsPerThread, kThreadGroupSize) %}
     {%- for emb_type in ['float', 'at::Half'] %}
     {%- for cache_type in ['float', 'at::Half'] %}
-    {%- for output_type in ['float', 'at::Half'] %}
+    {%- for output_type in ['float', 'at::Half', 'at::BFloat16'] %}
         {{ template_instantiation(emb_type, cache_type, output_type, use_cache, kMaxVecsPerThread, kThreadGroupSize) }}
     {%- endfor %}
     {%- endfor %}
diff --git a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu
index ab129bcd1..c878b8d02 100644
--- a/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu
+++ b/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu
@@ -82,6 +82,11 @@ struct Vec4Type<at::Half> {
   using type = float2;
 };
 
+template <>
+struct Vec4Type<at::BFloat16> {
+  using type = float2;
+};
+
 template <>
 struct Vec4Type<uint8_t> {
   using type = uint8_t;
@@ -973,7 +978,7 @@ __global__ void split_embedding_codegen_forward_{{ wdesc }}_v2_kernel(
     embedding_forward_split_template.cu
 */
 
-{%- for output_type in ['float', 'at::Half'] %}
+{%- for output_type in ['float', 'at::Half', 'at::BFloat16'] %}
 {%- for emb_type in ['float', 'at::Half'] %}
 {%- for cache_type in ['float', 'at::Half'] %}
 {%- for use_cache in ['true', 'false'] %}
diff --git a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h
index 70261706d..21ba7ab68 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h
+++ b/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h
@@ -84,6 +84,13 @@
           float,                                                   \
           NAME,                                                    \
           __VA_ARGS__)                                             \
+      PRIVATE_CASE_TYPE_OUTPUT(                                    \
+          at::ScalarType::BFloat16,                                \
+          emb_type,                                                \
+          cache_type,                                              \
+          at::BFloat16,                                            \
+          NAME,                                                    \
+          __VA_ARGS__)                                             \
       default:                                                     \
         AT_ERROR(                                                  \
             #NAME,                                                 \
@@ -172,6 +179,13 @@
           at::ScalarType::Float, _cache_t, _emb_t, float, NAME, __VA_ARGS__)   \
       PRIVATE_CASE_TYPE_CACHE_EMB(                                             \
           at::ScalarType::Half, _cache_t, _emb_t, at::Half, NAME, __VA_ARGS__) \
+      PRIVATE_CASE_TYPE_CACHE_EMB(                                             \
+          at::ScalarType::BFloat16,                                            \
+          _cache_t,                                                            \
+          _emb_t,                                                              \
+          at::BFloat16,                                                        \
+          NAME,                                                                \
+          __VA_ARGS__)                                                         \
       default:                                                                 \
         AT_ERROR(                                                              \
             #NAME, " not implemented for grad_t '", toString(_grad_t), "'");   \
diff --git a/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py b/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py
index 498a47fa5..f9a81f674 100644
--- a/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py
+++ b/fbgemm_gpu/test/tbe/training/backward_adagrad_test.py
@@ -256,6 +256,11 @@ def execute_backward_adagrad_(  # noqa C901
                 for (b, x, xw) in zip(bs, xs, xws)
             ]
         )
+
+        # Cast output type to output_dtype
+        if weights_precision != output_dtype:
+            fs = [f.to(output_dtype.as_dtype()) for f in fs]
+
         gos = [torch.randn_like(f) for f in fs]
         [f.backward(go) for (f, go) in zip(fs, gos)]
         # do SGD update
@@ -505,7 +510,9 @@ def execute_backward_adagrad_(  # noqa C901
         use_cache=st.booleans(),
         cache_algorithm=st.sampled_from(CacheAlgorithm),
         use_cpu=use_cpu_strategy(),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
     )
     @settings(
         verbosity=VERBOSITY,
@@ -574,7 +581,9 @@ def test_backward_adagrad_fp16_pmSUM(  # noqa C901
         use_cache=st.booleans(),
         cache_algorithm=st.sampled_from(CacheAlgorithm),
         use_cpu=use_cpu_strategy(),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
         compile=st.booleans(),
     )
     @settings(
@@ -643,7 +652,9 @@ def test_backward_adagrad_fp16_pmMEAN(  # noqa C901
         use_cache=st.booleans(),
         cache_algorithm=st.sampled_from(CacheAlgorithm),
         use_cpu=use_cpu_strategy(),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
         compile=st.booleans(),
     )
     @settings(
@@ -708,7 +719,9 @@ def test_backward_adagrad_fp16_pmNONE(  # noqa C901
         use_cache=st.booleans(),
         cache_algorithm=st.sampled_from(CacheAlgorithm),
         use_cpu=use_cpu_strategy(),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
         compile=st.booleans(),
     )
     @settings(
@@ -777,7 +790,9 @@ def test_backward_adagrad_fp32_pmSUM(  # noqa C901
         use_cache=st.booleans(),
         cache_algorithm=st.sampled_from(CacheAlgorithm),
         use_cpu=use_cpu_strategy(),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
         compile=st.booleans(),
     )
     @settings(
@@ -846,7 +861,9 @@ def test_backward_adagrad_fp32_pmMEAN(  # noqa C901
         use_cache=st.booleans(),
         cache_algorithm=st.sampled_from(CacheAlgorithm),
         use_cpu=use_cpu_strategy(),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
         compile=st.booleans(),
     )
     @settings(
@@ -912,7 +929,9 @@ def test_backward_adagrad_fp32_pmNONE(  # noqa C901
         use_cache=st.booleans(),
         cache_algorithm=st.sampled_from(CacheAlgorithm),
         use_cpu=st.just(False),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
         max_norm=st.floats(min_value=0.01, max_value=1.0),
     )
     @settings(
diff --git a/fbgemm_gpu/test/tbe/training/backward_dense_test.py b/fbgemm_gpu/test/tbe/training/backward_dense_test.py
index fead574e4..484d9a37b 100644
--- a/fbgemm_gpu/test/tbe/training/backward_dense_test.py
+++ b/fbgemm_gpu/test/tbe/training/backward_dense_test.py
@@ -60,7 +60,9 @@ class BackwardDenseTest(unittest.TestCase):
             ]
         ),
         use_cpu=use_cpu_strategy(),
-        output_dtype=st.sampled_from([SparseType.FP32, SparseType.FP16]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP32, SparseType.FP16, SparseType.BF16]
+        ),
     )
     @settings(
         verbosity=VERBOSITY,
@@ -174,6 +176,11 @@ def test_backward_dense(  # noqa C901
                 for (b, x, xw) in zip(bs, xs, xws)
             ]
         )
+
+        # Cast output type to output_dtype
+        if weights_precision != output_dtype:
+            fs = [f.to(output_dtype.as_dtype()) for f in fs]
+
         gos = [torch.randn_like(f) for f in fs]
         [f.backward(go) for (f, go) in zip(fs, gos)]
 
@@ -211,42 +218,29 @@ def test_backward_dense(  # noqa C901
         else:
             f = torch.cat(fs, dim=0).view(-1, D)
 
+        is_low_prec = (
+            weights_precision == SparseType.FP16
+            or output_dtype == SparseType.FP16
+            or output_dtype == SparseType.BF16
+        )
+        tol = 5.0e-3 if is_low_prec else 1.0e-5
         torch.testing.assert_close(
             fc2.float(),
             f.float(),
-            atol=(
-                5.0e-3
-                if weights_precision == SparseType.FP16
-                or output_dtype == SparseType.FP16
-                else 1.0e-5
-            ),
-            rtol=(
-                5.0e-3
-                if weights_precision == SparseType.FP16
-                or output_dtype == SparseType.FP16
-                else 1.0e-5
-            ),
+            atol=tol,
+            rtol=tol,
         )
         if do_pooling:
             goc = torch.cat([go.view(B, -1) for go in gos], dim=1)
         else:
             goc = torch.cat(gos, dim=0)
         fc2.backward(goc)
+        tol = 5.0e-3 if is_low_prec else 1.0e-4
         torch.testing.assert_close(
             cc.weights.grad,
             grad_weights,
-            atol=(
-                5.0e-3
-                if weights_precision == SparseType.FP16
-                or output_dtype == SparseType.FP16
-                else 1.0e-4
-            ),
-            rtol=(
-                5.0e-3
-                if weights_precision == SparseType.FP16
-                or output_dtype == SparseType.FP16
-                else 1.0e-4
-            ),
+            atol=tol,
+            rtol=tol,
         )
 
         cc = DenseTableBatchedEmbeddingBagsCodegen(
diff --git a/fbgemm_gpu/test/tbe/training/backward_none_test.py b/fbgemm_gpu/test/tbe/training/backward_none_test.py
index ef079254b..1c824d654 100644
--- a/fbgemm_gpu/test/tbe/training/backward_none_test.py
+++ b/fbgemm_gpu/test/tbe/training/backward_none_test.py
@@ -82,7 +82,9 @@ class BackwardNoneTest(unittest.TestCase):
                 PoolingMode.NONE,
             ]
         ),
-        output_dtype=st.sampled_from([SparseType.FP16, SparseType.FP32]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP16, SparseType.FP32, SparseType.BF16]
+        ),
     )
     @settings(
         verbosity=VERBOSITY,
@@ -110,7 +112,9 @@ def test_backward_none(self, **kwargs: Any) -> None:
                 PoolingMode.NONE,
             ]
         ),
-        output_dtype=st.sampled_from([SparseType.FP16, SparseType.FP32]),
+        output_dtype=st.sampled_from(
+            [SparseType.FP16, SparseType.FP32, SparseType.BF16]
+        ),
     )
     @settings(
         verbosity=VERBOSITY,
@@ -150,6 +154,9 @@ def execute_backward_none_(  # noqa C901
         assume(not weighted or pooling_mode != PoolingMode.NONE)
 
         assume(pooling_mode == PoolingMode.SUM or not weighted)
+        # TODO: Check why long_segments=True fails when output_dtype ==
+        # SparseType.BF16
+        assume(not long_segments or output_dtype != SparseType.BF16)
 
         if pooling_mode == PoolingMode.SUM:
             mode = "sum"
@@ -269,6 +276,10 @@ def execute_backward_none_(  # noqa C901
                     for (b, x, xw) in zip(bs, xs, xws)
                 ]
             )
+            # Torch's Embedding only produces an output that has the same type
+            # as weight
+            if weights_precision != output_dtype:
+                fs = [f.to(output_dtype.as_dtype()) for f in fs]
             gos: Union[List[Tensor], Tensor] = [torch.randn_like(f) for f in fs]
             [f.backward(go) for (f, go) in zip(fs, gos)]
         else:
diff --git a/fbgemm_gpu/test/tbe/training/backward_sgd_test.py b/fbgemm_gpu/test/tbe/training/backward_sgd_test.py
index 05242b5d9..f62d7f379 100644
--- a/fbgemm_gpu/test/tbe/training/backward_sgd_test.py
+++ b/fbgemm_gpu/test/tbe/training/backward_sgd_test.py
@@ -234,6 +234,11 @@ def execute_backward_sgd_(  # noqa C901
                 for (b, x, xw) in zip(bs, xs, xws)
             ]
         )
+
+        # Cast output type to output_dtype
+        if weights_precision != output_dtype:
+            fs = [f.to(output_dtype.as_dtype()) for f in fs]
+
         # Generate gradients
         gos = [torch.randn_like(f) for f in fs]
         # Run baseline's backward
diff --git a/fbgemm_gpu/test/tbe/training/failures_dict_fast.json b/fbgemm_gpu/test/tbe/training/failures_dict_fast.json
index 6e5930718..6880503c6 100644
--- a/fbgemm_gpu/test/tbe/training/failures_dict_fast.json
+++ b/fbgemm_gpu/test/tbe/training/failures_dict_fast.json
@@ -85,6 +85,10 @@
         "comment": "",
         "status": "skip"
       },
+      "BackwardAdagradTest.test_faketensor__test_backward_adagrad_fp16_pmSUM_with_max_norm": {
+        "comment": "",
+        "status": "xfail"
+      },
       "BackwardAdagradTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": {
         "comment": "",
         "status": "skip"
@@ -198,6 +202,10 @@
         "comment": "",
         "status": "xfail"
       },
+      "BackwardAdagradTest.test_faketensor__test_backward_adagrad_fp16_pmSUM_with_max_norm": {
+        "comment": "",
+        "status": "xfail"
+      },
       "BackwardAdagradTest.test_faketensor__test_backward_adagrad_fp32_pmMEAN": {
         "comment": "",
         "status": "xfail"
diff --git a/fbgemm_gpu/test/tbe/training/forward_test.py b/fbgemm_gpu/test/tbe/training/forward_test.py
index 814ee2407..f48dfbf25 100644
--- a/fbgemm_gpu/test/tbe/training/forward_test.py
+++ b/fbgemm_gpu/test/tbe/training/forward_test.py
@@ -604,6 +604,7 @@ def test_forward_gpu_uvm_cache_int8(
             [
                 SparseType.FP32,
                 SparseType.FP16,
+                SparseType.BF16,
             ]
         )
         if pooling_mode == PoolingMode.NONE:
@@ -670,6 +671,7 @@ def test_forward_gpu_uvm_cache_fp16(
             [
                 SparseType.FP32,
                 SparseType.FP16,
+                SparseType.BF16,
             ]
         )
         if pooling_mode == PoolingMode.NONE:
@@ -736,6 +738,7 @@ def test_forward_gpu_uvm_cache_fp32(
             [
                 SparseType.FP32,
                 SparseType.FP16,
+                SparseType.BF16,
             ]
         )
         if pooling_mode == PoolingMode.NONE: