From b32d59ec6e856c9971c8280463c89cedc1a719f6 Mon Sep 17 00:00:00 2001 From: Jun Luo Date: Fri, 21 Jun 2024 14:11:52 -0700 Subject: [PATCH] Fix torch_dispatch issue in group_index_select_dim0_gpu_backward (#2767) Summary: Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2767 group_index_select_dim0_gpu_backward returns a tensor list containing undefined tensors. This is not compatible with ```__torch_dispatch__```, which calls into Python from C++: the undefined tensors in the vector become None in Python, and once control returns to the C++ call stack, ```[None, None, Tensor]``` cannot be converted back to ```std::vector```. To solve this problem, group_index_select_dim0_gpu_backward must return a valid tensor list with no undefined tensors (this diff uses tensors of size {0}). The caller can then reset those entries of the variable_list to undefined tensors after the call returns from the PyTorch dispatcher — a common pattern seen in other ops. Reviewed By: sryap Differential Revision: D58761943 fbshipit-source-id: 6dcd7a75023d667db9e80685ca82b6a572620148 --- fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp index 501853b84..f913bc891 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp @@ -493,9 +493,12 @@ class GroupIndexSelectDim0GPUOp // 1) Add group_size Variable()'s for indices // c10::irange cannot be used in here as it - // triggers a build error of i being an unused variable + // triggers a build error of i being an unused variable. + // Add empty tensor with zero size here to make __torch_dispatch__ work for + // the backward op. Those empty tensors will be replaced with + // torch::autograd::Variable() outside of the op call. 
for (auto i = 0; i < group_size; i++) { - outputs.push_back(torch::autograd::Variable()); + outputs.push_back(at::empty({0}, at::TensorOptions().dtype(at::kLong))); } // Allocate Tensor for ptrs of grad output and input, and indices @@ -615,6 +618,11 @@ class GroupIndexSelectDim0GPUOp "fbgemm::group_index_select_dim0_gpu_backward", "") .typed(); auto res = backward_op.call(grad_output_group, output_shape_group); + // 1) Add group_size Variable()'s for indices + // Replace all empty tensors with Variable(). This must be done after the + // op.call to make __torch_dispatch__ work for the backward op. + std::fill( + res.begin(), res.begin() + group_size, torch::autograd::Variable()); // 3) Add 1 Variable() for group_size res.push_back({}); return res;