diff --git a/fbgemm_gpu/docs/src/fbgemm_gpu-python-api/sparse_ops.rst b/fbgemm_gpu/docs/src/fbgemm_gpu-python-api/sparse_ops.rst
index e22812586..afc38a450 100644
--- a/fbgemm_gpu/docs/src/fbgemm_gpu-python-api/sparse_ops.rst
+++ b/fbgemm_gpu/docs/src/fbgemm_gpu-python-api/sparse_ops.rst
@@ -6,3 +6,9 @@ Sparse Operators
 .. autofunction:: torch.ops.fbgemm.permute_2D_sparse_data
 
 .. autofunction:: torch.ops.fbgemm.permute_1D_sparse_data
+
+.. autofunction:: torch.ops.fbgemm.expand_into_jagged_permute
+
+.. autofunction:: torch.ops.fbgemm.asynchronous_complete_cumsum
+
+.. autofunction:: torch.ops.fbgemm.offsets_range
diff --git a/fbgemm_gpu/fbgemm_gpu/docs/sparse_ops.py b/fbgemm_gpu/fbgemm_gpu/docs/sparse_ops.py
index c95588207..ae307dc8f 100644
--- a/fbgemm_gpu/fbgemm_gpu/docs/sparse_ops.py
+++ b/fbgemm_gpu/fbgemm_gpu/docs/sparse_ops.py
@@ -119,3 +119,93 @@
 None)
 """,
 )
+
+add_docs(
+    torch.ops.fbgemm.expand_into_jagged_permute,
+    """
+expand_into_jagged_permute(permute, input_offset, output_offset, output_size) -> Tensor
+
+Expand the sparse data permute index from the feature dimension to the batch
+dimension, for cases where the sparse features have different batch sizes
+across ranks.
+
+The op expands the permute from the feature level to the batch level by
+contiguously mapping each bag of a feature to the position its batch occupies
+after the feature permute.  The op automatically derives the feature and batch
+offset arrays to compute the output permute.
+
+Args:
+    permute (Tensor): The feature-level permute index.
+
+    input_offset (Tensor): The exclusive offsets of the feature-level lengths.
+
+    output_offset (Tensor): The exclusive offsets of the feature-level
+        permuted lengths.
+
+    output_size (int): The number of elements in the output tensor.
+
+Returns:
+    The batch-level permute index, which follows the formula::
+
+        output_permute[feature_offset[permute[feature]] + batch] <- bag_offset[batch]
+    """,
+)
+
+add_docs(
+    torch.ops.fbgemm.asynchronous_complete_cumsum,
+    """
+asynchronous_complete_cumsum(t_in) -> Tensor
+
+Compute the complete cumulative sum.  The GPU operator is nonblocking and
+asynchronous; the CPU operator is blocking.
+
+Args:
+    t_in (Tensor): An input tensor.
+
+Returns:
+    The complete cumulative sum of `t_in`, of size `t_in.numel() + 1`.
+
+**Example:**
+
+    >>> t_in = torch.tensor([7, 8, 2, 1, 0, 9, 4], dtype=torch.int64, device="cuda")
+    >>> torch.ops.fbgemm.asynchronous_complete_cumsum(t_in)
+    tensor([ 0,  7, 15, 17, 18, 18, 27, 31], device='cuda:0')
+    """,
+)
+
+add_docs(
+    torch.ops.fbgemm.offsets_range,
+    """
+offsets_range(offsets, range_size) -> Tensor
+
+Generate the integer sequence `0, 1, ..., (offsets[i+1] - offsets[i]) - 1` for
+every `i`, where `0 <= i < offsets.numel()`, and concatenate the sequences
+(`range_size` closes the last segment).
+
+Args:
+    offsets (Tensor): The offsets, i.e., the exclusive cumulative sum of the
+        segment lengths (without the total sum).
+
+    range_size (int): The output size, i.e., the total sum of the segment
+        lengths.
+
+Returns:
+    A tensor containing the concatenated ranges.
+
+**Example:**
+
+    >>> # Generate example inputs
+    >>> lengths = torch.tensor([3, 4, 1, 9, 3, 7], dtype=torch.int64, device="cuda")
+    >>> offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+    >>> range_size = offsets[-1].item()
+    >>> print(range_size)
+    27
+    >>> offsets = offsets[:-1]
+    >>> print(offsets)
+    tensor([ 0,  3,  7,  8, 17, 20], device='cuda:0')
+    >>> # Invoke
+    >>> torch.ops.fbgemm.offsets_range(offsets, range_size)
+    tensor([0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 0, 1, 2, 3,
+            4, 5, 6], device='cuda:0')
+    """,
+)
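
Since `expand_into_jagged_permute` is the one documented op above without a usage example, here is a minimal sketch (not part of the patch) of how it composes with `asynchronous_complete_cumsum`. It assumes, as in FBGEMM's own usage, that both offset arguments are complete cumulative sums of per-feature bag counts; the expected output is hand-derived from the formula in the docstring.

import torch
import fbgemm_gpu  # noqa: F401  # registers the torch.ops.fbgemm operators

# 3 features with 2, 3, and 1 bags; the feature-level permute puts
# feature 2 first, then feature 0, then feature 1.
permute = torch.tensor([2, 0, 1], dtype=torch.int64)
lengths = torch.tensor([2, 3, 1], dtype=torch.int64)  # bags per feature
permuted_lengths = lengths[permute]                   # tensor([1, 2, 3])

input_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
output_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(permuted_lengths)

output_permute = torch.ops.fbgemm.expand_into_jagged_permute(
    permute, input_offset, output_offset, int(lengths.sum())
)
# Feature 2 owns bag 5, feature 0 owns bags 0-1, and feature 1 owns
# bags 2-4, so the batch-level permute should be [5, 0, 1, 2, 3, 4]
# (hand-derived, not an output captured from a real run).
print(output_permute)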