From 61cd7594d5beb378437adbccfb9cd7950cf64e71 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 14 Nov 2024 00:09:35 -0800 Subject: [PATCH 1/4] Add support for `int32_t` indices in TBE training (2B/N) (#3371) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/462 - Fix `hash_size_cumsum` to be `int64_t` in `linearize_index_index_select_kernel` and `linearize_index_kernel` - Enable more specializations for radix sort Differential Revision: D65923591 --- fbgemm_gpu/FbgemmGpu.cmake | 3 +- .../fbgemm_gpu/split_embeddings_utils.cuh | 8 +++-- .../radix_sort_pairs.cu | 8 +++-- .../transpose_embedding_input.cu | 30 +++++++++---------- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/fbgemm_gpu/FbgemmGpu.cmake b/fbgemm_gpu/FbgemmGpu.cmake index 8ff5270f01..68f2b1fa3a 100644 --- a/fbgemm_gpu/FbgemmGpu.cmake +++ b/fbgemm_gpu/FbgemmGpu.cmake @@ -725,8 +725,7 @@ endif() # Silence warnings in asmjit target_compile_options(fbgemm_gpu_py PRIVATE - -Wno-deprecated-anon-enum-enum-conversion) -target_compile_options(fbgemm_gpu_py PRIVATE + -Wno-deprecated-anon-enum-enum-conversion -Wno-deprecated-declarations) diff --git a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh index 8351e046c2..9e6130f408 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh @@ -59,9 +59,13 @@ transpose_embedding_input( int end_bit = sizeof(KeyT) * 8, \ cudaStream_t stream = 0) +DECL_RADIX_SORT_PAIRS_FN(int64_t, int32_t); +DECL_RADIX_SORT_PAIRS_FN(int64_t, int64_t); DECL_RADIX_SORT_PAIRS_FN(int64_t, float); DECL_RADIX_SORT_PAIRS_FN(int64_t, double); -DECL_RADIX_SORT_PAIRS_FN(int64_t, int64_t); -DECL_RADIX_SORT_PAIRS_FN(int64_t, int32_t); +DECL_RADIX_SORT_PAIRS_FN(int32_t, int32_t); +DECL_RADIX_SORT_PAIRS_FN(int32_t, int64_t); +DECL_RADIX_SORT_PAIRS_FN(int32_t, float); +DECL_RADIX_SORT_PAIRS_FN(int32_t, double); #undef DECL_RADIX_SORT_PAIRS_FN diff --git a/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu b/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu index 50d9757d25..93dab81a4f 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu @@ -77,7 +77,11 @@ using namespace fbgemm_gpu; } #endif +DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t); +DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t); DEF_RADIX_SORT_PAIRS_FN(int64_t, float); DEF_RADIX_SORT_PAIRS_FN(int64_t, double); -DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t); -DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t); +DEF_RADIX_SORT_PAIRS_FN(int32_t, int32_t); +DEF_RADIX_SORT_PAIRS_FN(int32_t, int64_t); +DEF_RADIX_SORT_PAIRS_FN(int32_t, float); +DEF_RADIX_SORT_PAIRS_FN(int32_t, double); diff --git a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu index 83a06d78a0..07bafe636a 100644 --- a/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu +++ b/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu @@ -63,7 +63,7 @@ inline at::Tensor asynchronous_complete_cumsum(at::Tensor t_in) { template __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel( - const pta::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 hash_size_cumsum, const pta::PackedTensorAccessor32 indices, @@ -79,7 +79,7 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel( // Use a raw pointer to avoid creating dummy PackedTensorAccessor const uint32_t* const __restrict__ vbe_b_t_map, FixedDivisor fd) { - const int32_t T = hash_size_cumsum.size(0) - 1; + const auto T = hash_size_cumsum.size(0) - 1; auto b_t = blockIdx.x * blockDim.x + threadIdx.x; int32_t b; int32_t t; @@ -97,17 +97,16 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel( } const index_t hash_offset = valid ? hash_size_cumsum[t] : -1; - const index_t indices_start = valid ? offsets[b_t] : -1; - const int32_t L = valid ? offsets[b_t + 1] - indices_start : 0; + const auto indices_start = valid ? offsets[b_t] : -1; + const auto L = valid ? offsets[b_t + 1] - indices_start : 0; const int32_t lane_id = threadIdx.x % fbgemm_gpu::kWarpSize; // Compile-time conditional if (nobag) { for (int32_t j = 0; j < fbgemm_gpu::kWarpSize; ++j) { - const index_t indices_start_warp = - fbgemm_gpu::shfl_sync(indices_start, j); - const int32_t t_warp = fbgemm_gpu::shfl_sync(t, j); - const int32_t L_warp = fbgemm_gpu::shfl_sync(L, j); + const auto indices_start_warp = fbgemm_gpu::shfl_sync(indices_start, j); + const auto t_warp = fbgemm_gpu::shfl_sync(t, j); + const auto L_warp = fbgemm_gpu::shfl_sync(L, j); const index_t hash_offset_warp = fbgemm_gpu::shfl_sync(hash_offset, j); for (int32_t i = lane_id; i < L_warp; i += fbgemm_gpu::kWarpSize) { const index_t idx = __ldg(&indices[indices_start_warp + i]); @@ -124,10 +123,9 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel( reinterpret_cast(&b)[0]; } for (int32_t j = 0; j < fbgemm_gpu::kWarpSize; ++j) { - const index_t indices_start_warp = - fbgemm_gpu::shfl_sync(indices_start, j); - const uint32_t info_warp = fbgemm_gpu::shfl_sync(info, j); - const int32_t L_warp = fbgemm_gpu::shfl_sync(L, j); + const auto indices_start_warp = fbgemm_gpu::shfl_sync(indices_start, j); + const auto info_warp = fbgemm_gpu::shfl_sync(info, j); + const auto L_warp = fbgemm_gpu::shfl_sync(L, j); const index_t hash_offset_warp = fbgemm_gpu::shfl_sync(hash_offset, j); for (int32_t i = lane_id; i < L_warp; i += fbgemm_gpu::kWarpSize) { const index_t idx = __ldg(&indices[indices_start_warp + i]); @@ -142,7 +140,7 @@ __global__ __launch_bounds__(kMaxThreads) void linearize_index_kernel( template __global__ __launch_bounds__(kMaxThreads) void linearize_index_index_select_kernel( - const pta::PackedTensorAccessor32 + const pta::PackedTensorAccessor32 hash_size_cumsum, const pta::PackedTensorAccessor32 indices, @@ -153,7 +151,7 @@ __launch_bounds__(kMaxThreads) void linearize_index_index_select_kernel( linear_indices, FixedDivisor fd, int32_t fixed_L_per_warp) { - const int32_t T = hash_size_cumsum.size(0) - 1; + const auto T = hash_size_cumsum.size(0) - 1; auto b_t = blockIdx.x * blockDim.x + threadIdx.x; int32_t b; int32_t t; @@ -258,7 +256,7 @@ transpose_embedding_input( kMaxThreads, \ 0, \ at::cuda::getCurrentCUDAStream()>>>( \ - MAKE_PTA_WITH_NAME(func_name, hash_size_cumsum, index_t, 1, 32), \ + MAKE_PTA_WITH_NAME(func_name, hash_size_cumsum, int64_t, 1, 32), \ MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), \ MAKE_PTA_WITH_NAME(func_name, offsets, index_t, 1, 32), \ MAKE_PTA_WITH_NAME(func_name, infos, INFO_ACC_T, 1, 32), \ @@ -296,7 +294,7 @@ transpose_embedding_input( 0, at::cuda::getCurrentCUDAStream()>>>( MAKE_PTA_WITH_NAME( - func_name, hash_size_cumsum, index_t, 1, 32), + func_name, hash_size_cumsum, int64_t, 1, 32), MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32), MAKE_PTA_WITH_NAME( func_name, total_L_offsets.value(), index_t, 1, 32), From 041d8e9d52577925cf2188e3af2af3b0a059c40f Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 14 Nov 2024 00:09:35 -0800 Subject: [PATCH 2/4] Add support for `int32_t` indices in TBE training (2C/N) (#3372) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/465 - Add `index_t` support to TBE training backward kernels Differential Revision: D65925354 --- .../backward/embedding_backward_split_template.cu | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu index fdd9c0f798..7c4d85fc33 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu @@ -742,6 +742,7 @@ Tensor {{ embedding_cuda_op }}( else { {{ locs_or_addrs_tensor }}_sorted = at::empty_like({{ locs_or_addrs_tensor }}); size_t temp_storage_bytes = 0; + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "{{ embedding_cuda_op }}_1", [&] { AT_CUDA_CHECK(radix_sort_pairs( nullptr, temp_storage_bytes, @@ -753,9 +754,11 @@ Tensor {{ embedding_cuda_op }}( 0, total_hash_size_bits, at::cuda::getCurrentCUDAStream())); + auto temp_storage = at::empty( {static_cast(temp_storage_bytes)}, indices.options().dtype(at::kByte)); + AT_CUDA_CHECK(radix_sort_pairs( temp_storage.data_ptr(), temp_storage_bytes, @@ -767,6 +770,7 @@ Tensor {{ embedding_cuda_op }}( 0, total_hash_size_bits, at::cuda::getCurrentCUDAStream())); + }); } } @@ -775,6 +779,7 @@ Tensor {{ embedding_cuda_op }}( } {%- endif %} + AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "{{ embedding_cuda_op }}_2", [&] { DISPATCH_EMB_GRAD_CACHE_TYPES( dev_weights.scalar_type(), aligned_grad_output.scalar_type(), @@ -800,9 +805,11 @@ Tensor {{ embedding_cuda_op }}( 0, total_hash_size_bits, at::cuda::getCurrentCUDAStream())); + auto temp_storage = at::empty( {static_cast(temp_storage_bytes)}, indices.options().dtype(at::kByte)); + AT_CUDA_CHECK(radix_sort_pairs( temp_storage.data_ptr(), temp_storage_bytes, @@ -1181,6 +1188,7 @@ Tensor {{ embedding_cuda_op }}( }); // DISPATCH_OPTIMAL_KERNEL }); // DISPATCH_EMB_GRAD_CACHE_TYPES + }); // AT_DISPATCH_INDEX_TYPES {%- if dense %} return grad_dev_weights; From e831b65e020a8f3672258633f68594c0b16ee474 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 14 Nov 2024 00:09:35 -0800 Subject: [PATCH 3/4] Add support for `int32_t` indices in TBE training (2D/N) (#3374) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/464 - Add `index_t` support to TBE training backward kernels Differential Revision: D65930273 --- .../backward/embedding_backward_split_grad_template.cu | 10 ++++++---- .../backward/embedding_backward_split_template.cu | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_grad_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_grad_template.cu index f20b1b97bd..032ef7e862 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_grad_template.cu +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_grad_template.cu @@ -140,14 +140,14 @@ void split_embedding_backward_count_unique_indices_kernel {% for vbe in [True, False] %} {% set vdesc = "_vbe" if vbe else "" %} -template +template __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel( pta::PackedTensorAccessor64 grad_output_mean, const pta::PackedTensorAccessor64 grad_output, const pta::PackedTensorAccessor32 D_offsets, - const pta::PackedTensorAccessor32 offsets, + const pta::PackedTensorAccessor32 offsets, {% if vbe %} const pta::PackedTensorAccessor32 row_grad_offsets, const pta::PackedTensorAccessor32 b_t_map, @@ -212,15 +212,16 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel( //////////////////////////////////////////////////////////////////////////////// {% for grad_type in ['at::Half', 'float', 'at::BFloat16'] %} +{% for offset_type in ['int32_t', 'int64_t'] %} template __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel -<{{ grad_type }}> ( +<{{ grad_type }}, {{ offset_type }}> ( pta::PackedTensorAccessor64<{{ grad_type }}, 2, at::RestrictPtrTraits> grad_output_mean, const pta::PackedTensorAccessor64<{{ grad_type }}, 2, at::RestrictPtrTraits> grad_output, const pta::PackedTensorAccessor32 D_offsets, - const pta::PackedTensorAccessor32 offsets, + const pta::PackedTensorAccessor32<{{ offset_type }}, 1, at::RestrictPtrTraits> offsets, {% if vbe %} const pta::PackedTensorAccessor32 row_grad_offsets, const pta::PackedTensorAccessor32 b_t_map, @@ -230,6 +231,7 @@ void grad_mean{{ vdesc }}_kernel FixedDivisor fd_B {% endif %} ); +{% endfor %} // for offset_type in ['int32_t', 'int64_t'] {% endfor %} // for grad_type in ['at::Half', 'float'] {% endfor %} // for vbe in [True, False] diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu index 7c4d85fc33..f168bf79f5 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu @@ -232,13 +232,13 @@ split_embedding_backward_codegen_find_long_segments( const bool use_deterministic_algorithms); -template +template __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vdesc }}_kernel( pta::PackedTensorAccessor64 grad_output_mean, const pta::PackedTensorAccessor64 grad_output, const pta::PackedTensorAccessor32 D_offsets, - const pta::PackedTensorAccessor32 offsets, + const pta::PackedTensorAccessor32 offsets, {%- if vbe %} const pta::PackedTensorAccessor32 grad_offsets, const pta::PackedTensorAccessor32 b_t_map, @@ -860,7 +860,7 @@ Tensor {{ embedding_cuda_op }}( MAKE_PTA_WITH_NAME(func_name1, grad_output_mean, grad_t, 2, 64), MAKE_PTA_WITH_NAME(func_name1, grad_output_reshaped, grad_t, 2, 64), MAKE_PTA_WITH_NAME(func_name1, D_offsets, int32_t, 1, 32), - MAKE_PTA_WITH_NAME(func_name1, offsets, int64_t, 1, 32), + MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32), {%- if vbe %} MAKE_PTA_WITH_NAME(func_name1, vbe_row_output_offsets, int64_t, 1, 32), MAKE_PTA_WITH_NAME(func_name1, vbe_b_t_map, int32_t, 1, 32), From c7bf4dc22f38b692d8b839ba8acb26e372cbfce6 Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Thu, 14 Nov 2024 00:09:35 -0800 Subject: [PATCH 4/4] Add support for `int32_t` indices in TBE training (2E/N) Summary: - Add `index_t` support to TBE training backward kernels Differential Revision: D65933410 --- .../embedding_backward_split_kernel_cta_template.cu | 10 ++++++++-- .../backward/embedding_backward_split_template.cu | 6 ++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_cta_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_cta_template.cu index 3fb49ed5e7..1cfeb66c94 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_cta_template.cu +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_cta_template.cu @@ -77,6 +77,7 @@ template < typename emb_t, typename grad_t, typename cache_t, + typename index_t, {%- for ph_name in args.placeholder_tensor_names %} typename {{ ph_name + "_ph_t" }}, {%- endfor %} @@ -105,7 +106,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row( int64_t D, {%- endif %} const pta::PackedTensorAccessor32 hash_size_cumsum, - const pta::PackedTensorAccessor32 sorted_linear_indices_run, + const pta::PackedTensorAccessor32 sorted_linear_indices_run, const pta::PackedTensorAccessor32 sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32 long_run_ids, const pta::PackedTensorAccessor32 num_long_run_ids, @@ -430,6 +431,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row( emb_type, grad_type, cache_type, + index_type, ph_type_combo, kFixedMaxVecsPerThread, kThreadGroupSize, @@ -446,6 +448,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row < {{ emb_type }}, {{ grad_type }}, {{ cache_type }}, + {{ index_type }}, {%- for ph_name in args.placeholder_tensor_names %} {{ ph_type_combo[ph_name].primitive_type }}, {%- endfor %} @@ -470,7 +473,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row int64_t D, {%- endif %} const pta::PackedTensorAccessor32 hash_size_cumsum, - const pta::PackedTensorAccessor32 sorted_linear_indices_run, + const pta::PackedTensorAccessor32<{{ index_type }}, 1, at::RestrictPtrTraits> sorted_linear_indices_run, const pta::PackedTensorAccessor32 sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32 long_run_ids, const pta::PackedTensorAccessor32 num_long_run_ids, @@ -538,11 +541,13 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row {%- for grad_type in ['float', 'at::Half', 'at::BFloat16'] %} {%- for emb_type in ['float', 'at::Half'] %} {%- for cache_type in ['float', 'at::Half'] %} + {%- for index_type in ['int32_t', 'int64_t'] %} {%- for ph_type_combo in args.placeholder_type_combos %} {{ template_instantiation( emb_type, grad_type, cache_type, + index_type, ph_type_combo, kFixedMaxVecsPerThread, kThreadGroupSize, @@ -552,6 +557,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row {%- endfor %} {%- endfor %} {%- endfor %} + {%- endfor %} {%- endmacro %} diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu index f168bf79f5..c731308e4f 100644 --- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu +++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu @@ -45,6 +45,7 @@ template < typename emb_t, typename grad_t, typename cache_t, + typename index_t, {%- for ph_name in args.placeholder_tensor_names %} typename {{ ph_name + "_ph_t" }}, {%- endfor %} @@ -73,7 +74,7 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row( int64_t D, {%- endif %} const pta::PackedTensorAccessor32 hash_size_cumsum, - const pta::PackedTensorAccessor32 sorted_linear_indices_run, + const pta::PackedTensorAccessor32 sorted_linear_indices_run, const pta::PackedTensorAccessor32 sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32 long_run_ids, const pta::PackedTensorAccessor32 num_long_run_ids, @@ -962,6 +963,7 @@ Tensor {{ embedding_cuda_op }}(