From 39914eff04dd1d79557ae7be135d6dd81f43b328 Mon Sep 17 00:00:00 2001
From: Yifan Xu
Date: Fri, 29 Sep 2023 15:18:22 -0700
Subject: [PATCH] Back out "jagged bmm CPU operator optimization" (#2053)

Summary:
Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/2053

Original commit changeset: 6688a5cd68fd

Original Phabricator Diff: D45936724

Reviewed By: renganxu

Differential Revision: D49783731

fbshipit-source-id: 54c1bd4ec355325d88ec1b22fe2335e8a07936e3
---
 .../jagged_tensor_ops_cpu.cpp | 46 ++-----------------
 1 file changed, 4 insertions(+), 42 deletions(-)

diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
index 820173b6c..7ae207adb 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
+++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
@@ -17,10 +17,6 @@
 #include "fbgemm_gpu/sparse_ops.h"
 #include "fbgemm_gpu/sparse_ops_utils.h"
 
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
 namespace fbgemm_gpu {
 
 ///@defgroup jagged-tensor-ops-cpu Jagged Tensor Operators
@@ -1243,11 +1239,7 @@ void jagged_softmax_kernel(
     const int64_t max_L) {
   const int B = offsets.size(0) - 1;
   const int D = values.size(1);
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-  for (auto b = 0; b < B; b++) {
+  for (const auto b : c10::irange(B)) {
     const int row_start = offsets[b];
     const int row_end = offsets[b + 1];
     const int length = std::min(row_end - row_start, (int)max_L);
@@ -1284,10 +1276,6 @@ Tensor jagged_softmax_forward(
   const int D = values.size(1);
   auto output = at::empty_like(values);
 
-#ifdef _OPENMP
-  omp_set_num_threads(10);
-#endif
-
   if (B > 0 && D > 0) {
     AT_DISPATCH_INDEX_TYPES(
         offsets.scalar_type(), "jagged_softmax_kernel_1", [&] {
@@ -1317,11 +1305,7 @@ void jagged_softmax_backward_kernel(
     const int64_t max_L) {
   const int B = offsets.size(0) - 1;
   const int D = grad_output.size(1);
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-  for (auto b = 0; b < B; b++) {
+  for (const auto b : c10::irange(B)) {
     const int row_start = offsets[b];
     const int row_end = offsets[b + 1];
     const int length = std::min(row_end - row_start, (int)max_L);
@@ -1354,10 +1338,6 @@ Tensor jagged_softmax_backward(
   const int D = grad_output.size(1);
   auto grad_input = at::empty_like(grad_output);
 
-#ifdef _OPENMP
-  omp_set_num_threads(10);
-#endif
-
   if (B > 0 && D > 0) {
     AT_DISPATCH_INDEX_TYPES(
         offsets.scalar_type(), "jagged_backward_kernel_1", [&] {
@@ -1389,11 +1369,7 @@ void jagged_jagged_bmm_kernel(
   const int B = offsets.size(0) - 1;
   const int M = x_values.size(1);
   const int N = y_values.size(1);
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-  for (auto b = 0; b < B; b++) {
+  for (const auto b : c10::irange(B)) {
     const int row_start = offsets[b];
     const int row_end = offsets[b + 1];
     const int length = std::min(row_end - row_start, (int)max_L);
@@ -1421,11 +1397,6 @@ Tensor jagged_jagged_bmm_forward(
   const int M = x_values.size(-1);
   const int N = y_values.size(-1);
   auto output = at::zeros({B, M, N}, x_values.options());
-
-#ifdef _OPENMP
-  omp_set_num_threads(10);
-#endif
-
   if (B > 0 && M > 0 && N > 0) {
     AT_DISPATCH_INDEX_TYPES(
         offsets.scalar_type(), "jagged_jagged_bmm_kernel_1", [&] {
@@ -1459,11 +1430,7 @@ void jagged_dense_bmm_kernel(
   const int B = x_offsets.size(0) - 1;
   const int K = x_values.size(1);
   const int N = y.size(2);
-
-#ifdef _OPENMP
-#pragma omp parallel for
-#endif
-  for (auto b = 0; b < B; b++) {
+  for (const auto b : c10::irange(B)) {
     const int row_start = x_offsets[b];
     const int row_end = x_offsets[b + 1];
     const int length = std::min(row_end - row_start, (int)max_L);
@@ -1492,11 +1459,6 @@ Tensor jagged_dense_bmm_forward(
   const int N = y.size(-1);
   const int total_L = x_values.size(0);
   auto output = at::zeros({total_L, N}, x_values.options());
-
-#ifdef _OPENMP
-  omp_set_num_threads(10);
-#endif
-
   if (B > 0 && M > 0 && N > 0) {
     AT_DISPATCH_INDEX_TYPES(
         x_offsets.scalar_type(), "jagged_dense_bmm_kernel_1", [&] {
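
Editorial note (not part of the patch): this backout replaces OpenMP-parallelized
batch loops of the form

    #ifdef _OPENMP
    #pragma omp parallel for
    #endif
    for (auto b = 0; b < B; b++) { ... }

with the serial for (const auto b : c10::irange(B)) idiom used elsewhere in
jagged_tensor_ops_cpu.cpp, and removes the omp_set_num_threads(10) calls from the
forward entry points. The patch itself does not state the motivation, but note
that omp_set_num_threads() affects every subsequent OpenMP parallel region
started by the calling thread, so a hard-coded value of 10 inside an operator
can silently override the caller's (or ATen's) threading configuration.

For illustration only, here is a minimal standalone sketch of the loop pattern
involved; the jagged layout (offsets delimiting rows of values) mirrors the
kernels above, while the row-sum computation and all names are hypothetical:

    // jagged_rowsum_sketch.cpp -- illustrative only, not FBGEMM code.
    #include <cstdio>
    #include <vector>

    int main() {
      // offsets[b] .. offsets[b + 1] delimit row b, as in the kernels above.
      std::vector<int> offsets = {0, 2, 5, 9};
      std::vector<float> values = {1, 2, 3, 4, 5, 6, 7, 8, 9};
      const int B = static_cast<int>(offsets.size()) - 1;
      std::vector<float> row_sum(B, 0.0f);

      // The reverted optimization wrapped this outer loop in
      //   #pragma omp parallel for
      // (safe here, since iteration b writes only row_sum[b]); the backout
      // runs it serially, matching c10::irange(B) in the patched kernels.
      for (int b = 0; b < B; b++) {
        for (int i = offsets[b]; i < offsets[b + 1]; i++) {
          row_sum[b] += values[i];
        }
      }

      for (int b = 0; b < B; b++) {
        std::printf("row %d sum = %.1f\n", b, row_sum[b]);
      }
      return 0;
    }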