From a89a850179931aa9b7d7202ee7d48eb6117d10f7 Mon Sep 17 00:00:00 2001
From: Artem Tamazov
Date: Thu, 8 Aug 2024 07:26:22 +0300
Subject: [PATCH] [Fix][TransformTensor] Ignore output buffer when BETA=0 (#3184)

---
 .../MIOpenSubTensorOpWithTransformKernel.cl | 49 +++++++++++++++----
 src/ocl/tensorocl.cpp                       | 17 ++++---
 2 files changed, 49 insertions(+), 17 deletions(-)

diff --git a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl
index bd06f89626..39e2149ae8 100644
--- a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl
+++ b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl
@@ -100,13 +100,42 @@
 #define WORK_STRIDE_1 (WORK_LENGTH_2 * WORK_STRIDE_2)
 #define WORK_STRIDE_0 (WORK_LENGTH_1 * WORK_STRIDE_1)
 
-#ifndef SUBTENSOR_OP_WITH_SCALAR
-#define SUBTENSOR_OP_WITH_SCALAR BREAK_COMPILE_INTENTIONALLY
+#ifndef MIOPEN_BETA_IS_ZERO
+#error "MIOPEN_BETA_IS_ZERO must be defined"
+#endif
+#ifndef MIOPEN_ALPHA_IS_ONE
+#error "MIOPEN_ALPHA_IS_ONE must be defined"
 #endif
 
-#define SUBTENSOR_OP_WITH_SCALAR_SET(t, a) (t = a)
-#define SUBTENSOR_OP_WITH_SCALAR_MULTIPLY(t, a) (t *= a)
-#define SUBTENSOR_OP_WITH_SCALAR_MAD(tb, b, ta, a) (tb = mad(ta, a, tb * b))
+#if MIOPEN_BETA_IS_ZERO && MIOPEN_ALPHA_IS_ONE
+#define SUBTENSOR_OP_WITH_ALPHA_BETA(dst, src) \
+    do                                         \
+    {                                          \
+        (dst) = (src);                         \
+        (void)beta;                            \
+        (void)alpha;                           \
+    } while(0)
+#elif MIOPEN_BETA_IS_ZERO
+#define SUBTENSOR_OP_WITH_ALPHA_BETA(dst, src) \
+    do                                         \
+    {                                          \
+        (dst) = (src)*alpha;                   \
+        (void)beta;                            \
+    } while(0)
+#elif MIOPEN_ALPHA_IS_ONE
+#define SUBTENSOR_OP_WITH_ALPHA_BETA(dst, src) \
+    do                                         \
+    {                                          \
+        (dst) = mad((dst), beta, (src));       \
+        (void)alpha;                           \
+    } while(0)
+#else
+#define SUBTENSOR_OP_WITH_ALPHA_BETA(dst, src) \
+    do                                         \
+    {                                          \
+        (dst) = mad((src), alpha, (dst)*beta); \
+    } while(0)
+#endif
 
 __kernel void SubTensorOpWithTransform1d(global _FLOAT* __restrict src,
                                          const _FLOAT alpha,
@@ -127,7 +156,7 @@ __kernel void SubTensorOpWithTransform1d(global _FLOAT* __restrict src,
         uint si = src_stride0 * did0 + src_offset;
         uint di = dst_stride0 * did0 + dst_offset;
 
-        SUBTENSOR_OP_WITH_SCALAR(dst[di], beta, src[si], alpha);
+        SUBTENSOR_OP_WITH_ALPHA_BETA(dst[di], src[si]);
     }
 }
 
@@ -159,7 +188,7 @@ __kernel void SubTensorOpWithTransform2d(global _FLOAT* __restrict src,
             uint si = src_stride0 * did0 + src_stride1 * did1 + src_offset;
             uint di = dst_stride0 * did0 + dst_stride1 * did1 + dst_offset;
 
-            SUBTENSOR_OP_WITH_SCALAR(dst[di], beta, src[si], alpha);
+            SUBTENSOR_OP_WITH_ALPHA_BETA(dst[di], src[si]);
         }
     }
 }
@@ -201,7 +230,7 @@ __kernel void SubTensorOpWithTransform3d(global _FLOAT* __restrict src,
                 uint si = src_stride0 * did0 + src_stride1 * did1 + src_stride2 * did2 + src_offset;
                 uint di = dst_stride0 * did0 + dst_stride1 * did1 + dst_stride2 * did2 + dst_offset;
 
-                SUBTENSOR_OP_WITH_SCALAR(dst[di], beta, src[si], alpha);
+                SUBTENSOR_OP_WITH_ALPHA_BETA(dst[di], src[si]);
             }
         }
     }
@@ -255,7 +284,7 @@ __kernel void SubTensorOpWithTransform4d(global _FLOAT* __restrict src,
                     uint di = dst_stride0 * did0 + dst_stride1 * did1 + dst_stride2 * did2 +
                               dst_stride3 * did3 + dst_offset;
 
-                    SUBTENSOR_OP_WITH_SCALAR(dst[di], beta, src[si], alpha);
+                    SUBTENSOR_OP_WITH_ALPHA_BETA(dst[di], src[si]);
                 }
             }
         }
@@ -319,7 +348,7 @@ __kernel void SubTensorOpWithTransform5d(global _FLOAT* __restrict src,
                         uint di = dst_stride0 * did0 + dst_stride1 * did1 + dst_stride2 * did2 +
                                   dst_stride3 * did3 + dst_stride4 * did4 + dst_offset;
 
-                        SUBTENSOR_OP_WITH_SCALAR(dst[di], beta, src[si], alpha);
+                        SUBTENSOR_OP_WITH_ALPHA_BETA(dst[di], src[si]);
                     }
                 }
             }
diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp
index 74717f50ea..98002de68a 100644
--- a/src/ocl/tensorocl.cpp
+++ b/src/ocl/tensorocl.cpp
@@ -2187,6 +2187,9 @@ void TransformTensor(const Handle& handle,
         MIOPEN_THROW("Tensor x and y batch sizes do not match");
     }
 
+    const auto is_alpha_one = float_equal(*(static_cast<const float*>(alpha)), 1);
+    const auto is_beta_zero = float_equal(*(static_cast<const float*>(beta)), 0);
+
     if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3)
     {
         if(x_len[1] <= y_len[1])
@@ -2221,8 +2224,7 @@ void TransformTensor(const Handle& handle,
                 size_t x_offset = i * x_batch_sz;
                 size_t y_offset = i * y_batch_sz;
 
-                if(float_equal(*(static_cast<const float*>(alpha)), 1) &&
-                   float_equal(*(static_cast<const float*>(beta)), 0))
+                if(is_alpha_one && is_beta_zero)
                 {
                     CopyTensor(handle,
                                ((x_len[1] <= y_len[1]) ? x_batch_desc : y_batch_desc),
@@ -2234,7 +2236,8 @@ void TransformTensor(const Handle& handle,
                 }
                 else
                 {
-                    // TODO: support y=alpha*x+beta*y
+                    MIOPEN_THROW(miopenStatusNotImplemented,
+                                 "y=alpha*x+beta*y is not supported for int8 yet");
                 }
             }
         }
@@ -2254,18 +2257,16 @@ void TransformTensor(const Handle& handle,
         const TensorDescriptor& xDesc_flat = std::get<0>(flat_descriptors);
         const TensorDescriptor& yDesc_flat = std::get<1>(flat_descriptors);
 
-#ifndef NDEBUG
         if(xDesc.GetNumDims() != xDesc_flat.GetNumDims())
         {
             MIOPEN_LOG_I2("x real descriptor: " << xDesc);
             MIOPEN_LOG_I2("x flat descriptor: " << xDesc_flat);
         }
         if(yDesc.GetNumDims() != yDesc_flat.GetNumDims())
         {
             MIOPEN_LOG_I2("y real descriptor: " << yDesc);
             MIOPEN_LOG_I2("y flat descriptor: " << yDesc_flat);
         }
-#endif
 
         const std::size_t yDim_flat = yDesc_flat.GetNumDims();
 
@@ -2329,8 +2330,10 @@ void TransformTensor(const Handle& handle,
 
             std::size_t wld = 256 < wgd ? 256 : wgd;
 
-            std::string parms = "-DSUBTENSOR_OP_WITH_SCALAR=SUBTENSOR_OP_WITH_SCALAR_MAD" +
-                                GetDataTypeKernelParams(dataTypey);
+            std::string parms =
+                GetDataTypeKernelParams(dataTypey) //
+                + " -DMIOPEN_BETA_IS_ZERO=" + std::to_string(static_cast<int>(is_beta_zero)) //
+                + " -DMIOPEN_ALPHA_IS_ONE=" + std::to_string(static_cast<int>(is_alpha_one));
 
             for(int i = 0; i < yDim_flat; ++i)
             {
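
Note (illustration, not part of the patch): the reason dst must not be read when BETA=0 is that the old unconditional mad(src, alpha, dst * beta) always loads dst, and under IEEE-754 NaN * 0 is still NaN, so garbage in an uninitialized output buffer can leak into the result even though beta is zero. The minimal host-side C++ sketch below reproduces the hazard; old_op/new_op are illustrative names, not functions from MIOpen.

    #include <cmath>
    #include <cstdio>

    // Old behaviour: dst is always read, so NaN garbage in the output
    // buffer survives beta == 0, because NaN * 0.0f is still NaN.
    float old_op(float dst, float src, float alpha, float beta)
    {
        return std::fma(src, alpha, dst * beta); // dst = src*alpha + dst*beta
    }

    // New MIOPEN_BETA_IS_ZERO specialisation: dst is never read.
    float new_op(float src, float alpha) { return src * alpha; }

    int main()
    {
        const float garbage = std::nanf(""); // stands in for uninitialized dst
        std::printf("old: %f\n", old_op(garbage, 2.0f, 1.0f, 0.0f)); // prints nan
        std::printf("new: %f\n", new_op(2.0f, 1.0f));                // prints 2.000000
        return 0;
    }

Selecting the specialisation at kernel-compile time (via -DMIOPEN_BETA_IS_ZERO / -DMIOPEN_ALPHA_IS_ONE) also drops the per-element read-modify-write when it is not needed, rather than branching on alpha/beta inside the kernel.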