diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 90e29ffaa9..9594e00ef0 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,3 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) + * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index cd663eb8b4..385580e2e1 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -32,6 +32,7 @@ add_executable(MIOpenDriver dm_activ.cpp dm_adam.cpp dm_addlayernorm.cpp + dm_avgpool.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp new file mode 100644 index 0000000000..ff7d04edd5 --- /dev/null +++ b/driver/avgpool_driver.hpp @@ -0,0 +1,575 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP +#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloAvgPoolHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class AvgPoolDriver : public Driver +{ +public: + AvgPoolDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + std::vector GetInputTensorDimsFromCmd(const char* param); + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AvgPoolDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + + std::unique_ptr input_dev; 
+ std::unique_ptr output_dev; + std::unique_ptr input_grad_dev; + std::unique_ptr output_grad_dev; + + std::vector input; + std::vector output; + std::vector output_host; + std::vector input_grad; + std::vector input_grad_host; + std::vector output_grad; + std::vector ksize; + std::vector stride; + std::vector padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + std::vector in_dim; +}; + +template +int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +{ + std::string lengthsStr = inflags.GetValueStr(param); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +int AvgPoolDriver::GetandSetData() +{ + in_dim = GetInputTensorDimsFromCmd("input_dims"); + int ksp_dim = in_dim.size() - 2; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); + + if(ksize.size() != ksp_dim) + { + int ref = ksp_dim - ksize.size(); + while((ref--) != 0) + ksize.push_back(ksize[0]); + } + if(stride.size() != ksp_dim) + { + int ref = ksp_dim - stride.size(); + while((ref--) != 0) + stride.push_back(stride[0]); + } + if(padding.size() != ksp_dim) + { + int ref = ksp_dim - padding.size(); + while((ref--) != 0) + padding.push_back(padding[0]); + } + + 
ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); + count_include_pad = static_cast(inflags.GetValueInt("count_include_pad")); + divisor_override = inflags.GetValueInt("divisor_override"); + + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = std::vector{N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = std::vector{N, C, OH, OW}; + } + SetTensorNd(inputDesc, in_dim, data_type); + SetTensorNd(outputDesc, out_dim, data_type); + SetTensorNd(outputGradDesc, out_dim, data_type); + SetTensorNd(inputGradDesc, in_dim, data_type); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "2,3,7,9", + "The dimensional lengths of the input tensor: N,C,D1,D2,... 
Example: 2,3,7,9.", + "string"); + inflags.AddInputFlag( + "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); + inflags.AddInputFlag( + "stride", + 's', + "1,1", + "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", + "string"); + inflags.AddInputFlag("padding", + 'p', + "0,0", + "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", + "string"); + inflags.AddInputFlag("ceil_mode", + 'c', + "1", + "When 1, will use ceil instead of floor to compute the output shape.", + "int"); + inflags.AddInputFlag("count_include_pad", + 'P', + "0", + "When 1, will include the zero-padding in the averaging calculation.", + "int"); + inflags.AddInputFlag("divisor_override", + 'd', + "0", + "If specified, it will be used as divisor, otherwise size of the pooling " + "region will be used.", + "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0)); + output = std::vector(output_sz, static_cast(0)); + output_host = std::vector(output_sz, static_cast(0)); + + input_grad = std::vector(input_sz, static_cast(0)); + input_grad_host = std::vector(input_sz, 
static_cast(0)); + output_grad = std::vector(output_sz, static_cast(0)); + + int status; + + for(int i = 0; i < input_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + } + status = input_dev->ToGPU(q, input.data()); + + status |= output_dev->ToGPU(q, output.data()); + + status |= input_grad_dev->ToGPU(q, input_grad.data()); + + for(int i = 0; i < output_sz; i++) + { + output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + status |= output_grad_dev->ToGPU(q, output_grad.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + output_dev->FromGPU(GetStream(), output.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolForward2dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolForward3dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? 
padding[2] : padding[1], + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + input_grad_dev->FromGPU(GetStream(), input_grad.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +Tref AvgPoolDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AvgPoolDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(output_host, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(input_grad_host, input_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/dm_avgpool.cpp b/driver/dm_avgpool.cpp new file mode 100644 index 0000000000..ec0e457056 --- /dev/null +++ b/driver/dm_avgpool.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "avgpool_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "avgpool") + return new AvgPoolDriver(); + if(base_arg == "avgpoolfp16") + return new AvgPoolDriver(); + if(base_arg == "avgpoolbfp16") + return new AvgPoolDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index b23df690d1..bd42f6ee13 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,7 +206,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "--version") + arg != 
"reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && + arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp new file mode 100644 index 0000000000..6980ce968e --- /dev/null +++ b/driver/mloAvgPoolHost.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MLO_AVGPOOLHOST_H_ +#define MLO_AVGPOOLHOST_H_ + +#include +#include + +template +int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + 
if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t 
hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 
0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } + return 0; +} + +template +int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = 
ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } + return 0; +} + +#endif // MLO_AVGPOOLHOST_H_ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 3b9bbeccc1..ea44de92d5 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,6 +70,7 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation + * @defgroup avgpool * 
*/ @@ -7621,6 +7622,76 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// avgpool APIs +/** @addtogroup avgpool + * + * @{ + */ + +/*! @brief Execute an avgpool forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override); + +/*! 
@brief Execute an avgpool backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override); +/** @} */ +// CLOSEOUT avgpool DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77acf3f7d3..ee36c92967 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,6 +89,8 @@ set( MIOpen_Source adam_api.cpp addlayernorm_api.cpp api/find2_0_commons.cpp + avgpool_api.cpp + avgpool/problem_description.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -191,6 +193,10 @@ set( MIOpen_Source solver/activ/fwd_1.cpp solver/adam/adam.cpp solver/adam/transformers_adam_w.cpp + solver/avgpool/backward_avgpool_2d.cpp + solver/avgpool/backward_avgpool_3d.cpp + solver/avgpool/forward_avgpool_2d.cpp + solver/avgpool/forward_avgpool_3d.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -482,6 +488,7 @@ if( 
MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} kernels/MIOpenAdam.cpp + kernels/MIOpenAvgPool.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl @@ -626,6 +633,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp adam.cpp addlayernorm.cpp + avgpool.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/avgpool.cpp b/src/avgpool.cpp new file mode 100644 index 0000000000..323f01c90e --- /dev/null +++ b/src/avgpool.cpp @@ -0,0 +1,136 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = + avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + + tmp.input = input; + tmp.output = output; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::BwdProblemDescription{ + outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + 
tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp new file mode 100644 index 0000000000..96ecb4bb72 --- /dev/null +++ b/src/avgpool/problem_description.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +namespace miopen { + +namespace avgpool { + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); + auto input_stride = inputDesc.GetStrides(); + auto output_stride = outputDesc.GetStrides(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_fwd"; + ss << "-input_dtype" << input_dtype; + ss << "-Is" << input_size; + ss << "-Os" << output_size; + ss << "-Si" << input_stride; + ss << "-So" << output_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + auto input_grad_size = inputGradDesc.GetLengths(); + auto output_grad_size = outputGradDesc.GetLengths(); + auto input_grad_stride = inputGradDesc.GetStrides(); + auto output_grad_stride = outputGradDesc.GetStrides(); + + auto input_dtype = inputGradDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_bwd"; + ss << "-input_dtype" << input_dtype; + ss << "-dIs" << input_grad_size; + ss << "-dOs" << output_grad_size; + ss << "-dSi" << input_grad_stride; + ss << "-dSo" << output_grad_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; + + return NetworkConfig{ss.str()}; +} + +} // namespace avgpool + +} // namespace miopen diff 
--git a/src/avgpool_api.cpp b/src/avgpool_api.cpp new file mode 100644 index 0000000000..32e1f12f92 --- /dev/null +++ b/src/avgpool_api.cpp @@ -0,0 +1,190 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + const bool count_include_pad, + const int32_t divisor_override, + const bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "avgpoolfp16"; + } + else if(dtype == miopenFloat) + { + ss << "avgpoolfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "avgpoolbfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); + ss << " -Is " << miopen::deref(xDesc).GetLengths(); + ss << " -Os " << miopen::deref(oDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -Cp " << count_include_pad; + ss << " -Do " << divisor_override; + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); + return miopen::try_([&] { + miopen::AvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + }); +} + +extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + outputGradDesc, + output_grad, + inputGradDesc, + input_grad, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, divisor_override, false); + return miopen::try_([&] { + miopen::AvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad), + KD, + KH, + KW, + 
SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + }); +} diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp new file mode 100644 index 0000000000..00a2717ff6 --- /dev/null +++ b/src/include/miopen/avgpool.hpp @@ -0,0 +1,71 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_AVGPOOL_HPP_ +#define MIOPEN_AVGPOOL_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, + bool count_include_pad, + int32_t divisor_override); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, + bool count_include_pad, + int32_t divisor_override); +} // namespace miopen +#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp new file mode 100644 index 0000000000..e8bd9256ac --- /dev/null +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -0,0 +1,93 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include "miopen/common.hpp" +#include +#include + +namespace miopen { +namespace avgpool { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t ksize = nullptr; + + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + + BwdInvokeParams() = default; + + const TensorDescriptor* outputGradDesc = nullptr; + const TensorDescriptor* inputGradDesc = nullptr; + + ConstData_t output_grad = nullptr; + Data_t input_grad = nullptr; + ConstData_t ksize = nullptr; + + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace avgpool +} // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp new file mode 100644 index 0000000000..2dee6a30ea --- /dev/null +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -0,0 +1,133 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace avgpool { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) + : count_include_pad(count_include_pad_), divisor_override(divisor_override_) + { + if(divisor_override < 0) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); + } + } + +protected: + bool count_include_pad; + int32_t divisor_override; +}; + +struct FwdProblemDescription : ProblemDescription +{ + FwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) + : ProblemDescription(count_include_pad_, divisor_override_), + inputDesc(inputDesc_), + outputDesc(outputDesc_) + { + IsValidLength(); + } + + auto GetInputDesc() const { return inputDesc; } + auto GetOutputDesc() const { return outputDesc; } + auto GetNtotal() const { return outputDesc.GetElementSize(); } + + bool IsValidLength() const + { + auto input_dims = inputDesc.GetLengths().size(); + if(outputDesc.GetLengths()[0] != inputDesc.GetLengths()[0] || + outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || + outputDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input and output tensor sizes do not match."); + } + + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +protected: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; +}; + +struct BwdProblemDescription : ProblemDescription +{ + BwdProblemDescription(const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& inputGradDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) + : ProblemDescription(count_include_pad_, divisor_override_), + 
outputGradDesc(outputGradDesc_), + inputGradDesc(inputGradDesc_) + { + IsValidLength(); + } + + auto GetOutputGradDesc() const { return outputGradDesc; } + auto GetInputGradDesc() const { return inputGradDesc; } + auto GetNtotal() const { return inputGradDesc.GetElementSize(); } + + bool IsValidLength() const + { + auto input_dims = inputGradDesc.GetLengths().size(); + if(outputGradDesc.GetLengths()[0] != inputGradDesc.GetLengths()[0] || + outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || + outputGradDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad and output grad tensor sizes do not match."); + } + + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +protected: + TensorDescriptor outputGradDesc; + TensorDescriptor inputGradDesc; +}; + +} // namespace avgpool + +} // namespace miopen diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp new file mode 100644 index 0000000000..5577b9fad6 --- /dev/null +++ b/src/include/miopen/avgpool/solvers.hpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include +#include +#include "miopen/kernel_build_params.hpp" +#include "miopen/kernel_info.hpp" + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace avgpool { + +using AvgPoolForward = + NonTunableSolverBase; + +using AvgPoolBackward = + NonTunableSolverBase; + +// FORWARD +struct AvgPoolForward2d final : AvgPoolForward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; +}; + +struct AvgPoolForward3d final : AvgPoolForward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool 
IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; +}; + +// BACKWARD +struct AvgPoolBackward2d final : AvgPoolBackward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; +}; + +struct AvgPoolBackward3d final : AvgPoolBackward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; +}; + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 81c15f6bea..194afd79ac 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -59,7 +59,8 @@ enum class Primitive Mha, Softmax, Adam, - Item + Item, + AvgPool }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..050d431844 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -29,6 +29,7 @@ #include #include "../../kernels/tensor_view.hpp" +#include "miopen/tensor.hpp" namespace miopen { diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp new file mode 100644 index 0000000000..76355d5729 --- /dev/null +++ b/src/kernels/MIOpenAvgPool.cpp @@ -0,0 +1,574 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef INPUT_TYPE +#define INPUT_TYPE float +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE float +#endif + +template +__device__ void avgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + FLOAT_ACCUM m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, 
oh, ow))] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + avgPoolForward2d(input, + output, + N, + C, + H, + W, + OH, + OW, + R, + S, + sh, + sw, + ph, + pw, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + FLOAT_ACCUM sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = 
min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + avgPoolForward3d(input, + output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t r = 
0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + avgPoolBackward2d(output_grad, + input_grad, + N, + C, + H, + W, + OH, + OW, + R, + S, + sh, + sw, + ph, + pw, + count_include_pad, + divisor_override, + output_grad_tv, + input_grad_tv); +} + +template +__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + 
int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* 
__restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + avgPoolBackward3d(output_grad, + input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, + count_include_pad, + divisor_override, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..d64dbf21f9 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,6 +72,46 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + { + static_assert(N == 5); + layout[0] = n; + layout[1] = c; + layout[2] = d; + layout[3] = h; + layout[4] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) + { + static_assert(N == 4); + layout[0] = n; + layout[1] = c; + layout[2] = h; + layout[3] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) + { + static_assert(N == 3); + layout[0] = n; + layout[1] = h; + layout[2] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t w) + { + static_assert(N == 2); + layout[0] = n; + layout[1] = w; + } + + constexpr tensor_layout_t(uint64_t n) + { + static_assert(N == 1); + layout[0] = n; + } + uint64_t layout[N]; }; diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp new file mode 100644 index 0000000000..73adabb8e7 --- /dev/null +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + return false; + } + else if(dtype == miopenHalf) + { + if(in_over_out < 2 && in_nelems >= 11075584) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + { + return true; + } + } + return false; +} + +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + if(problem.GetInputGradDesc().GetNumDims() != 4 || + problem.GetOutputGradDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution +AvgPoolBackward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ 
+ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto H = deref(params.inputGradDesc).GetLengths()[2]; + auto W = deref(params.inputGradDesc).GetLengths()[3]; + auto OH = deref(params.outputGradDesc).GetLengths()[2]; + auto OW = deref(params.outputGradDesc).GetLengths()[3]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + H, + W, + OH, + OW, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp new file mode 100644 index 0000000000..4815803ad3 --- /dev/null +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -0,0 +1,171 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_3D 256 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto N = problem.GetOutputGradDesc().GetLengths()[0]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 8 && N >= 6)) + { + return true; + } + return false; + } + else if(dtype == miopenHalf) + { + if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || + (in_over_out <= 8 && in_nelems >= 944111616)) + { + return true; + } + } + return false; +} + +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + if(problem.GetInputGradDesc().GetNumDims() != 5 || + problem.GetOutputGradDesc().GetNumDims() != 5) + { + return false; + } + if(!IsOverRocmBwd3d(problem)) + { + return false; + } + return true; +} + +ConvSolution +AvgPoolBackward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = 
miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto D = deref(params.inputGradDesc).GetLengths()[2]; + auto H = deref(params.inputGradDesc).GetLengths()[3]; + auto W = deref(params.inputGradDesc).GetLengths()[4]; + auto OD = deref(params.outputGradDesc).GetLengths()[2]; + auto OH = deref(params.outputGradDesc).GetLengths()[3]; + auto OW = deref(params.outputGradDesc).GetLengths()[4]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; 
+} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp new file mode 100644 index 0000000000..1c51feb54b --- /dev/null +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -0,0 +1,160 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) + { + return true; + } + } + return false; +} + +bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution +AvgPoolForward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = 
problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + + size_t N = deref(params.inputDesc).GetLengths()[0]; + size_t C = deref(params.inputDesc).GetLengths()[1]; + size_t H = deref(params.inputDesc).GetLengths()[2]; + size_t W = deref(params.inputDesc).GetLengths()[3]; + size_t OH = deref(params.outputDesc).GetLengths()[2]; + size_t OW = deref(params.outputDesc).GetLengths()[3]; + + kernel(params.input, + params.output, + N, + C, + H, + W, + OH, + OW, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp new file mode 100644 index 0000000000..6f70a07419 --- /dev/null +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -0,0 +1,171 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright 
(c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/tensor_view_utils.hpp"
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_3D 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace avgpool {
+
+bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem)
+{
+    auto dtype       = problem.GetOutputDesc().GetType();
+    auto in_nelems   = problem.GetInputDesc().GetElementSize();
+    auto out_nelems  = problem.GetOutputDesc().GetElementSize();
+    auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1];
+    auto N           = problem.GetOutputDesc().GetLengths()[0];
+    auto in_over_out = static_cast<float>(in_nelems) / out_nelems;
+
+    // NOTE(review): thresholds below appear empirically tuned per dtype — presumably
+    // selecting this solver over the ROCm pooling path; confirm against benchmark data.
+    if(dtype == miopenFloat)
+    {
+        if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4))
+        {
+            return true;
+        }
+    }
+    else if(dtype == miopenHalf)
+    {
+        if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192))
+        {
+            return true;
+        }
+    }
+    else if(dtype == miopenBFloat16)
+    {
+        if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool AvgPoolForward3d::IsApplicable(const ExecutionContext&,
+                                    const miopen::avgpool::FwdProblemDescription& problem) const
+{
+    if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5)
+    {
+        return false;
+    }
+    if(!IsOverRocmFwd3d(problem))
+    {
+        return false;
+    }
+    return true;
+}
+
+ConvSolution
+AvgPoolForward3d::GetSolution(const ExecutionContext& context,
+                              const miopen::avgpool::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result =
ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto D = deref(params.inputDesc).GetLengths()[2]; + auto H = deref(params.inputDesc).GetLengths()[3]; + auto W = deref(params.inputDesc).GetLengths()[4]; + auto OD = deref(params.outputDesc).GetLengths()[2]; + auto OH = deref(params.outputDesc).GetLengths()[3]; + auto OW = deref(params.outputDesc).GetLengths()[4]; + + kernel(params.input, + params.output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace 
avgpool + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp new file mode 100644 index 0000000000..5b91033633 --- /dev/null +++ b/test/cpu_avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_AVGPOOL_HPP +#define GUARD_CPU_AVGPOOL_HPP + +#include "tensor_holder.hpp" +#include + +template +void cpu_avgpool_forward_2d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + float val = m / 
divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_forward_3d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = 
max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_backward_2d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + 
+ int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } +} + +template +void cpu_avgpool_backward_3d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + 
int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } +} + +#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp new file mode 100644 index 0000000000..3ab32be510 --- /dev/null +++ b/test/gtest/avgpool.cpp @@ -0,0 +1,175 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "avgpool.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace avgpool { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd +{ +}; + +} // namespace avgpool +using namespace avgpool; + +// FORWARD TEST +TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + 
+INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); + +// BACKWARD TEST +TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp new file mode 100644 index 0000000000..94898d32b6 --- /dev/null +++ b/test/gtest/avgpool.hpp @@ -0,0 +1,451 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "../driver/tensor_driver.hpp" +#include "cpu_avgpool.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct AvgPoolTestCase +{ + std::vector input_dims; + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + + friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) + { + return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size + << " stride:" << tc.stride << " padding:" << tc.padding + << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad + << " divisor_override:" << tc.divisor_override; + } + + std::vector GetInput() const { return input_dims; } +}; + +inline std::vector AvgPoolTestConfigsFwdFp32() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdFp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdBfp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp32() +{ + return { + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp16() +{ + 
return { + {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdBfp16() +{ + return { + {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +// FORWARD TEST +template +struct AvgPoolTestFwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_dim = avgpool_config.GetInput(); + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + ksize = tensor{in_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + auto gen_input_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + input = tensor{in_dim}.generate(gen_input_value); + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = {N, C, OH, OW}; + } + + output = tensor{out_dim}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dim}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + auto dims = input.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_forward_2d(input, + ref_output, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_forward_3d(input, + ref_output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + 
padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? padding[2] : padding[1], + count_include_pad, + divisor_override); + fflush(stdout); + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(ref_output, output); + + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor input; + tensor output; + tensor ref_output; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; + +// BACKWARD TEST +template +struct AvgPoolTestBwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_grad_dim = avgpool_config.GetInput(); + N = in_grad_dim[0]; + C = in_grad_dim[1]; + D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; + H = in_grad_dim.size() == 5 ? in_grad_dim[3] : in_grad_dim[2]; + W = in_grad_dim.size() == 5 ? 
in_grad_dim[4] : in_grad_dim[3]; + ksize = tensor{in_grad_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_grad_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_grad_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + std::vector out_grad_dim; + if(in_grad_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_grad_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_grad_dim = {N, C, OH, OW}; + } + auto gen_output_grad_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); + + input_grad = tensor{in_grad_dim}; + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dim}; + std::fill( + ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); + + output_grad_dev = handle.Write(output_grad.data); + input_grad_dev = handle.Write(input_grad.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + auto dims = input_grad.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_backward_2d(output_grad, + ref_input_grad, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_backward_3d(output_grad, + ref_input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolBackward(handle, + output_grad.desc, + output_grad_dev.get(), + input_grad.desc, + input_grad_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? 
padding[2] : padding[1], + count_include_pad, + divisor_override); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_input_grad, input_grad); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor output_grad; + tensor input_grad; + tensor ref_input_grad; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr output_grad_dev; + miopen::Allocator::ManageDataPtr input_grad_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +};