From 6feaec91f4969c202c4ba9e7ace1a50d10b16e03 Mon Sep 17 00:00:00 2001 From: Long Luong <45379462+long10024070@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:06:16 +0700 Subject: [PATCH] Implement PReLU backward (#3152) --- docs/reference/index.rst | 1 + driver/CMakeLists.txt | 1 + driver/dm_prelu.cpp | 40 ++ driver/driver.hpp | 6 +- driver/mloPReLUHost.hpp | 104 +++++ driver/prelu_driver.hpp | 394 ++++++++++++++++++ include/miopen/miopen.h | 55 +++ src/CMakeLists.txt | 10 + src/include/miopen/prelu.hpp | 53 +++ src/include/miopen/prelu/invoke_params.hpp | 59 +++ .../miopen/prelu/problem_description.hpp | 115 +++++ src/include/miopen/prelu/solvers.hpp | 77 ++++ src/include/miopen/prelu/utils.hpp | 45 ++ src/include/miopen/solver_id.hpp | 3 +- src/include/miopen/tensor_view_utils.hpp | 1 + src/kernels/MIOpenPReLU.cpp | 115 +++++ src/kernels/MIOpenReduceSum.cpp | 104 +++++ src/kernels/block_reduce.hpp | 72 ++++ src/kernels/tensor_view.hpp | 11 + src/kernels/warp_reduce.hpp | 58 +++ src/prelu.cpp | 95 +++++ src/prelu/problem_description.cpp | 76 ++++ src/prelu_api.cpp | 133 ++++++ src/solver.cpp | 4 + .../prelu/backward_prelu_multi_weights.cpp | 183 ++++++++ .../prelu/backward_prelu_single_weight.cpp | 202 +++++++++ src/solver/prelu/utils.cpp | 53 +++ test/cpu_prelu.hpp | 100 +++++ test/gtest/prelu.cpp | 118 ++++++ test/gtest/prelu.hpp | 247 +++++++++++ 30 files changed, 2532 insertions(+), 3 deletions(-) create mode 100644 driver/dm_prelu.cpp create mode 100644 driver/mloPReLUHost.hpp create mode 100644 driver/prelu_driver.hpp create mode 100644 src/include/miopen/prelu.hpp create mode 100644 src/include/miopen/prelu/invoke_params.hpp create mode 100644 src/include/miopen/prelu/problem_description.hpp create mode 100644 src/include/miopen/prelu/solvers.hpp create mode 100644 src/include/miopen/prelu/utils.hpp create mode 100644 src/kernels/MIOpenPReLU.cpp create mode 100644 src/kernels/MIOpenReduceSum.cpp create mode 100644 src/kernels/block_reduce.hpp create 
mode 100644 src/kernels/warp_reduce.hpp create mode 100644 src/prelu.cpp create mode 100644 src/prelu/problem_description.cpp create mode 100644 src/prelu_api.cpp create mode 100644 src/solver/prelu/backward_prelu_multi_weights.cpp create mode 100644 src/solver/prelu/backward_prelu_single_weight.cpp create mode 100644 src/solver/prelu/utils.cpp create mode 100644 test/cpu_prelu.hpp create mode 100644 test/gtest/prelu.cpp create mode 100644 test/gtest/prelu.hpp diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 2574dbbf5e..34a045bca8 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -36,3 +36,4 @@ The MIOpen API library is structured as follows: * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental) + * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 19abd61597..15a3941046 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -48,6 +48,7 @@ add_executable(MIOpenDriver dm_layernorm.cpp dm_lrn.cpp dm_pool.cpp + dm_prelu.cpp dm_reduce.cpp dm_reduceextreme.cpp dm_reducecalculation.cpp diff --git a/driver/dm_prelu.cpp b/driver/dm_prelu.cpp new file mode 100644 index 0000000000..06ec99e8a5 --- /dev/null +++ b/driver/dm_prelu.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "prelu_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "prelu") + return new PReLUDriver(); + if(base_arg == "prelufp16") + return new PReLUDriver(); + if(base_arg == "prelubfp16") + return new PReLUDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index 196d48c2b6..d0c708ff1d 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " + "prelu[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -207,7 +208,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && - arg != "ropefp16" && arg != "ropebfp16" && arg != "--version") + arg != "ropefp16" && arg != "ropebfp16" && arg != "prelu" && arg != "prelufp16" && + arg != "prelubfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloPReLUHost.hpp b/driver/mloPReLUHost.hpp new file mode 100644 index 0000000000..c3460b6425 --- /dev/null +++ b/driver/mloPReLUHost.hpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro 
Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <../test/ford.hpp> + +#include +#include +#include + +template +int32_t mloPReLUBackwardRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + const miopenTensorDescriptor_t doutputDesc, + const miopenTensorDescriptor_t dinputDesc, + const Tgpu* input, + const Tgpu* weight, + const Tgpu* doutput, + Tcheck* dinput_host, + Tcheck* dweight_host) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); + auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); + + auto input_sz = miopen::deref(inputDesc).GetElementSize(); + auto weight_sz = miopen::deref(weightDesc).GetElementSize(); + auto weight_grad_collector = std::vector(input_sz); + + par_ford(input_sz)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = static_cast(doutput[doutput_tv.get_tensor_view_idx(tensor_layout)]); + + if(dinput_host) + { + float weight_v; + if(weight_sz == 1) + weight_v = static_cast(weight[0]); + else + weight_v = static_cast(weight[tensor_layout.layout[1]]); + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + dinput_host[dinput_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(dweight_host) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(dweight_host) + { + if(weight_sz == 1) + { + double sum = 0; + for(int i = 0; i < input_sz; ++i) + sum += static_cast(weight_grad_collector[i]); + dweight_host[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight_sz)([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + dweight_host[i] = static_cast(sum); + }); + } + } + + return miopenStatusSuccess; +} diff --git a/driver/prelu_driver.hpp b/driver/prelu_driver.hpp new file mode 100644 index 0000000000..e304f27465 --- /dev/null +++ b/driver/prelu_driver.hpp @@ -0,0 +1,394 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "driver.hpp" +#include "mloPReLUHost.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/ford.hpp> +#include <../test/verify.hpp> + +#include + +template +class PReLUDriver : public Driver +{ +public: + PReLUDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&weightDesc); + miopenCreateTensorDescriptor(&doutputDesc); + miopenCreateTensorDescriptor(&dinputDesc); + miopenCreateTensorDescriptor(&dweightDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~PReLUDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(weightDesc); + miopenDestroyTensorDescriptor(doutputDesc); + miopenDestroyTensorDescriptor(dinputDesc); + miopenDestroyTensorDescriptor(dweightDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t weightDesc; + miopenTensorDescriptor_t doutputDesc; + miopenTensorDescriptor_t dinputDesc; + miopenTensorDescriptor_t dweightDesc; + + std::unique_ptr input_dev; + std::unique_ptr weight_dev; + std::unique_ptr doutput_dev; + 
std::unique_ptr dinput_dev; + std::unique_ptr dweight_dev; + std::unique_ptr workspace_dev; + + std::vector input; + std::vector weight; + std::vector doutput; + std::vector dinput; + std::vector dweight; + + std::vector dinput_host; + std::vector dweight_host; + + size_t ws_sizeInBytes; +}; + +template +int PReLUDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + if(input_length.empty()) + { + std::cout << "Tensor must not be empty"; + return miopenStatusBadParm; + } + + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + if(weight_length[0] != 1 && (input_length.size() == 1 || weight_length[0] != input_length[1])) + { + std::cout << "NumParameters must be 1 or the second dim of DimLengths"; + return miopenStatusBadParm; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::GetandSetData() +{ + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + + if(SetTensorNd(inputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + if(SetTensorNd(dinputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input gradient tensor"); + + if(SetTensorNd(weightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight tensor"); + if(SetTensorNd(dweightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight gradient tensor"); + + if(SetTensorNd(doutputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output gradient tensor"); + + return miopenStatusSuccess; +} + +template +int 
PReLUDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward PReLU (Default=1)", "int"); + inflags.AddTensorFlag("input", 'D', "256x4x1x1x8723", "input tensor descriptor"); + inflags.AddInputFlag( + "NumParameters", + 'P', + "1", + "Number of weight to learn. Although it takes an int as input, there is only two values " + "are legitimate: 1, or the number of channels (the second dim) at input (Default=1)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int PReLUDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t weight_sz = GetTensorSize(weightDesc); + + miopenGetPReLUBackwardWorkspaceSize(GetHandle(), inputDesc, weightDesc, &ws_sizeInBytes); + + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + doutput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dweight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + input = std::vector(input_sz); + weight = std::vector(weight_sz); + doutput = std::vector(input_sz, static_cast(1.0f)); + dinput = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + dweight = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + dinput_host = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + 
dweight_host = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + for(int i = 0; i < weight_sz; i++) + weight[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + if(input_dev->ToGPU(GetStream(), input.data()) != 0) + { + std::cerr << "Error copying (input) to GPU, size: " << input_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(weight_dev->ToGPU(GetStream(), weight.data()) != 0) + { + std::cerr << "Error copying (weight) to GPU, size: " << weight_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(doutput_dev->ToGPU(GetStream(), doutput.data()) != 0) + { + std::cerr << "Error copying (out grad) to GPU, size: " << doutput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dinput_dev->ToGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (input grad) to GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dweight_dev->ToGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (weight grad) to GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunForwardGPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunForwardCPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenPReLUBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + input_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + dinputDesc, + dinput_dev->GetMem(), + dweightDesc, + dweight_dev->GetMem()); + + float time = 0.0; + 
miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward PReLU Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward PReLU Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + if(dweight_dev->FromGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (dweight_dev) from GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunBackwardCPU() +{ + return mloPReLUBackwardRunHost(inputDesc, + weightDesc, + doutputDesc, + dinputDesc, + input.data(), + weight.data(), + doutput.data(), + dinput_host.data(), + dweight_host.data()); +} + +template +Tref PReLUDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int PReLUDriver::VerifyForward() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + auto error_dweight = miopen::rms_range(dweight_host, dweight); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward PReLU Input Gradient FAILED: " << error_dinput << " > " << tolerance + << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Input Gradient Verifies OK on CPU reference (" << error_dinput + << " < " << tolerance << ')' << std::endl; + } + + if(!std::isfinite(error_dweight) || error_dweight > tolerance) + { + std::cout << "Backward PReLU Weight Gradient FAILED: " << error_dweight << " > " + << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Weight Gradient Verifies OK on CPU reference (" + << error_dweight << " < " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 6c66f04867..ccf4b702e0 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -71,6 +71,7 @@ * @defgroup getitem * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings + * @defgroup ReLU * */ @@ -7677,6 +7678,60 @@ MIOPEN_EXPORT miopenStatus_t miopenRoPEBackward(miopenHandle_t handle, // CLOSEOUT ROPE DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +/** @addtogroup ReLU + * + * @{ + */ + +/*! 
@brief Helper function to query the minimum workspace size required by the PReLU backward call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes); + +/*! @brief Execute a PReLU backward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param weight Data tensor weight (input) + * @param doutputDesc Tensor descriptor for output gradient (input) + * @param doutput Gradient of output (input) + * @param dinputDesc Tensor descriptor for input gradient (input) + * @param dinput Gradient of input (output) + * @param dweightDesc Tensor descriptor for weight gradient (input) + * @param dweight Gradient of weight (output) + */ +MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t weightDesc, + const void* weight, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dweightDesc, + void* dweight); + +/** @} */ +// CLOSEOUT RELU DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d2f84f43b8..1fe108679c 100644 --- 
a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -163,6 +163,8 @@ set( MIOpen_Source performance_config.cpp pooling/problem_description.cpp pooling_api.cpp + prelu/problem_description.cpp + prelu_api.cpp problem.cpp process.cpp ramdb.cpp @@ -302,6 +304,9 @@ set( MIOpen_Source solver/pooling/forwardNd.cpp solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp + solver/prelu/backward_prelu_multi_weights.cpp + solver/prelu/backward_prelu_single_weight.cpp + solver/prelu/utils.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_argmin.cpp solver/reduce/forward_max.cpp @@ -448,6 +453,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenReduceCalculation.hpp kernels/MIOpenReduceExtreme.hpp kernels/bfloat16_dev.hpp + kernels/block_reduce.hpp kernels/conv_common.inc kernels/conv_sizes.inc kernels/float_types.h @@ -471,6 +477,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_metadata.inc kernels/workaround_issue_1431.hpp + kernels/warp_reduce.hpp kernels/xform_bidirect_winograd_code.inc kernels/xform_data_filter.inc kernels/xform_kd_cov2.inc @@ -508,6 +515,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl kernels/MIOpenNeuron.cl + kernels/MIOpenPReLU.cpp kernels/MIOpenPooling.cl kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl @@ -519,6 +527,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenReduceCalculation.cpp kernels/MIOpenReduceExtreme.cpp kernels/MIOpenRoPE.cpp + kernels/MIOpenReduceSum.cpp kernels/MIOpenSoftmax.cl kernels/MIOpenSoftmaxAttn.cpp kernels/MIOpenUtilKernels3.cl @@ -661,6 +670,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN t5layernorm.cpp 
ocl/fusionopconvocl.cpp ocl/fusionopbiasbnactivocl.cpp + prelu.cpp reducecalculation.cpp reduceextreme.cpp rope.cpp diff --git a/src/include/miopen/prelu.hpp b/src/include/miopen/prelu.hpp new file mode 100644 index 0000000000..35e5e7f611 --- /dev/null +++ b/src/include/miopen/prelu.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight); + +} // namespace miopen diff --git a/src/include/miopen/prelu/invoke_params.hpp b/src/include/miopen/prelu/invoke_params.hpp new file mode 100644 index 0000000000..1acc2dc143 --- /dev/null +++ b/src/include/miopen/prelu/invoke_params.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace prelu { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* weightDesc = nullptr; + const TensorDescriptor* doutputDesc = nullptr; + const TensorDescriptor* dinputDesc = nullptr; + const TensorDescriptor* dweightDesc = nullptr; + + ConstData_t input = nullptr; + ConstData_t weight = nullptr; + ConstData_t doutput = nullptr; + Data_t dinput = nullptr; + Data_t dweight = nullptr; + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/problem_description.hpp b/src/include/miopen/prelu/problem_description.hpp new file mode 100644 index 0000000000..ce10d96046 --- /dev/null +++ b/src/include/miopen/prelu/problem_description.hpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y); +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +struct BackwardProblemDescription : ProblemDescriptionBase +{ + + BackwardProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& doutputDesc_, + const TensorDescriptor& dinputDesc_, + const TensorDescriptor& dweightDesc_) + : inputDesc(inputDesc_), + weightDesc(weightDesc_), + doutputDesc(doutputDesc_), + dinputDesc(dinputDesc_), + dweightDesc(dweightDesc_) + { + IsSameType(); + IsRightLength(); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetWeightDesc() const { return weightDesc; } + const TensorDescriptor& GetdOuputDesc() const { return doutputDesc; } + const TensorDescriptor& GetdInputDesc() const { return dinputDesc; } + const TensorDescriptor& GetdWeightDesc() const { return dweightDesc; } + + bool IsSameType() const + { + if(!checkSameType(inputDesc, weightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Input and Weight tensor must have same type."); + if(!checkSameType(inputDesc, dinputDesc) || !checkSameType(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Gradient tensors (excluding Output gradient) must share a same " + "type with Input and Weight tensor."); + return true; + } + + bool IsRightLength() const + { + if(!checkSameLength(inputDesc, doutputDesc) || !checkSameLength(inputDesc, dinputDesc)) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Input and Output Gradient tensors sizes must match with Input tensor."); + if(weightDesc.GetNumDims() != 1) + MIOPEN_THROW(miopenStatusBadParm, "PReLU: Weight tensor must have 1 dimension."); + 
if(weightDesc.GetElementSize() != 1 && + (inputDesc.GetNumDims() == 1 || + weightDesc.GetElementSize() != inputDesc.GetLengths()[1])) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Weight size must be 1 or equal to the second dim of Input tensor."); + if(!checkSameLength(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Weight Gradient tensors sizes must match with Weight tensor."); + return true; + } + + bool IsSingleWeight() const + { + if(weightDesc.GetElementSize() > 1) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputDesc; + TensorDescriptor weightDesc; + TensorDescriptor doutputDesc; + TensorDescriptor dinputDesc; + TensorDescriptor dweightDesc; +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/solvers.hpp b/src/include/miopen/prelu/solvers.hpp new file mode 100644 index 0000000000..a2bec477f0 --- /dev/null +++ b/src/include/miopen/prelu/solvers.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +using BackwardSolverBase = + NonTunableSolverBase; + +struct Backward : BackwardSolverBase +{ + bool MayNeedWorkspace() const override { return true; } +}; + +struct SingleWeightBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +struct MultiWeightsBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +} // namespace prelu + +} // namespace solver + +} // namespace miopen 
diff --git a/src/include/miopen/prelu/utils.hpp b/src/include/miopen/prelu/utils.hpp new file mode 100644 index 0000000000..b7551a4135 --- /dev/null +++ b/src/include/miopen/prelu/utils.hpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { +namespace solver { +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params); + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index fb81bafb5c..ab824faa32 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,7 +60,8 @@ enum class Primitive Softmax, Adam, Item, - RoPE + RoPE, + ReLU }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..b7149c8809 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -28,6 +28,7 @@ #define MIOPEN_TENSOR_VIEW_UTIL_HPP_ #include +#include #include "../../kernels/tensor_view.hpp" namespace miopen { diff --git a/src/kernels/MIOpenPReLU.cpp b/src/kernels/MIOpenPReLU.cpp new file mode 100644 index 0000000000..750b226e7f --- /dev/null +++ b/src/kernels/MIOpenPReLU.cpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void PReLUBackward(const TI* __restrict__ input, + const TI* __restrict__ weight, + const TO* __restrict__ output_grad, + TI* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= N) + return; + + auto tensor_layout = tensor_layout_t(input_tv, gid); + FLOAT_ACCUM input_v = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(tensor_layout)]); + FLOAT_ACCUM grad_v = + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(input_grad != nullptr) + { + FLOAT_ACCUM weight_v = CVT_FLOAT2ACCUM( + weight[SingleWeight ? 0 : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + FLOAT_ACCUM input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + CVT_ACCUM2FLOAT(input_grad_v); + } + if(weight_grad_collector != nullptr) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } +} + +extern "C" __global__ void PReLUSWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} + +extern "C" __global__ void PReLUMWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/MIOpenReduceSum.cpp b/src/kernels/MIOpenReduceSum.cpp new file mode 100644 index 0000000000..5ed52008bf --- /dev/null +++ b/src/kernels/MIOpenReduceSum.cpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" +#include "warp_reduce.hpp" +#include "block_reduce.hpp" + +template +__device__ void +ReduceSum(const FLOAT_ACCUM* input, TO* output, uint64_t N, tensor_view_t<1> output_tv) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + + FLOAT_ACCUM val = gid < N ? 
input[gid] : CVT_FP32_2ACCUM(0.0f); + val = block_reduce(val); + + if(threadIdx.x == 0) + output[output_tv.get_tensor_view_idx({blockIdx.x})] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void ReduceSum(const FLOAT_ACCUM* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + uint64_t N, + tensor_view_t<1> output_tv) +{ + // instantiate the kernel + ReduceSum(input, output, N, output_tv); +} + +extern "C" __global__ void ReduceSumFLOATACCUM(const FLOAT_ACCUM* __restrict__ input, + FLOAT_ACCUM* __restrict__ output, + uint64_t N) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + + FLOAT_ACCUM val = gid < N ? input[gid] : 0.0f; + val = block_reduce(val); + + if(threadIdx.x == 0) + output[blockIdx.x] = val; +} + +template +__device__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input, + TO* __restrict__ output, + uint64_t output_numel, + uint64_t inner_size, + uint64_t outer_size, + tensor_view_t<1> output_tv) +{ + uint64_t tid = threadIdx.x; + uint64_t oidx = blockIdx.x; + + // use double instead of FLOAT_ACCUM for better precision + double sum_double = 0.0; + for(uint64_t i = tid; i < outer_size * inner_size; i += blockDim.x) + sum_double += static_cast( + input[i / inner_size * output_numel * inner_size + oidx * inner_size + i % inner_size]); + + FLOAT_ACCUM sum = static_cast(sum_double); + sum = block_reduce(sum); + + if(tid == 0) + output[output_tv.get_tensor_view_idx({oidx})] = CVT_ACCUM2FLOAT(sum); +} + +extern "C" __global__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + uint64_t output_numel, + uint64_t inner_size, + uint64_t outer_size, + tensor_view_t<1> output_tv) +{ + // instantiate the kernel + Reduce1dSum(input, output, output_numel, inner_size, outer_size, output_tv); +} diff --git a/src/kernels/block_reduce.hpp b/src/kernels/block_reduce.hpp new file mode 100644 index 0000000000..7d298a6ead --- /dev/null +++ b/src/kernels/block_reduce.hpp @@ -0,0 +1,72 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_BLOCK_REDUCE_HPP +#define GUARD_BLOCK_REDUCE_HPP + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "warp_reduce.hpp" + +enum class ReduceThreadDim : int32_t +{ + X = 1 << 0, + Y = 1 << 1, + Z = 1 << 2, +}; + +template +__device__ FLOAT_ACCUM block_reduce(FLOAT_ACCUM val) +{ + if(reduce_size == warpSize) + return warp_reduce(val); + + static __shared__ FLOAT_ACCUM shared[reduce_size / warpSize]; + uint64_t tid = 0; + if(static_cast(thread_dim) & static_cast(ReduceThreadDim::X)) + tid += threadIdx.x; + if(static_cast(thread_dim) & static_cast(ReduceThreadDim::Y)) + tid = tid * blockDim.y + threadIdx.y; + if(static_cast(thread_dim) & static_cast(ReduceThreadDim::Z)) + tid = tid * blockDim.z + threadIdx.z; + const uint64_t lane = tid % warpSize; + const uint64_t wid = tid / warpSize; + + val = warp_reduce(val); + if(lane == 0) + shared[wid] = val; + __syncthreads(); + + val = tid < reduce_size / warpSize ? 
shared[lane] : 0; + if(wid == 0) + val = warp_reduce(val); + return val; +} + +#endif // GUARD_BLOCK_REDUCE_HPP diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -72,6 +74,15 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(std::initializer_list layout_) + { + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } + } + uint64_t layout[N]; }; diff --git a/src/kernels/warp_reduce.hpp b/src/kernels/warp_reduce.hpp new file mode 100644 index 0000000000..f1490bd214 --- /dev/null +++ b/src/kernels/warp_reduce.hpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_WARP_REDUCE_HPP +#define GUARD_WARP_REDUCE_HPP + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" + +enum class BinaryOp_t +{ + Add, +}; + +template +struct BinaryFunc; + +template +struct BinaryFunc +{ + constexpr void exec(T& a, const T& b) { a += b; } +}; + +template +__device__ FLOAT_ACCUM warp_reduce(FLOAT_ACCUM val) +{ + for(auto d = ws / 2; d >= 1; d >>= 1) + BinaryFunc{}.exec(val, __shfl_down(val, d)); + return val; +} + +#endif // GUARD_WARP_REDUCE_HPP diff --git a/src/prelu.cpp b/src/prelu.cpp new file mode 100644 index 0000000000..3966c1f45f --- /dev/null +++ b/src/prelu.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +namespace miopen { + +size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = + prelu::BackwardProblemDescription{inputDesc, weightDesc, inputDesc, inputDesc, weightDesc}; + + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? 
static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight) +{ + const auto problem = prelu::BackwardProblemDescription{ + inputDesc, weightDesc, doutputDesc, dinputDesc, dweightDesc}; + + const auto invoke_params = [&]() { + auto tmp = prelu::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.weightDesc = &weightDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinputDesc = &dinputDesc; + tmp.dweightDesc = &dweightDesc; + tmp.input = input; + tmp.weight = weight; + tmp.doutput = doutput; + tmp.dinput = dinput; + tmp.dweight = dweight; + tmp.workspace = workspace; + tmp.workspace_size = workspaceSizeInBytes; + return tmp; + }(); + + const auto algo = AlgorithmName{"PReLUBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/prelu/problem_description.cpp b/src/prelu/problem_description.cpp new file mode 100644 index 0000000000..f364c60d19 --- /dev/null +++ b/src/prelu/problem_description.cpp @@ -0,0 +1,76 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetType() != y.GetType()) + return false; + return true; +} + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetNumDims() != y.GetNumDims()) + return false; + for(int32_t i = 0; i < x.GetNumDims(); ++i) + { + if(x.GetLengths()[i] != y.GetLengths()[i]) + return false; + } + return true; +} + +NetworkConfig BackwardProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto num_params = weightDesc.GetElementSize(); + + std::ostringstream ss; + + ss << "prelu_bwd"; + ss << "idtype" << input_dtype; + ss << "odtype" << output_dtype; + ss << "ndim" << inputDesc.GetNumDims(); + ss << "size" << size; + ss << "num_params" << num_params; + + return NetworkConfig{ss.str()}; +} + +} // namespace prelu + +} // namespace miopen diff --git a/src/prelu_api.cpp b/src/prelu_api.cpp new file mode 100644 index 0000000000..5e670a0a2b --- /dev/null +++ b/src/prelu_api.cpp @@ -0,0 +1,133 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdPReLU(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(inputDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "prelufp16"; + } + else if(dtype == miopenFloat) + { + ss << "prelufp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "prelubfp16"; + } + + MIOPEN_LOG_FUNCTION(inputDesc, weightDesc); + ss << " --input " << miopen::deref(inputDesc).GetLengths(); + ss << " --weight " << miopen::deref(weightDesc).GetLengths(); + ss << " -F " << ((is_fwd) ? "1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION(handle, inputDesc, weightDesc, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::GetPReLUBackwardWorkspaceSize( + miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(weightDesc)); + }); +} + +extern "C" miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + const size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t dweightDesc, + void* dweight) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + 
inputDesc, + input, + weightDesc, + weight, + doutputDesc, + doutput, + dinputDesc, + dinput, + dweightDesc, + dweight); + + LogCmdPReLU(inputDesc, weightDesc, false); + return miopen::try_([&] { + miopen::PReLUBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(doutputDesc), + DataCast(doutput), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(dweightDesc), + DataCast(dweight)); + }); +} diff --git a/src/solver.cpp b/src/solver.cpp index bbd13bd89f..eccdba1c31 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -676,6 +677,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId()); + // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/prelu/backward_prelu_multi_weights.cpp b/src/solver/prelu/backward_prelu_multi_weights.cpp new file mode 100644 index 0000000000..5fed375a2b --- /dev/null +++ b/src/solver/prelu/backward_prelu_multi_weights.cpp @@ -0,0 +1,183 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define warpSizeCTX (context.GetStream().GetWavefrontWidth()) +#define LOCAL_SIZE_MW_BWD 256 +#define LOCAL_SIZE_MW_REDUCE_BWD warpSizeCTX + +namespace miopen { + +namespace solver { + +namespace prelu { + +bool MultiWeightsBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +MultiWeightsBackward::GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_MW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUMWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. 
*/ + { + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_MW_REDUCE_BWD}, + }; + result.construction_params.push_back( + make_hip_kernel({LOCAL_SIZE_MW_REDUCE_BWD}, + {problem.GetdWeightDesc().GetElementSize() * LOCAL_SIZE_MW_REDUCE_BWD}, + "MIOpenReduceSum.cpp", + "Reduce1dSum", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. */ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + params.workspace, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. 
*/ + { + uint64_t output_numel = deref(params.weightDesc).GetElementSize(); + uint64_t outer_size = deref(params.inputDesc).GetLengths()[0]; + uint64_t inner_size = + deref(params.inputDesc).GetElementSize() / outer_size / output_numel; + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.workspace, + params.dweight, + output_numel, + inner_size, + outer_size, + weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t MultiWeightsBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto size = problem.GetdInputDesc().GetElementSize(); + size *= get_data_size(miopenFloat); + return size; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/backward_prelu_single_weight.cpp b/src/solver/prelu/backward_prelu_single_weight.cpp new file mode 100644 index 0000000000..2f8ba825f4 --- /dev/null +++ b/src/solver/prelu/backward_prelu_single_weight.cpp @@ -0,0 +1,202 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define LOCAL_SIZE_SW_BWD 256 +#define LOCAL_SIZE_SW_REDUCE_BWD 256 + +namespace miopen { + +namespace solver { + +namespace prelu { + +namespace { +MultiBufferWorkspaceTraits GetMultiBufferWorkspaceTraits(const TensorDescriptor& inputDesc) +{ + auto size = inputDesc.GetElementSize(); + return MultiBufferWorkspaceTraits{size * get_data_size(miopenFloat), + AlignUp(size, LOCAL_SIZE_SW_REDUCE_BWD) / + LOCAL_SIZE_SW_REDUCE_BWD * get_data_size(miopenFloat)}; +} +} // namespace + +bool SingleWeightBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(!problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +SingleWeightBackward::GetSolution(const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUSWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_SW_REDUCE_BWD}, + }; + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_SW_REDUCE_BWD}, + {size}, + "MIOpenReduceSum.cpp", + "ReduceSumFLOATACCUM", + build_params)); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + } + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_REDUCE_BWD}, {size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params)); + } + + auto getBuffPart = [ws = GetMultiBufferWorkspaceTraits(problem.GetdInputDesc())]( + void* buffer, size_t part_idx) { + return static_cast(static_cast(buffer) + ws.GetOffset(part_idx)); + }; + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + auto work_a = getBuffPart(params.GetWorkspace(), 0); + auto work_b = getBuffPart(params.GetWorkspace(), 1); + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. 
*/ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + work_a, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. */ + { + uint64_t size = deref(params.inputDesc).GetElementSize(); + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + auto kernel = handle_.Run(kernels[kernelCnt++]); + kernel(work_a, work_b, size); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + std::swap(work_a, work_b); + } + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + handle_.Run(kernels[kernelCnt++])(work_a, params.dweight, size, weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t SingleWeightBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + return GetMultiBufferWorkspaceTraits(problem.GetdInputDesc()).GetSize(); +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/utils.cpp b/src/solver/prelu/utils.cpp new file mode 100644 index 0000000000..d67e0bdac5 --- /dev/null +++ b/src/solver/prelu/utils.cpp @@ -0,0 +1,53 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) +{ + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/test/cpu_prelu.hpp b/test/cpu_prelu.hpp new file mode 100644 index 0000000000..952099c3f6 --- /dev/null +++ b/test/cpu_prelu.hpp @@ -0,0 +1,100 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "tensor_holder.hpp" +#include + +template +void cpu_prelu_backward(const tensor input, + const tensor weight, + const tensor output_grad, + tensor& ref_input_grad, + tensor& ref_weight_grad, + const bool has_dinput = true, + const bool has_dweight = true) +{ + auto N = input.desc.GetElementSize(); + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto weight_tv = miopen::get_inner_expanded_tv<1>(weight.desc); + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(ref_input_grad.desc); + auto weight_grad_tv = miopen::get_inner_expanded_tv<1>(ref_weight_grad.desc); + + auto weight_grad_collector = std::vector(N); + + par_ford(N)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(has_dinput) + { + float weight_v; + weight_v = static_cast( + weight[weight.desc.GetElementSize() == 1 + ? 0 + : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + ref_input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(has_dweight) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(has_dweight) + { + if(weight.desc.GetElementSize() == 1) + { + double sum = 0; + for(int i = 0; i < N; ++i) + sum += static_cast(weight_grad_collector[i]); + ref_weight_grad[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight.desc.GetElementSize())([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + ref_weight_grad[weight_grad_tv.get_tensor_view_idx({i})] = static_cast(sum); + }); + } + } +} diff --git a/test/gtest/prelu.cpp b/test/gtest/prelu.cpp new file mode 100644 index 0000000000..1d0cd4d431 --- /dev/null +++ b/test/gtest/prelu.cpp @@ -0,0 +1,118 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "prelu.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace prelu { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +bool CheckFloatArg(std::string arg) +{ + if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == arg)) + { + return true; + } + return false; +} + +struct GPU_PReLU_bwd_FP32 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_FP16 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_BFP16 : PReLUTest +{ +}; + +} // namespace prelu +using namespace prelu; + +TEST_P(GPU_PReLU_bwd_FP32, Test) +{ + if(CheckFloatArg("--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_FP16, Test) +{ + if(CheckFloatArg("--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_BFP16, Test) +{ + if(CheckFloatArg("--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUSmokeTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_BFP16, 
testing::ValuesIn(PReLUFullTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUPerfTestConfigs())); diff --git a/test/gtest/prelu.hpp b/test/gtest/prelu.hpp new file mode 100644 index 0000000000..510cfb85f4 --- /dev/null +++ b/test/gtest/prelu.hpp @@ -0,0 +1,247 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "cpu_prelu.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct PReLUTestCase +{ + std::vector lengths; + bool full_params; + bool contiguous; + + friend std::ostream& operator<<(std::ostream& os, const PReLUTestCase& tc) + { + return os << " Lengths:" << tc.lengths + << " Full_num_params:" << (tc.full_params ? "True" : "False") + << " Contiguous:" << (tc.contiguous ? "True" : "False"); + } +}; + +inline std::vector PReLUSmokeTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 4}, true, true}); + tcs.push_back({{64, 4}, true, false}); + tcs.push_back({{64, 4}, false, true}); + tcs.push_back({{64, 4}, false, false}); + + tcs.push_back({{64, 112}, true, true}); + tcs.push_back({{64, 112}, true, false}); + tcs.push_back({{64, 112}, false, true}); + tcs.push_back({{64, 112}, false, false}); + + return tcs; +} + +inline std::vector PReLUPerfTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 112, 50}, true, true}); + tcs.push_back({{64, 112, 50}, true, false}); + tcs.push_back({{64, 112, 50}, false, true}); + tcs.push_back({{64, 112, 50}, false, false}); + + return tcs; +} + +inline std::vector PReLUFullTestConfigs() +{ + std::vector tcs; + + auto smoke_test = PReLUSmokeTestConfigs(); + auto perf_test = PReLUPerfTestConfigs(); + + tcs.reserve(smoke_test.size() + perf_test.size()); + for(const auto& test : smoke_test) + tcs.push_back(test); + for(const auto& test : perf_test) + tcs.push_back(test); + + return tcs; +} + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + 
std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +struct PReLUTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + prelu_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 100); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 99); }; + + auto lengths = prelu_config.lengths; + + auto input_strides = GetStrides(lengths, prelu_config.contiguous); + input = tensor{lengths, input_strides}.generate(gen_value1); + + std::vector weight_length = {1}; + if(prelu_config.full_params) + weight_length = {lengths[1]}; + std::vector weight_strides = {prelu_config.contiguous ? 1 : 2}; + weight = tensor{weight_length, weight_strides}.generate(gen_value2); + + ws_sizeInBytes = miopen::GetPReLUBackwardWorkspaceSize(handle, input.desc, weight.desc); + if(ws_sizeInBytes == static_cast(-1)) + GTEST_SKIP(); + + if(ws_sizeInBytes != 0) + { + std::vector workspace_dims; + workspace_dims.push_back(ws_sizeInBytes / sizeof(float)); + + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), 0.0f); + + workspace_dev = handle.Write(workspace.data); + } + + dinput = tensor{lengths, input_strides}; + ref_dinput = tensor{lengths, input_strides}; + std::fill(dinput.begin(), dinput.end(), static_cast(0.0f)); + std::fill(ref_dinput.begin(), ref_dinput.end(), static_cast(0.0f)); + + dweight = tensor{weight_length, weight_strides}; + ref_dweight = tensor{weight_length, weight_strides}; + std::fill(dweight.begin(), dweight.end(), static_cast(0.0f)); + std::fill(ref_dweight.begin(), ref_dweight.end(), static_cast(0.0f)); + + auto out_strides = GetStrides(lengths, true); + doutput = tensor{lengths, out_strides}; + 
std::fill(doutput.begin(), doutput.end(), static_cast(1.0f)); + + input_dev = handle.Write(input.data); + weight_dev = handle.Write(weight.data); + doutput_dev = handle.Write(doutput.data); + dinput_dev = handle.Write(dinput.data); + dweight_dev = handle.Write(dweight.data); + } + + void RunTest() + { + cpu_prelu_backward(input, weight, doutput, ref_dinput, ref_dweight); + + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::PReLUBackward(handle, + workspace_dev.get(), + ws_sizeInBytes, + input.desc, + input_dev.get(), + weight.desc, + weight_dev.get(), + doutput.desc, + doutput_dev.get(), + dinput.desc, + dinput_dev.get(), + dweight.desc, + dweight_dev.get()); + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + dweight.data = handle.Read(dweight_dev, dweight.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + + auto error_dinput = miopen::rms_range(ref_dinput, dinput); + auto error_dweight = miopen::rms_range(ref_dweight, dweight); + ASSERT_EQ(miopen::range_distance(ref_dinput), miopen::range_distance(dinput)); + ASSERT_EQ(miopen::range_distance(ref_dweight), miopen::range_distance(dweight)); + EXPECT_LT(error_dinput, tolerance) + << "Error backward Input Gradient beyond tolerance Error: " << error_dinput + << ", Tolerance: " << tolerance; + EXPECT_LT(error_dweight, tolerance) + << "Error backward Weight Gradient beyond tolerance Error: " << error_dweight + << ", Tolerance: " << tolerance; + } + + PReLUTestCase prelu_config; + + tensor input; + tensor weight; + tensor doutput; + tensor dinput; + tensor dweight; + tensor workspace; + + tensor ref_dinput; + tensor ref_dweight; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr weight_dev; + miopen::Allocator::ManageDataPtr doutput_dev; + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr dweight_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + + size_t ws_sizeInBytes; +};