From 6feaec91f4969c202c4ba9e7ace1a50d10b16e03 Mon Sep 17 00:00:00 2001 From: Long Luong <45379462+long10024070@users.noreply.github.com> Date: Wed, 21 Aug 2024 13:06:16 +0700 Subject: [PATCH] Implement PReLU backward (#3152) --- docs/reference/index.rst | 1 + driver/CMakeLists.txt | 1 + driver/dm_prelu.cpp | 40 ++ driver/driver.hpp | 6 +- driver/mloPReLUHost.hpp | 104 +++++ driver/prelu_driver.hpp | 394 ++++++++++++++++++ include/miopen/miopen.h | 55 +++ src/CMakeLists.txt | 10 + src/include/miopen/prelu.hpp | 53 +++ src/include/miopen/prelu/invoke_params.hpp | 59 +++ .../miopen/prelu/problem_description.hpp | 115 +++++ src/include/miopen/prelu/solvers.hpp | 77 ++++ src/include/miopen/prelu/utils.hpp | 45 ++ src/include/miopen/solver_id.hpp | 3 +- src/include/miopen/tensor_view_utils.hpp | 1 + src/kernels/MIOpenPReLU.cpp | 115 +++++ src/kernels/MIOpenReduceSum.cpp | 104 +++++ src/kernels/block_reduce.hpp | 72 ++++ src/kernels/tensor_view.hpp | 11 + src/kernels/warp_reduce.hpp | 58 +++ src/prelu.cpp | 95 +++++ src/prelu/problem_description.cpp | 76 ++++ src/prelu_api.cpp | 133 ++++++ src/solver.cpp | 4 + .../prelu/backward_prelu_multi_weights.cpp | 183 ++++++++ .../prelu/backward_prelu_single_weight.cpp | 202 +++++++++ src/solver/prelu/utils.cpp | 53 +++ test/cpu_prelu.hpp | 100 +++++ test/gtest/prelu.cpp | 118 ++++++ test/gtest/prelu.hpp | 247 +++++++++++ 30 files changed, 2532 insertions(+), 3 deletions(-) create mode 100644 driver/dm_prelu.cpp create mode 100644 driver/mloPReLUHost.hpp create mode 100644 driver/prelu_driver.hpp create mode 100644 src/include/miopen/prelu.hpp create mode 100644 src/include/miopen/prelu/invoke_params.hpp create mode 100644 src/include/miopen/prelu/problem_description.hpp create mode 100644 src/include/miopen/prelu/solvers.hpp create mode 100644 src/include/miopen/prelu/utils.hpp create mode 100644 src/kernels/MIOpenPReLU.cpp create mode 100644 src/kernels/MIOpenReduceSum.cpp create mode 100644 src/kernels/block_reduce.hpp create 
mode 100644 src/kernels/warp_reduce.hpp create mode 100644 src/prelu.cpp create mode 100644 src/prelu/problem_description.cpp create mode 100644 src/prelu_api.cpp create mode 100644 src/solver/prelu/backward_prelu_multi_weights.cpp create mode 100644 src/solver/prelu/backward_prelu_single_weight.cpp create mode 100644 src/solver/prelu/utils.cpp create mode 100644 test/cpu_prelu.hpp create mode 100644 test/gtest/prelu.cpp create mode 100644 test/gtest/prelu.hpp diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 2574dbbf5e..34a045bca8 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -36,3 +36,4 @@ The MIOpen API library is structured as follows: * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental) + * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 19abd61597..15a3941046 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -48,6 +48,7 @@ add_executable(MIOpenDriver dm_layernorm.cpp dm_lrn.cpp dm_pool.cpp + dm_prelu.cpp dm_reduce.cpp dm_reduceextreme.cpp dm_reducecalculation.cpp diff --git a/driver/dm_prelu.cpp b/driver/dm_prelu.cpp new file mode 100644 index 0000000000..06ec99e8a5 --- /dev/null +++ b/driver/dm_prelu.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "prelu_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "prelu") + return new PReLUDriver(); + if(base_arg == "prelufp16") + return new PReLUDriver(); + if(base_arg == "prelubfp16") + return new PReLUDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index 196d48c2b6..d0c708ff1d 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " + "prelu[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -207,7 +208,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && - arg != "ropefp16" && arg != "ropebfp16" && arg != "--version") + arg != "ropefp16" && arg != "ropebfp16" && arg != "prelu" && arg != "prelufp16" && + arg != "prelubfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloPReLUHost.hpp b/driver/mloPReLUHost.hpp new file mode 100644 index 0000000000..c3460b6425 --- /dev/null +++ b/driver/mloPReLUHost.hpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro 
Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <../test/ford.hpp> + +#include +#include +#include + +template +int32_t mloPReLUBackwardRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + const miopenTensorDescriptor_t doutputDesc, + const miopenTensorDescriptor_t dinputDesc, + const Tgpu* input, + const Tgpu* weight, + const Tgpu* doutput, + Tcheck* dinput_host, + Tcheck* dweight_host) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); + auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); + + auto input_sz = miopen::deref(inputDesc).GetElementSize(); + auto weight_sz = miopen::deref(weightDesc).GetElementSize(); + auto weight_grad_collector = std::vector(input_sz); + + par_ford(input_sz)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = static_cast(doutput[doutput_tv.get_tensor_view_idx(tensor_layout)]); + + if(dinput_host) + { + float weight_v; + if(weight_sz == 1) + weight_v = static_cast(weight[0]); + else + weight_v = static_cast(weight[tensor_layout.layout[1]]); + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + dinput_host[dinput_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(dweight_host) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(dweight_host) + { + if(weight_sz == 1) + { + double sum = 0; + for(int i = 0; i < input_sz; ++i) + sum += static_cast(weight_grad_collector[i]); + dweight_host[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight_sz)([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + dweight_host[i] = static_cast(sum); + }); + } + } + + return miopenStatusSuccess; +} diff --git a/driver/prelu_driver.hpp b/driver/prelu_driver.hpp new file mode 100644 index 0000000000..e304f27465 --- /dev/null +++ b/driver/prelu_driver.hpp @@ -0,0 +1,394 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "driver.hpp" +#include "mloPReLUHost.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/ford.hpp> +#include <../test/verify.hpp> + +#include + +template +class PReLUDriver : public Driver +{ +public: + PReLUDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&weightDesc); + miopenCreateTensorDescriptor(&doutputDesc); + miopenCreateTensorDescriptor(&dinputDesc); + miopenCreateTensorDescriptor(&dweightDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~PReLUDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(weightDesc); + miopenDestroyTensorDescriptor(doutputDesc); + miopenDestroyTensorDescriptor(dinputDesc); + miopenDestroyTensorDescriptor(dweightDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t weightDesc; + miopenTensorDescriptor_t doutputDesc; + miopenTensorDescriptor_t dinputDesc; + miopenTensorDescriptor_t dweightDesc; + + std::unique_ptr input_dev; + std::unique_ptr weight_dev; + std::unique_ptr doutput_dev; + 
std::unique_ptr dinput_dev; + std::unique_ptr dweight_dev; + std::unique_ptr workspace_dev; + + std::vector input; + std::vector weight; + std::vector doutput; + std::vector dinput; + std::vector dweight; + + std::vector dinput_host; + std::vector dweight_host; + + size_t ws_sizeInBytes; +}; + +template +int PReLUDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + if(input_length.empty()) + { + std::cout << "Tensor must not be empty"; + return miopenStatusBadParm; + } + + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + if(weight_length[0] != 1 && (input_length.size() == 1 || weight_length[0] != input_length[1])) + { + std::cout << "NumParameters must be 1 or the second dim of DimLengths"; + return miopenStatusBadParm; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::GetandSetData() +{ + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + + if(SetTensorNd(inputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + if(SetTensorNd(dinputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input gradient tensor"); + + if(SetTensorNd(weightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight tensor"); + if(SetTensorNd(dweightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight gradient tensor"); + + if(SetTensorNd(doutputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output gradient tensor"); + + return miopenStatusSuccess; +} + +template +int 
PReLUDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward PReLU (Default=1)", "int"); + inflags.AddTensorFlag("input", 'D', "256x4x1x1x8723", "input tensor descriptor"); + inflags.AddInputFlag( + "NumParameters", + 'P', + "1", + "Number of weight to learn. Although it takes an int as input, there is only two values " + "are legitimate: 1, or the number of channels (the second dim) at input (Default=1)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int PReLUDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t weight_sz = GetTensorSize(weightDesc); + + miopenGetPReLUBackwardWorkspaceSize(GetHandle(), inputDesc, weightDesc, &ws_sizeInBytes); + + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + doutput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dweight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + input = std::vector(input_sz); + weight = std::vector(weight_sz); + doutput = std::vector(input_sz, static_cast(1.0f)); + dinput = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + dweight = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + dinput_host = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + 
dweight_host = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + for(int i = 0; i < weight_sz; i++) + weight[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + if(input_dev->ToGPU(GetStream(), input.data()) != 0) + { + std::cerr << "Error copying (input) to GPU, size: " << input_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(weight_dev->ToGPU(GetStream(), weight.data()) != 0) + { + std::cerr << "Error copying (weight) to GPU, size: " << weight_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(doutput_dev->ToGPU(GetStream(), doutput.data()) != 0) + { + std::cerr << "Error copying (out grad) to GPU, size: " << doutput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dinput_dev->ToGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (input grad) to GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dweight_dev->ToGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (weight grad) to GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunForwardGPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunForwardCPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenPReLUBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + input_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + dinputDesc, + dinput_dev->GetMem(), + dweightDesc, + dweight_dev->GetMem()); + + float time = 0.0; + 
miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward PReLU Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward PReLU Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + if(dweight_dev->FromGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (dweight_dev) from GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunBackwardCPU() +{ + return mloPReLUBackwardRunHost(inputDesc, + weightDesc, + doutputDesc, + dinputDesc, + input.data(), + weight.data(), + doutput.data(), + dinput_host.data(), + dweight_host.data()); +} + +template +Tref PReLUDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int PReLUDriver::VerifyForward() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + auto error_dweight = miopen::rms_range(dweight_host, dweight); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward PReLU Input Gradient FAILED: " << error_dinput << " > " << tolerance + << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Input Gradient Verifies OK on CPU reference (" << error_dinput + << " < " << tolerance << ')' << std::endl; + } + + if(!std::isfinite(error_dweight) || error_dweight > tolerance) + { + std::cout << "Backward PReLU Weight Gradient FAILED: " << error_dweight << " > " + << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Weight Gradient Verifies OK on CPU reference (" + << error_dweight << " < " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 6c66f04867..ccf4b702e0 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -71,6 +71,7 @@ * @defgroup getitem * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings + * @defgroup ReLU * */ @@ -7677,6 +7678,60 @@ MIOPEN_EXPORT miopenStatus_t miopenRoPEBackward(miopenHandle_t handle, // CLOSEOUT ROPE DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +/** @addtogroup ReLU + * + * @{ + */ + +/*! 
@brief Helper function to query the minimum workspace size required by the PReLU backward call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes); + +/*! @brief Execute a PReLU backward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param weight Data tensor weight (input) + * @param doutputDesc Tensor descriptor for output gradient (input) + * @param doutput Gradient of output (input) + * @param dinputDesc Tensor descriptor for input gradient (input) + * @param dinput Gradient of input (output) + * @param dweightDesc Tensor descriptor for weight gradient (input) + * @param dweight Gradient of weight (output) + */ +MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t weightDesc, + const void* weight, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dweightDesc, + void* dweight); + +/** @} */ +// CLOSEOUT RELU DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d2f84f43b8..1fe108679c 100644 --- 
a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -163,6 +163,8 @@ set( MIOpen_Source performance_config.cpp pooling/problem_description.cpp pooling_api.cpp + prelu/problem_description.cpp + prelu_api.cpp problem.cpp process.cpp ramdb.cpp @@ -302,6 +304,9 @@ set( MIOpen_Source solver/pooling/forwardNd.cpp solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp + solver/prelu/backward_prelu_multi_weights.cpp + solver/prelu/backward_prelu_single_weight.cpp + solver/prelu/utils.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_argmin.cpp solver/reduce/forward_max.cpp @@ -448,6 +453,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenReduceCalculation.hpp kernels/MIOpenReduceExtreme.hpp kernels/bfloat16_dev.hpp + kernels/block_reduce.hpp kernels/conv_common.inc kernels/conv_sizes.inc kernels/float_types.h @@ -471,6 +477,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/winograd/Conv_Winograd_Fury_v2_4_1_gfx11_1024vgprs_fp16_fp16acc_f2x3_c16_stride1.inc kernels/winograd/Conv_Winograd_Fury_v2_4_1_metadata.inc kernels/workaround_issue_1431.hpp + kernels/warp_reduce.hpp kernels/xform_bidirect_winograd_code.inc kernels/xform_data_filter.inc kernels/xform_kd_cov2.inc @@ -508,6 +515,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenLRNBwd.cl kernels/MIOpenLRNFwd.cl kernels/MIOpenNeuron.cl + kernels/MIOpenPReLU.cpp kernels/MIOpenPooling.cl kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl @@ -519,6 +527,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenReduceCalculation.cpp kernels/MIOpenReduceExtreme.cpp kernels/MIOpenRoPE.cpp + kernels/MIOpenReduceSum.cpp kernels/MIOpenSoftmax.cl kernels/MIOpenSoftmaxAttn.cpp kernels/MIOpenUtilKernels3.cl @@ -661,6 +670,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN t5layernorm.cpp 
ocl/fusionopconvocl.cpp ocl/fusionopbiasbnactivocl.cpp + prelu.cpp reducecalculation.cpp reduceextreme.cpp rope.cpp diff --git a/src/include/miopen/prelu.hpp b/src/include/miopen/prelu.hpp new file mode 100644 index 0000000000..35e5e7f611 --- /dev/null +++ b/src/include/miopen/prelu.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight); + +} // namespace miopen diff --git a/src/include/miopen/prelu/invoke_params.hpp b/src/include/miopen/prelu/invoke_params.hpp new file mode 100644 index 0000000000..1acc2dc143 --- /dev/null +++ b/src/include/miopen/prelu/invoke_params.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace prelu { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* weightDesc = nullptr; + const TensorDescriptor* doutputDesc = nullptr; + const TensorDescriptor* dinputDesc = nullptr; + const TensorDescriptor* dweightDesc = nullptr; + + ConstData_t input = nullptr; + ConstData_t weight = nullptr; + ConstData_t doutput = nullptr; + Data_t dinput = nullptr; + Data_t dweight = nullptr; + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/problem_description.hpp b/src/include/miopen/prelu/problem_description.hpp new file mode 100644 index 0000000000..ce10d96046 --- /dev/null +++ b/src/include/miopen/prelu/problem_description.hpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y); +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +struct BackwardProblemDescription : ProblemDescriptionBase +{ + + BackwardProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& doutputDesc_, + const TensorDescriptor& dinputDesc_, + const TensorDescriptor& dweightDesc_) + : inputDesc(inputDesc_), + weightDesc(weightDesc_), + doutputDesc(doutputDesc_), + dinputDesc(dinputDesc_), + dweightDesc(dweightDesc_) + { + IsSameType(); + IsRightLength(); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetWeightDesc() const { return weightDesc; } + const TensorDescriptor& GetdOuputDesc() const { return doutputDesc; } + const TensorDescriptor& GetdInputDesc() const { return dinputDesc; } + const TensorDescriptor& GetdWeightDesc() const { return dweightDesc; } + + bool IsSameType() const + { + if(!checkSameType(inputDesc, weightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Input and Weight tensor must have same type."); + if(!checkSameType(inputDesc, dinputDesc) || !checkSameType(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Gradient tensors (excluding Output gradient) must share a same " + "type with Input and Weight tensor."); + return true; + } + + bool IsRightLength() const + { + if(!checkSameLength(inputDesc, doutputDesc) || !checkSameLength(inputDesc, dinputDesc)) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Input and Output Gradient tensors sizes must match with Input tensor."); + if(weightDesc.GetNumDims() != 1) + MIOPEN_THROW(miopenStatusBadParm, "PReLU: Weight tensor must have 1 dimension."); + 
if(weightDesc.GetElementSize() != 1 && + (inputDesc.GetNumDims() == 1 || + weightDesc.GetElementSize() != inputDesc.GetLengths()[1])) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Weight size must be 1 or equal to the second dim of Input tensor."); + if(!checkSameLength(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Weight Gradient tensors sizes must match with Weight tensor."); + return true; + } + + bool IsSingleWeight() const + { + if(weightDesc.GetElementSize() > 1) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputDesc; + TensorDescriptor weightDesc; + TensorDescriptor doutputDesc; + TensorDescriptor dinputDesc; + TensorDescriptor dweightDesc; +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/solvers.hpp b/src/include/miopen/prelu/solvers.hpp new file mode 100644 index 0000000000..a2bec477f0 --- /dev/null +++ b/src/include/miopen/prelu/solvers.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +using BackwardSolverBase = + NonTunableSolverBase; + +struct Backward : BackwardSolverBase +{ + bool MayNeedWorkspace() const override { return true; } +}; + +struct SingleWeightBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +struct MultiWeightsBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +} // namespace prelu + +} // namespace solver + +} // namespace miopen 
diff --git a/src/include/miopen/prelu/utils.hpp b/src/include/miopen/prelu/utils.hpp new file mode 100644 index 0000000000..b7551a4135 --- /dev/null +++ b/src/include/miopen/prelu/utils.hpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { +namespace solver { +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params); + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index fb81bafb5c..ab824faa32 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,7 +60,8 @@ enum class Primitive Softmax, Adam, Item, - RoPE + RoPE, + ReLU }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..b7149c8809 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -28,6 +28,7 @@ #define MIOPEN_TENSOR_VIEW_UTIL_HPP_ #include +#include #include "../../kernels/tensor_view.hpp" namespace miopen { diff --git a/src/kernels/MIOpenPReLU.cpp b/src/kernels/MIOpenPReLU.cpp new file mode 100644 index 0000000000..750b226e7f --- /dev/null +++ b/src/kernels/MIOpenPReLU.cpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void PReLUBackward(const TI* __restrict__ input, + const TI* __restrict__ weight, + const TO* __restrict__ output_grad, + TI* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= N) + return; + + auto tensor_layout = tensor_layout_t(input_tv, gid); + FLOAT_ACCUM input_v = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(tensor_layout)]); + FLOAT_ACCUM grad_v = + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(input_grad != nullptr) + { + FLOAT_ACCUM weight_v = CVT_FLOAT2ACCUM( + weight[SingleWeight ? 0 : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + FLOAT_ACCUM input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + CVT_ACCUM2FLOAT(input_grad_v); + } + if(weight_grad_collector != nullptr) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } +} + +extern "C" __global__ void PReLUSWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} + +extern "C" __global__ void PReLUMWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/MIOpenReduceSum.cpp b/src/kernels/MIOpenReduceSum.cpp new file mode 100644 index 0000000000..5ed52008bf --- /dev/null +++ b/src/kernels/MIOpenReduceSum.cpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" +#include "warp_reduce.hpp" +#include "block_reduce.hpp" + +template +__device__ void +ReduceSum(const FLOAT_ACCUM* input, TO* output, uint64_t N, tensor_view_t<1> output_tv) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + + FLOAT_ACCUM val = gid < N ? 
input[gid] : CVT_FP32_2ACCUM(0.0f); + val = block_reduce(val); + + if(threadIdx.x == 0) + output[output_tv.get_tensor_view_idx({blockIdx.x})] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void ReduceSum(const FLOAT_ACCUM* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + uint64_t N, + tensor_view_t<1> output_tv) +{ + // instantiate the kernel + ReduceSum(input, output, N, output_tv); +} + +extern "C" __global__ void ReduceSumFLOATACCUM(const FLOAT_ACCUM* __restrict__ input, + FLOAT_ACCUM* __restrict__ output, + uint64_t N) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + + FLOAT_ACCUM val = gid < N ? input[gid] : 0.0f; + val = block_reduce(val); + + if(threadIdx.x == 0) + output[blockIdx.x] = val; +} + +template +__device__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input, + TO* __restrict__ output, + uint64_t output_numel, + uint64_t inner_size, + uint64_t outer_size, + tensor_view_t<1> output_tv) +{ + uint64_t tid = threadIdx.x; + uint64_t oidx = blockIdx.x; + + // use double instead of FLOAT_ACCUM for better precision + double sum_double = 0.0; + for(uint64_t i = tid; i < outer_size * inner_size; i += blockDim.x) + sum_double += static_cast( + input[i / inner_size * output_numel * inner_size + oidx * inner_size + i % inner_size]); + + FLOAT_ACCUM sum = static_cast(sum_double); + sum = block_reduce(sum); + + if(tid == 0) + output[output_tv.get_tensor_view_idx({oidx})] = CVT_ACCUM2FLOAT(sum); +} + +extern "C" __global__ void Reduce1dSum(const FLOAT_ACCUM* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + uint64_t output_numel, + uint64_t inner_size, + uint64_t outer_size, + tensor_view_t<1> output_tv) +{ + // instantiate the kernel + Reduce1dSum(input, output, output_numel, inner_size, outer_size, output_tv); +} diff --git a/src/kernels/block_reduce.hpp b/src/kernels/block_reduce.hpp new file mode 100644 index 0000000000..7d298a6ead --- /dev/null +++ b/src/kernels/block_reduce.hpp @@ -0,0 +1,72 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_BLOCK_REDUCE_HPP +#define GUARD_BLOCK_REDUCE_HPP + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "warp_reduce.hpp" + +enum class ReduceThreadDim : int32_t +{ + X = 1 << 0, + Y = 1 << 1, + Z = 1 << 2, +}; + +template +__device__ FLOAT_ACCUM block_reduce(FLOAT_ACCUM val) +{ + if(reduce_size == warpSize) + return warp_reduce(val); + + static __shared__ FLOAT_ACCUM shared[reduce_size / warpSize]; + uint64_t tid = 0; + if(static_cast(thread_dim) & static_cast(ReduceThreadDim::X)) + tid += threadIdx.x; + if(static_cast(thread_dim) & static_cast(ReduceThreadDim::Y)) + tid = tid * blockDim.y + threadIdx.y; + if(static_cast(thread_dim) & static_cast(ReduceThreadDim::Z)) + tid = tid * blockDim.z + threadIdx.z; + const uint64_t lane = tid % warpSize; + const uint64_t wid = tid / warpSize; + + val = warp_reduce(val); + if(lane == 0) + shared[wid] = val; + __syncthreads(); + + val = tid < reduce_size / warpSize ? 
shared[lane] : 0; + if(wid == 0) + val = warp_reduce(val); + return val; +} + +#endif // GUARD_BLOCK_REDUCE_HPP diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -72,6 +74,15 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(std::initializer_list layout_) + { + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } + } + uint64_t layout[N]; }; diff --git a/src/kernels/warp_reduce.hpp b/src/kernels/warp_reduce.hpp new file mode 100644 index 0000000000..f1490bd214 --- /dev/null +++ b/src/kernels/warp_reduce.hpp @@ -0,0 +1,58 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_WARP_REDUCE_HPP +#define GUARD_WARP_REDUCE_HPP + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" + +enum class BinaryOp_t +{ + Add, +}; + +template +struct BinaryFunc; + +template +struct BinaryFunc +{ + constexpr void exec(T& a, const T& b) { a += b; } +}; + +template +__device__ FLOAT_ACCUM warp_reduce(FLOAT_ACCUM val) +{ + for(auto d = ws / 2; d >= 1; d >>= 1) + BinaryFunc{}.exec(val, __shfl_down(val, d)); + return val; +} + +#endif // GUARD_WARP_REDUCE_HPP diff --git a/src/prelu.cpp b/src/prelu.cpp new file mode 100644 index 0000000000..3966c1f45f --- /dev/null +++ b/src/prelu.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +namespace miopen { + +size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = + prelu::BackwardProblemDescription{inputDesc, weightDesc, inputDesc, inputDesc, weightDesc}; + + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? 
static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight) +{ + const auto problem = prelu::BackwardProblemDescription{ + inputDesc, weightDesc, doutputDesc, dinputDesc, dweightDesc}; + + const auto invoke_params = [&]() { + auto tmp = prelu::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.weightDesc = &weightDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinputDesc = &dinputDesc; + tmp.dweightDesc = &dweightDesc; + tmp.input = input; + tmp.weight = weight; + tmp.doutput = doutput; + tmp.dinput = dinput; + tmp.dweight = dweight; + tmp.workspace = workspace; + tmp.workspace_size = workspaceSizeInBytes; + return tmp; + }(); + + const auto algo = AlgorithmName{"PReLUBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/prelu/problem_description.cpp b/src/prelu/problem_description.cpp new file mode 100644 index 0000000000..f364c60d19 --- /dev/null +++ b/src/prelu/problem_description.cpp @@ -0,0 +1,76 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetType() != y.GetType()) + return false; + return true; +} + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetNumDims() != y.GetNumDims()) + return false; + for(int32_t i = 0; i < x.GetNumDims(); ++i) + { + if(x.GetLengths()[i] != y.GetLengths()[i]) + return false; + } + return true; +} + +NetworkConfig BackwardProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto num_params = weightDesc.GetElementSize(); + + std::ostringstream ss; + + ss << "prelu_bwd"; + ss << "idtype" << input_dtype; + ss << "odtype" << output_dtype; + ss << "ndim" << inputDesc.GetNumDims(); + ss << "size" << size; + ss << "num_params" << num_params; + + return NetworkConfig{ss.str()}; +} + +} // namespace prelu + +} // namespace miopen diff --git a/src/prelu_api.cpp b/src/prelu_api.cpp new file mode 100644 index 0000000000..5e670a0a2b --- /dev/null +++ b/src/prelu_api.cpp @@ -0,0 +1,133 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdPReLU(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(inputDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "prelufp16"; + } + else if(dtype == miopenFloat) + { + ss << "prelufp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "prelubfp16"; + } + + MIOPEN_LOG_FUNCTION(inputDesc, weightDesc); + ss << " --input " << miopen::deref(inputDesc).GetLengths(); + ss << " --weight " << miopen::deref(weightDesc).GetLengths(); + ss << " -F " << ((is_fwd) ? "1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION(handle, inputDesc, weightDesc, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::GetPReLUBackwardWorkspaceSize( + miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(weightDesc)); + }); +} + +extern "C" miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + const size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t dweightDesc, + void* dweight) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + 
inputDesc, + input, + weightDesc, + weight, + doutputDesc, + doutput, + dinputDesc, + dinput, + dweightDesc, + dweight); + + LogCmdPReLU(inputDesc, weightDesc, false); + return miopen::try_([&] { + miopen::PReLUBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(doutputDesc), + DataCast(doutput), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(dweightDesc), + DataCast(dweight)); + }); +} diff --git a/src/solver.cpp b/src/solver.cpp index bbd13bd89f..eccdba1c31 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -676,6 +677,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId()); + // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/prelu/backward_prelu_multi_weights.cpp b/src/solver/prelu/backward_prelu_multi_weights.cpp new file mode 100644 index 0000000000..5fed375a2b --- /dev/null +++ b/src/solver/prelu/backward_prelu_multi_weights.cpp @@ -0,0 +1,183 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define warpSizeCTX (context.GetStream().GetWavefrontWidth()) +#define LOCAL_SIZE_MW_BWD 256 +#define LOCAL_SIZE_MW_REDUCE_BWD warpSizeCTX + +namespace miopen { + +namespace solver { + +namespace prelu { + +bool MultiWeightsBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +MultiWeightsBackward::GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_MW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUMWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. 
*/ + { + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_MW_REDUCE_BWD}, + }; + result.construction_params.push_back( + make_hip_kernel({LOCAL_SIZE_MW_REDUCE_BWD}, + {problem.GetdWeightDesc().GetElementSize() * LOCAL_SIZE_MW_REDUCE_BWD}, + "MIOpenReduceSum.cpp", + "Reduce1dSum", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. */ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + params.workspace, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. 
*/ + { + uint64_t output_numel = deref(params.weightDesc).GetElementSize(); + uint64_t outer_size = deref(params.inputDesc).GetLengths()[0]; + uint64_t inner_size = + deref(params.inputDesc).GetElementSize() / outer_size / output_numel; + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.workspace, + params.dweight, + output_numel, + inner_size, + outer_size, + weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t MultiWeightsBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto size = problem.GetdInputDesc().GetElementSize(); + size *= get_data_size(miopenFloat); + return size; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/backward_prelu_single_weight.cpp b/src/solver/prelu/backward_prelu_single_weight.cpp new file mode 100644 index 0000000000..2f8ba825f4 --- /dev/null +++ b/src/solver/prelu/backward_prelu_single_weight.cpp @@ -0,0 +1,202 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define LOCAL_SIZE_SW_BWD 256 +#define LOCAL_SIZE_SW_REDUCE_BWD 256 + +namespace miopen { + +namespace solver { + +namespace prelu { + +namespace { +MultiBufferWorkspaceTraits GetMultiBufferWorkspaceTraits(const TensorDescriptor& inputDesc) +{ + auto size = inputDesc.GetElementSize(); + return MultiBufferWorkspaceTraits{size * get_data_size(miopenFloat), + AlignUp(size, LOCAL_SIZE_SW_REDUCE_BWD) / + LOCAL_SIZE_SW_REDUCE_BWD * get_data_size(miopenFloat)}; +} +} // namespace + +bool SingleWeightBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(!problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +SingleWeightBackward::GetSolution(const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUSWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_SW_REDUCE_BWD}, + }; + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_SW_REDUCE_BWD}, + {size}, + "MIOpenReduceSum.cpp", + "ReduceSumFLOATACCUM", + build_params)); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + } + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_REDUCE_BWD}, {size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params)); + } + + auto getBuffPart = [ws = GetMultiBufferWorkspaceTraits(problem.GetdInputDesc())]( + void* buffer, size_t part_idx) { + return static_cast(static_cast(buffer) + ws.GetOffset(part_idx)); + }; + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + auto work_a = getBuffPart(params.GetWorkspace(), 0); + auto work_b = getBuffPart(params.GetWorkspace(), 1); + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. 
*/ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + work_a, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. */ + { + uint64_t size = deref(params.inputDesc).GetElementSize(); + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + auto kernel = handle_.Run(kernels[kernelCnt++]); + kernel(work_a, work_b, size); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + std::swap(work_a, work_b); + } + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + handle_.Run(kernels[kernelCnt++])(work_a, params.dweight, size, weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t SingleWeightBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + return GetMultiBufferWorkspaceTraits(problem.GetdInputDesc()).GetSize(); +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/utils.cpp b/src/solver/prelu/utils.cpp new file mode 100644 index 0000000000..d67e0bdac5 --- /dev/null +++ b/src/solver/prelu/utils.cpp @@ -0,0 +1,53 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) +{ + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/test/cpu_prelu.hpp b/test/cpu_prelu.hpp new file mode 100644 index 0000000000..952099c3f6 --- /dev/null +++ b/test/cpu_prelu.hpp @@ -0,0 +1,100 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "tensor_holder.hpp" +#include + +template +void cpu_prelu_backward(const tensor input, + const tensor weight, + const tensor output_grad, + tensor& ref_input_grad, + tensor& ref_weight_grad, + const bool has_dinput = true, + const bool has_dweight = true) +{ + auto N = input.desc.GetElementSize(); + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto weight_tv = miopen::get_inner_expanded_tv<1>(weight.desc); + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(ref_input_grad.desc); + auto weight_grad_tv = miopen::get_inner_expanded_tv<1>(ref_weight_grad.desc); + + auto weight_grad_collector = std::vector(N); + + par_ford(N)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(has_dinput) + { + float weight_v; + weight_v = static_cast( + weight[weight.desc.GetElementSize() == 1 + ? 0 + : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + ref_input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(has_dweight) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(has_dweight) + { + if(weight.desc.GetElementSize() == 1) + { + double sum = 0; + for(int i = 0; i < N; ++i) + sum += static_cast(weight_grad_collector[i]); + ref_weight_grad[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight.desc.GetElementSize())([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + ref_weight_grad[weight_grad_tv.get_tensor_view_idx({i})] = static_cast(sum); + }); + } + } +} diff --git a/test/gtest/prelu.cpp b/test/gtest/prelu.cpp new file mode 100644 index 0000000000..1d0cd4d431 --- /dev/null +++ b/test/gtest/prelu.cpp @@ -0,0 +1,118 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "prelu.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace prelu { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +bool CheckFloatArg(std::string arg) +{ + if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == arg)) + { + return true; + } + return false; +} + +struct GPU_PReLU_bwd_FP32 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_FP16 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_BFP16 : PReLUTest +{ +}; + +} // namespace prelu +using namespace prelu; + +TEST_P(GPU_PReLU_bwd_FP32, Test) +{ + if(CheckFloatArg("--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_FP16, Test) +{ + if(CheckFloatArg("--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_BFP16, Test) +{ + if(CheckFloatArg("--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUSmokeTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_BFP16, 
testing::ValuesIn(PReLUFullTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUPerfTestConfigs())); diff --git a/test/gtest/prelu.hpp b/test/gtest/prelu.hpp new file mode 100644 index 0000000000..510cfb85f4 --- /dev/null +++ b/test/gtest/prelu.hpp @@ -0,0 +1,247 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "cpu_prelu.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct PReLUTestCase +{ + std::vector lengths; + bool full_params; + bool contiguous; + + friend std::ostream& operator<<(std::ostream& os, const PReLUTestCase& tc) + { + return os << " Lengths:" << tc.lengths + << " Full_num_params:" << (tc.full_params ? "True" : "False") + << " Contiguous:" << (tc.contiguous ? "True" : "False"); + } +}; + +inline std::vector PReLUSmokeTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 4}, true, true}); + tcs.push_back({{64, 4}, true, false}); + tcs.push_back({{64, 4}, false, true}); + tcs.push_back({{64, 4}, false, false}); + + tcs.push_back({{64, 112}, true, true}); + tcs.push_back({{64, 112}, true, false}); + tcs.push_back({{64, 112}, false, true}); + tcs.push_back({{64, 112}, false, false}); + + return tcs; +} + +inline std::vector PReLUPerfTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 112, 50}, true, true}); + tcs.push_back({{64, 112, 50}, true, false}); + tcs.push_back({{64, 112, 50}, false, true}); + tcs.push_back({{64, 112, 50}, false, false}); + + return tcs; +} + +inline std::vector PReLUFullTestConfigs() +{ + std::vector tcs; + + auto smoke_test = PReLUSmokeTestConfigs(); + auto perf_test = PReLUPerfTestConfigs(); + + tcs.reserve(smoke_test.size() + perf_test.size()); + for(const auto& test : smoke_test) + tcs.push_back(test); + for(const auto& test : perf_test) + tcs.push_back(test); + + return tcs; +} + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + 
std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +struct PReLUTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + prelu_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 100); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 99); }; + + auto lengths = prelu_config.lengths; + + auto input_strides = GetStrides(lengths, prelu_config.contiguous); + input = tensor{lengths, input_strides}.generate(gen_value1); + + std::vector weight_length = {1}; + if(prelu_config.full_params) + weight_length = {lengths[1]}; + std::vector weight_strides = {prelu_config.contiguous ? 1 : 2}; + weight = tensor{weight_length, weight_strides}.generate(gen_value2); + + ws_sizeInBytes = miopen::GetPReLUBackwardWorkspaceSize(handle, input.desc, weight.desc); + if(ws_sizeInBytes == static_cast(-1)) + GTEST_SKIP(); + + if(ws_sizeInBytes != 0) + { + std::vector workspace_dims; + workspace_dims.push_back(ws_sizeInBytes / sizeof(float)); + + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), 0.0f); + + workspace_dev = handle.Write(workspace.data); + } + + dinput = tensor{lengths, input_strides}; + ref_dinput = tensor{lengths, input_strides}; + std::fill(dinput.begin(), dinput.end(), static_cast(0.0f)); + std::fill(ref_dinput.begin(), ref_dinput.end(), static_cast(0.0f)); + + dweight = tensor{weight_length, weight_strides}; + ref_dweight = tensor{weight_length, weight_strides}; + std::fill(dweight.begin(), dweight.end(), static_cast(0.0f)); + std::fill(ref_dweight.begin(), ref_dweight.end(), static_cast(0.0f)); + + auto out_strides = GetStrides(lengths, true); + doutput = tensor{lengths, out_strides}; + 
std::fill(doutput.begin(), doutput.end(), static_cast(1.0f)); + + input_dev = handle.Write(input.data); + weight_dev = handle.Write(weight.data); + doutput_dev = handle.Write(doutput.data); + dinput_dev = handle.Write(dinput.data); + dweight_dev = handle.Write(dweight.data); + } + + void RunTest() + { + cpu_prelu_backward(input, weight, doutput, ref_dinput, ref_dweight); + + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::PReLUBackward(handle, + workspace_dev.get(), + ws_sizeInBytes, + input.desc, + input_dev.get(), + weight.desc, + weight_dev.get(), + doutput.desc, + doutput_dev.get(), + dinput.desc, + dinput_dev.get(), + dweight.desc, + dweight_dev.get()); + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + dweight.data = handle.Read(dweight_dev, dweight.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + + auto error_dinput = miopen::rms_range(ref_dinput, dinput); + auto error_dweight = miopen::rms_range(ref_dweight, dweight); + ASSERT_EQ(miopen::range_distance(ref_dinput), miopen::range_distance(dinput)); + ASSERT_EQ(miopen::range_distance(ref_dweight), miopen::range_distance(dweight)); + EXPECT_LT(error_dinput, tolerance) + << "Error backward Input Gradient beyond tolerance Error: " << error_dinput + << ", Tolerance: " << tolerance; + EXPECT_LT(error_dweight, tolerance) + << "Error backward Weight Gradient beyond tolerance Error: " << error_dweight + << ", Tolerance: " << tolerance; + } + + PReLUTestCase prelu_config; + + tensor input; + tensor weight; + tensor doutput; + tensor dinput; + tensor dweight; + tensor workspace; + + tensor ref_dinput; + tensor ref_dweight; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr weight_dev; + miopen::Allocator::ManageDataPtr doutput_dev; + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr dweight_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + + size_t ws_sizeInBytes; +};