diff --git a/.clang-tidy b/.clang-tidy index 8ed34ebb0f..39e6d96b87 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -71,6 +71,19 @@ # -modernize-type-traits # -performance-avoid-endl # +# TODO Code Quality WORKAROUND ROCm 6.2 +# -readability-container-size-empty +# -bugprone-inc-dec-in-conditions +# -readability-avoid-nested-conditional-operator +# -performance-enum-size +# -readability-redundant-member-init +# -bugprone-multi-level-implicit-pointer-conversion +# -clang-analyzer-optin.core.EnumCastOutOfRange +# -cppcoreguidelines-macro-to-enum +# -cppcoreguidelines-owning-memory +# -readability-avoid-return-with-void-value +# -bugprone-unused-local-non-trivial-variable +# Checks: >- *, -abseil-*, @@ -78,9 +91,12 @@ Checks: >- -android-cloexec-fopen, -bugprone-easily-swappable-parameters, -bugprone-exception-escape, + -bugprone-inc-dec-in-conditions, -bugprone-lambda-function-name, -bugprone-macro-parentheses, + -bugprone-multi-level-implicit-pointer-conversion, -bugprone-narrowing-conversions, + -bugprone-unused-local-non-trivial-variable, -bugprone-use-after-move, -cert-env33-c, -cert-err33-c, @@ -90,6 +106,7 @@ Checks: >- -cert-msc51-cpp, -clang-analyzer-alpha.core.CastToStruct, -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, -clang-analyzer-optin.performance.Padding, -clang-diagnostic-extern-c-compat, -clang-diagnostic-unused-command-line-argument, @@ -100,11 +117,13 @@ Checks: >- -cppcoreguidelines-avoid-magic-numbers, -cppcoreguidelines-explicit-virtual-functions, -cppcoreguidelines-init-variables, + -cppcoreguidelines-macro-to-enum, -cppcoreguidelines-macro-usage, -cppcoreguidelines-misleading-capture-default-by-value, -cppcoreguidelines-missing-std-forward, -cppcoreguidelines-narrowing-conversions, -cppcoreguidelines-non-private-member-variables-in-classes, + -cppcoreguidelines-owning-memory, -cppcoreguidelines-prefer-member-initializer, -cppcoreguidelines-pro-bounds-array-to-pointer-decay, 
-cppcoreguidelines-pro-bounds-constant-array-index, @@ -168,11 +187,15 @@ Checks: >- -modernize-concat-nested-namespaces, -modernize-unary-static-assert, -performance-avoid-endl, + -performance-enum-size, + -readability-avoid-nested-conditional-operator, -performance-no-automatic-move, -performance-unnecessary-copy-initialization, -performance-unnecessary-value-param, + -readability-avoid-return-with-void-value, -readability-avoid-unconditional-preprocessor-if, -readability-container-data-pointer, + -readability-container-size-empty, -readability-convert-member-functions-to-static, -readability-else-after-return, -readability-function-cognitive-complexity, @@ -182,6 +205,7 @@ Checks: >- -readability-named-parameter, -readability-qualified-auto, -readability-redundant-declaration, + -readability-redundant-member-init, -readability-redundant-string-init, -readability-simplify-boolean-expr, -readability-suspicious-call-argument, diff --git a/CMakeLists.txt b/CMakeLists.txt index 98201516ad..21f96a0d77 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() -rocm_setup_version(VERSION 3.2.0) +rocm_setup_version(VERSION 3.3.0) list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) include(TargetFlags) diff --git a/Dockerfile b/Dockerfile index 89c8065f48..fb272596aa 100755 --- a/Dockerfile +++ b/Dockerfile @@ -7,7 +7,6 @@ RUN dpkg --add-architecture i386 # Install preliminary dependencies RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - "linux-headers-$(uname -r)" "linux-modules-extra-$(uname -r)" \ apt-utils \ ca-certificates \ curl \ @@ -15,17 +14,21 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ gnupg2 \ wget +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + "linux-headers-$(uname -r)" 
"linux-modules-extra-$(uname -r)" + #Add gpg keys ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg -RUN wget https://repo.radeon.com/amdgpu-install/6.1/ubuntu/jammy/amdgpu-install_6.1.60100-1_all.deb --no-check-certificate +RUN wget https://repo.radeon.com/amdgpu-install/6.2/ubuntu/jammy/amdgpu-install_6.2.60200-1_all.deb --no-check-certificate RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ - ./amdgpu-install_6.1.60100-1_all.deb + ./amdgpu-install_6.2.60200-1_all.deb # Add rocm repository -RUN export ROCM_APT_VER=6.1;\ +RUN export ROCM_APT_VER=6.2;\ echo $ROCM_APT_VER &&\ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCM_APT_VER/ubuntu jammy main > /etc/apt/sources.list.d/amdgpu.list' &&\ sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/$ROCM_APT_VER jammy main > /etc/apt/sources.list.d/rocm.list' @@ -85,6 +88,7 @@ ADD dev-requirements.txt /dev-requirements.txt # Install dependencies # TODO: Add --std=c++14 ARG GPU_ARCH=";" +# install to /opt/rocm will cause permission issue ARG PREFIX=/usr/local ARG USE_FIN="OFF" ARG CCACHE_SECONDARY_STORAGE="" @@ -101,9 +105,12 @@ RUN ccache -s # purge existing composable kernel installed with ROCm # hence cannot use autoremove since it will remove more components +# even purge will remove some other components which is not ideal RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ - composablekernel-dev + composablekernel-dev \ + miopen-hip + ARG COMPILER_LAUNCHER="" # rbuild is used to trigger build of requirements.txt, dev-requirements.txt RUN if [ "$USE_FIN" = "ON" ]; then \ @@ -120,4 +127,7 @@ RUN pip3 install -r /doc-requirements.txt # Composable Kernel requires this version cmake RUN 
pip3 install --upgrade cmake==3.27.5 +# groupadd can add one group a time RUN groupadd -f render +RUN groupadd -f video +RUN usermod -a -G render,video root \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index 424c1d8a2f..6d85a6f7d1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -33,14 +33,21 @@ def cmake_build(Map conf=[:]){ def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined -Wno-option-ignored " + conf.get("extradebugflags", "") def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","") def prefixpath = conf.get("prefixpath","/opt/rocm") + def build_type_debug = (conf.get("build_type",'release') == 'debug') + def code_conv_enabled = conf.get("codecov", false) + def mlir_args = " -DMIOPEN_USE_MLIR=" + conf.get("mlir_build", "ON") + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + if (build_type_debug || code_conv_enabled) + { + mlir_args = " -DMIOPEN_USE_MLIR=OFF" + } + def setup_args = mlir_args + " -DMIOPEN_GPU_SYNC=Off " + conf.get("setup_flags","") def build_fin = conf.get("build_fin", "OFF") setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " - def build_type_debug = (conf.get("build_type",'release') == 'debug') - //cmake_env can overwrite default CXX variables. 
def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") @@ -72,7 +79,7 @@ def cmake_build(Map conf=[:]){ test_flags = " --disable-verification-cache " + test_flags } - if(conf.get("codecov", false)){ //Need + if(code_conv_enabled){ //Need setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags} -fprofile-arcs -ftest-coverage' -DCODECOV_TEST=On " + setup_args }else if(build_type_debug){ setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args @@ -194,9 +201,9 @@ def getDockerImage(Map conf=[:]) { env.DOCKER_BUILDKIT=1 def prefixpath = conf.get("prefixpath", "/opt/rocm") // one image for each prefix 1: /usr/local 2:/opt/rocm - def gpu_arch = "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" // prebuilt dockers should have all the architectures enabled so one image can be used for all stages + def gpu_arch = "gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200" // prebuilt dockers should have all the architectures enabled so one image can be used for all stages def mlir_build = conf.get("mlir_build", "ON") // always ON - def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg GPU_ARCH='${gpu_arch}' --build-arg USE_MLIR='${mlir_build}' " + def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg GPU_TARGETS='${gpu_arch}' --build-arg USE_MLIR='${mlir_build}' " if(env.CCACHE_HOST) { def check_host = sh(script:"""(printf "PING\r\n";) | nc -N ${env.CCACHE_HOST} 6379 """, returnStdout: true).trim() @@ -284,7 +291,7 @@ def buildHipClangJob(Map conf=[:]){ } withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { - timeout(time: 210, unit:'MINUTES') + timeout(time: 300, unit:'MINUTES') { if (lfs_pull) { sh "git lfs pull --exclude=" @@ -648,7 +655,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - 
buildHipClangJobAndReboot(make_targets: Smoke_targets) + buildHipClangJobAndReboot(make_targets: Smoke_targets, build_install: "true") } } stage('Fp32 Hip Debug gfx90a') { @@ -661,7 +668,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets) + buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, build_install: "true") } } stage('Fp32 Hip Debug gfx908') { @@ -674,7 +681,7 @@ pipeline { } agent{ label rocmnode("gfx908") } steps{ - buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets) + buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, build_install: "true") } } stage('Fp32 Hip Debug gfx94X') { @@ -687,7 +694,7 @@ pipeline { } agent{ label rocmnode("gfx94X") } steps{ - buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, needs_reboot:false) + buildHipClangJobAndReboot(build_type: 'debug', make_targets: Smoke_targets, needs_reboot:false, build_install: "true") } } } @@ -711,7 +718,7 @@ pipeline { NOCOMGR_build_cmd = "CTEST_PARALLEL_LEVEL=4 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check" } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOCOMGR_flags, build_cmd: NOCOMGR_build_cmd, test_flags: ' --verbose ') + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOCOMGR_flags, build_cmd: NOCOMGR_build_cmd, test_flags: ' --verbose ', build_install: "true") } } stage('Fp32 Hip Debug NOMLIR gfx90a') { @@ -728,7 +735,7 @@ pipeline { NOMLIR_build_cmd = "CTEST_PARALLEL_LEVEL=4 MIOPEN_LOG_LEVEL=5 make -j\$(nproc) check" } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOMLIR_flags, build_cmd: NOMLIR_build_cmd, test_flags: ' --verbose ') + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: NOMLIR_flags, build_cmd: NOMLIR_build_cmd, test_flags: ' --verbose ', build_install: "true") } } stage('Fp32 Hip Debug NOCK gfx90a Build-Only') { @@ -741,7 +748,7 
@@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: "-DMIOPEN_USE_COMPOSABLEKERNEL=Off", make_targets: "") + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: "-DMIOPEN_USE_COMPOSABLEKERNEL=Off", make_targets: "", build_install: "true") } } stage('Fp32 Hip Debug Embedded Vega20') { @@ -757,7 +764,7 @@ pipeline { Embedded_flags = "-DMIOPEN_EMBED_DB='gfx906_60'" } steps{ - buildHipClangJobAndReboot( build_type: 'debug', setup_flags: Embedded_flags, build_env: extra_log_env, test_flags: ' --verbose ') + buildHipClangJobAndReboot( build_type: 'debug', setup_flags: Embedded_flags, build_env: extra_log_env, test_flags: ' --verbose ', build_install: "true") } } stage('Fp32 Hip Static gfx90a') { @@ -770,7 +777,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot( setup_flags: "-DBUILD_SHARED_LIBS=Off", mlir_build: 'OFF') + buildHipClangJobAndReboot( setup_flags: "-DBUILD_SHARED_LIBS=Off", mlir_build: 'OFF', build_install: "true") } } stage('Fp32 Hip Normal-Find gfx90a') { @@ -787,7 +794,7 @@ pipeline { execute_cmd = "bin/test_conv2d --disable-verification-cache" } steps{ - buildHipClangJobAndReboot(make_targets: make_targets, execute_cmd: execute_cmd, find_mode: "Normal") + buildHipClangJobAndReboot(make_targets: make_targets, execute_cmd: execute_cmd, find_mode: "Normal", build_install: "true") } } stage('Fp32 Hip Fast-Find gfx90a') { @@ -804,7 +811,7 @@ pipeline { execute_cmd = "MIOPEN_FIND_MODE=2 CTEST_PARALLEL_LEVEL=4 bin/test_conv2d --disable-verification-cache" } steps{ - buildHipClangJobAndReboot( make_targets: make_targets, execute_cmd: execute_cmd) + buildHipClangJobAndReboot( make_targets: make_targets, execute_cmd: execute_cmd, build_install: "true") } } stage('Fp32 Hip gfx90a') { @@ -830,7 +837,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot(make_targets: Smoke_targets, setup_flags: 
"-DMIOPEN_USE_SQLITE_PERF_DB=On") + buildHipClangJobAndReboot(make_targets: Smoke_targets, setup_flags: "-DMIOPEN_USE_SQLITE_PERF_DB=On", build_install: "true") } } } @@ -850,7 +857,7 @@ pipeline { } agent{ label rocmnode("vega20") } steps{ - buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Bf16 Hip Vega20') { @@ -863,7 +870,7 @@ pipeline { } agent{ label rocmnode("vega20") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Fp16 Hip gfx908') { @@ -876,7 +883,7 @@ pipeline { } agent{ label rocmnode("gfx908") } steps{ - buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Bf16 Hip gfx908') { @@ -889,7 +896,7 @@ pipeline { } agent{ label rocmnode("gfx908") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Fp16 Hip gfx90a') { @@ -902,7 +909,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Bf16 Hip gfx90a') { @@ -915,7 +922,7 @@ pipeline { } agent{ label rocmnode("gfx90a") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, build_install: "true") } } stage('Fp16 Hip gfx94X') { @@ -928,7 +935,7 @@ pipeline { } agent{ label rocmnode("gfx94X") } steps{ 
- buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, needs_reboot:false) + buildHipClangJobAndReboot( setup_flags: Fp16_flags, make_targets: Smoke_targets, needs_reboot:false, build_install: "true") } } stage('Bf16 Hip gfx94X') { @@ -941,7 +948,7 @@ pipeline { } agent{ label rocmnode("gfx94X") } steps{ - buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, needs_reboot:false) + buildHipClangJobAndReboot(setup_flags: Bf16_flags, make_targets: Smoke_targets, needs_reboot:false, build_install: "true") } } } diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 2574dbbf5e..34a045bca8 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -36,3 +36,4 @@ The MIOpen API library is structured as follows: * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental) + * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in index bb4f98b6a5..72eac33929 100644 --- a/docs/sphinx/requirements.in +++ b/docs/sphinx/requirements.in @@ -1 +1 @@ -rocm-docs-core[api_reference]==1.6.2 +rocm-docs-core[api_reference]==1.7.1 diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index d3060ed33b..f8ad206428 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -92,7 +92,7 @@ requests==2.32.2 # via # pygithub # sphinx -rocm-docs-core[api-reference]==1.6.2 +rocm-docs-core[api-reference]==1.7.1 # via -r requirements.in smmap==5.0.1 # via gitdb diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 1e841ee5e5..2ba219dd5e 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -49,6 +49,7 @@ add_executable(MIOpenDriver dm_lrn.cpp dm_multimarginloss.cpp dm_pool.cpp + 
dm_prelu.cpp dm_reduce.cpp dm_reduceextreme.cpp dm_reducecalculation.cpp diff --git a/driver/dm_prelu.cpp b/driver/dm_prelu.cpp new file mode 100644 index 0000000000..06ec99e8a5 --- /dev/null +++ b/driver/dm_prelu.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "prelu_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "prelu") + return new PReLUDriver(); + if(base_arg == "prelufp16") + return new PReLUDriver(); + if(base_arg == "prelubfp16") + return new PReLUDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index 8d7cbe758a..b4cca0356a 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -176,6 +176,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " + "prelu[bfp16|fp16], " "multimarginloss[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -208,8 +209,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && - arg != "ropefp16" && arg != "ropebfp16" && arg != "multimarginloss" && - arg != "multimarginlossfp16" && arg != "multimarginlossbfp16" && arg != "--version") + arg != "ropefp16" && arg != "ropebfp16" && arg != "prelu" && arg != "prelufp16" && + arg != "prelubfp16" && arg != "multimarginloss" && arg != "multimarginlossfp16" && + arg != "multimarginlossbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloPReLUHost.hpp b/driver/mloPReLUHost.hpp new file mode 100644 index 0000000000..c3460b6425 --- /dev/null +++ b/driver/mloPReLUHost.hpp @@ -0,0 +1,104 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <../test/ford.hpp> + +#include +#include +#include + +template +int32_t mloPReLUBackwardRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + const miopenTensorDescriptor_t doutputDesc, + const miopenTensorDescriptor_t dinputDesc, + const Tgpu* input, + const Tgpu* weight, + const Tgpu* doutput, + Tcheck* dinput_host, + Tcheck* dweight_host) +{ + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto doutput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(doutputDesc)); + auto dinput_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(dinputDesc)); + + auto input_sz = miopen::deref(inputDesc).GetElementSize(); + auto weight_sz = miopen::deref(weightDesc).GetElementSize(); + auto weight_grad_collector = std::vector(input_sz); + + par_ford(input_sz)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = static_cast(doutput[doutput_tv.get_tensor_view_idx(tensor_layout)]); + + if(dinput_host) + { + float weight_v; + if(weight_sz == 1) + weight_v = static_cast(weight[0]); + else + weight_v = static_cast(weight[tensor_layout.layout[1]]); + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + dinput_host[dinput_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(dweight_host) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(dweight_host) + { + if(weight_sz == 1) + { + double sum = 0; + for(int i = 0; i < input_sz; ++i) + sum += static_cast(weight_grad_collector[i]); + dweight_host[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight_sz)([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + dweight_host[i] = static_cast(sum); + }); + } + } + + return miopenStatusSuccess; +} diff --git a/driver/prelu_driver.hpp b/driver/prelu_driver.hpp new file mode 100644 index 0000000000..e304f27465 --- /dev/null +++ b/driver/prelu_driver.hpp @@ -0,0 +1,394 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "driver.hpp" +#include "mloPReLUHost.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/ford.hpp> +#include <../test/verify.hpp> + +#include + +template +class PReLUDriver : public Driver +{ +public: + PReLUDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&weightDesc); + miopenCreateTensorDescriptor(&doutputDesc); + miopenCreateTensorDescriptor(&dinputDesc); + miopenCreateTensorDescriptor(&dweightDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~PReLUDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(weightDesc); + miopenDestroyTensorDescriptor(doutputDesc); + miopenDestroyTensorDescriptor(dinputDesc); + miopenDestroyTensorDescriptor(dweightDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t weightDesc; + miopenTensorDescriptor_t doutputDesc; + miopenTensorDescriptor_t dinputDesc; + miopenTensorDescriptor_t dweightDesc; + + std::unique_ptr input_dev; + std::unique_ptr weight_dev; + std::unique_ptr doutput_dev; + 
std::unique_ptr dinput_dev; + std::unique_ptr dweight_dev; + std::unique_ptr workspace_dev; + + std::vector input; + std::vector weight; + std::vector doutput; + std::vector dinput; + std::vector dweight; + + std::vector dinput_host; + std::vector dweight_host; + + size_t ws_sizeInBytes; +}; + +template +int PReLUDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + if(input_length.empty()) + { + std::cout << "Tensor must not be empty"; + return miopenStatusBadParm; + } + + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + if(weight_length[0] != 1 && (input_length.size() == 1 || weight_length[0] != input_length[1])) + { + std::cout << "NumParameters must be 1 or the second dim of DimLengths"; + return miopenStatusBadParm; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::GetandSetData() +{ + auto inTensorParam = inflags.GetValueTensor("input"); + auto input_length = inTensorParam.lengths; + std::vector weight_length = {inflags.GetValueInt("NumParameters")}; + + if(SetTensorNd(inputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input") + "."); + if(SetTensorNd(dinputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input gradient tensor"); + + if(SetTensorNd(weightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight tensor"); + if(SetTensorNd(dweightDesc, weight_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing weight gradient tensor"); + + if(SetTensorNd(doutputDesc, input_length, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output gradient tensor"); + + return miopenStatusSuccess; +} + +template +int 
PReLUDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward PReLU (Default=1)", "int"); + inflags.AddTensorFlag("input", 'D', "256x4x1x1x8723", "input tensor descriptor"); + inflags.AddInputFlag( + "NumParameters", + 'P', + "1", + "Number of weight to learn. Although it takes an int as input, there is only two values " + "are legitimate: 1, or the number of channels (the second dim) at input (Default=1)", + "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int PReLUDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t weight_sz = GetTensorSize(weightDesc); + + miopenGetPReLUBackwardWorkspaceSize(GetHandle(), inputDesc, weightDesc, &ws_sizeInBytes); + + if(ws_sizeInBytes == static_cast(-1)) + return miopenStatusAllocFailed; + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + weight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + doutput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + dweight_dev = std::unique_ptr(new GPUMem(ctx, weight_sz, sizeof(Tgpu))); + workspace_dev = std::unique_ptr(new GPUMem(ctx, ws_sizeInBytes, sizeof(std::byte))); + + input = std::vector(input_sz); + weight = std::vector(weight_sz); + doutput = std::vector(input_sz, static_cast(1.0f)); + dinput = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + dweight = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + dinput_host = std::vector(input_sz, std::numeric_limits::quiet_NaN()); + 
dweight_host = std::vector(weight_sz, std::numeric_limits::quiet_NaN()); + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + for(int i = 0; i < weight_sz; i++) + weight[i] = prng::gen_A_to_B(static_cast(-1e-5), static_cast(1e-6)); + + if(input_dev->ToGPU(GetStream(), input.data()) != 0) + { + std::cerr << "Error copying (input) to GPU, size: " << input_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(weight_dev->ToGPU(GetStream(), weight.data()) != 0) + { + std::cerr << "Error copying (weight) to GPU, size: " << weight_dev->GetSize() << std::endl; + return miopenStatusAllocFailed; + } + + if(doutput_dev->ToGPU(GetStream(), doutput.data()) != 0) + { + std::cerr << "Error copying (out grad) to GPU, size: " << doutput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dinput_dev->ToGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (input grad) to GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + if(dweight_dev->ToGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (weight grad) to GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusAllocFailed; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunForwardGPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunForwardCPU() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenPReLUBackward(GetHandle(), + workspace_dev->GetMem(), + ws_sizeInBytes, + inputDesc, + input_dev->GetMem(), + weightDesc, + weight_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + dinputDesc, + dinput_dev->GetMem(), + dweightDesc, + dweight_dev->GetMem()); + + float time = 0.0; + 
miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Backward PReLU Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Backward PReLU Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + { + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + if(dweight_dev->FromGPU(GetStream(), dweight.data()) != 0) + { + std::cerr << "Error copying (dweight_dev) from GPU, size: " << dweight_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } + + return miopenStatusSuccess; +} + +template +int PReLUDriver::RunBackwardCPU() +{ + return mloPReLUBackwardRunHost(inputDesc, + weightDesc, + doutputDesc, + dinputDesc, + input.data(), + weight.data(), + doutput.data(), + dinput_host.data(), + dweight_host.data()); +} + +template +Tref PReLUDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int PReLUDriver::VerifyForward() +{ + return miopenStatusNotImplemented; +} + +template +int PReLUDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + auto error_dweight = miopen::rms_range(dweight_host, dweight); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward PReLU Input Gradient FAILED: " << error_dinput << " > " << tolerance + << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Input Gradient Verifies OK on CPU reference (" << error_dinput + << " < " << tolerance << ')' << std::endl; + } + + if(!std::isfinite(error_dweight) || error_dweight > tolerance) + { + std::cout << "Backward PReLU Weight Gradient FAILED: " << error_dweight << " > " + << tolerance << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward PReLU Weight Gradient Verifies OK on CPU reference (" + << error_dweight << " < " << tolerance << ')' << std::endl; + } + + return miopenStatusSuccess; +} diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 75fe5df342..d5c9c7cea8 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -71,6 +71,7 @@ * @defgroup getitem * @defgroup ReduceCalculation * @defgroup RotaryPositionalEmbeddings + * @defgroup ReLU * */ @@ -7678,6 +7679,61 @@ MIOPEN_EXPORT miopenStatus_t miopenRoPEBackward(miopenHandle_t handle, #endif // MIOPEN_BETA_API #ifdef MIOPEN_BETA_API +/** @addtogroup ReLU + * + * @{ + */ + +/*! 
@brief Helper function to query the minimum workspace size required by the PReLU backward call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes); + +/*! @brief Execute a PReLU backward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param weight Data tensor weight (input) + * @param doutputDesc Tensor descriptor for output gradient (input) + * @param doutput Gradient of output (input) + * @param dinputDesc Tensor descriptor for input gradient (input) + * @param dinput Gradient of input (output) + * @param dweightDesc Tensor descriptor for weight gradient (input) + * @param dweight Gradient of weight (output) + */ +MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t weightDesc, + const void* weight, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dweightDesc, + void* dweight); + +/** @} */ +// CLOSEOUT RELU DOXYGEN GROUP +#endif // MIOPEN_BETA_API + +#ifdef MIOPEN_BETA_API + /*! 
@ingroup LossFunction * @enum miopenLossReductionMode_t * Reduction mode for loss function @@ -7769,7 +7825,7 @@ MIOPEN_EXPORT miopenStatus_t miopenMultiMarginLossForward(miopenHandle_t handle, /** @} */ // CLOSEOUT LossFunction DOXYGEN GROUP -#endif +#endif // MIOPEN_BETA_API #ifdef __cplusplus } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9dd357540d..32da2a6229 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -165,6 +165,8 @@ set( MIOpen_Source performance_config.cpp pooling/problem_description.cpp pooling_api.cpp + prelu/problem_description.cpp + prelu_api.cpp problem.cpp process.cpp ramdb.cpp @@ -306,6 +308,9 @@ set( MIOpen_Source solver/pooling/forwardNd.cpp solver/pooling/backward2d.cpp solver/pooling/backwardNd.cpp + solver/prelu/backward_prelu_multi_weights.cpp + solver/prelu/backward_prelu_single_weight.cpp + solver/prelu/utils.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_argmin.cpp solver/reduce/forward_max.cpp @@ -515,6 +520,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenLRNFwd.cl kernels/MIOpenMultiMarginLoss.cpp kernels/MIOpenNeuron.cl + kernels/MIOpenPReLU.cpp kernels/MIOpenPooling.cl kernels/MIOpenPoolingBwd.cl kernels/MIOpenPoolingBwdND.cl @@ -670,6 +676,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN t5layernorm.cpp ocl/fusionopconvocl.cpp ocl/fusionopbiasbnactivocl.cpp + prelu.cpp reducecalculation.cpp reduceextreme.cpp rope.cpp diff --git a/src/comgr.cpp b/src/comgr.cpp index aa53b71bb5..b508bced2f 100644 --- a/src/comgr.cpp +++ b/src/comgr.cpp @@ -701,6 +701,7 @@ void BuildAsm(const std::string& name, #define WORKAROUND_ISSUE_HIPRTC_HIPRTC_HEADER_H 1 // See SWDEV-307838, issue #1648. 
#define WORKAROUND_ISSUE_1674 (HIP_PACKAGE_VERSION_FLAT >= 5003022305ULL) +#define WORKAROUND_ISSUE_3188 (HIP_PACKAGE_VERSION_FLAT >= 6002041133ULL) // See WORKAROUND_SWDEV_413293 in ./CmakeLists.txt #define WORKAROUND_SWDEV_413293 MIOPEN_HIP_COMPILER_HAS_OPTION_OFFLOAD_UNIFORM_BLOCK @@ -975,6 +976,9 @@ void BuildHip(const std::string& name, #endif #if WORKAROUND_ISSUE_1674 opts.push_back("-Wno-gnu-line-marker"); +#endif +#if WORKAROUND_ISSUE_3188 + opts.push_back("-Wno-pass-failed"); #endif opts.push_back("-Wno-cuda-compat"); opts.push_back("-fno-gpu-rdc"); diff --git a/src/include/miopen/graphapi/util.hpp b/src/include/miopen/graphapi/util.hpp index f9415a3630..02af1c9131 100644 --- a/src/include/miopen/graphapi/util.hpp +++ b/src/include/miopen/graphapi/util.hpp @@ -36,13 +36,12 @@ namespace miopen { namespace graphapi { -inline std::string tensorIdAsStr(const Tensor* tens_ptr) +inline std::string tensorIdAsStr(int64_t tens_id) { - int64_t id = tens_ptr->getId(); - char* b = reinterpret_cast(&id); + char* b = reinterpret_cast(&tens_id); - return {b, sizeof(id)}; + return {b, sizeof(tens_id)}; } template diff --git a/src/include/miopen/prelu.hpp b/src/include/miopen/prelu.hpp new file mode 100644 index 0000000000..35e5e7f611 --- /dev/null +++ b/src/include/miopen/prelu.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight); + +} // namespace miopen diff --git a/src/include/miopen/prelu/invoke_params.hpp b/src/include/miopen/prelu/invoke_params.hpp new file mode 100644 index 0000000000..1acc2dc143 --- /dev/null +++ b/src/include/miopen/prelu/invoke_params.hpp @@ -0,0 +1,59 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace prelu { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* weightDesc = nullptr; + const TensorDescriptor* doutputDesc = nullptr; + const TensorDescriptor* dinputDesc = nullptr; + const TensorDescriptor* dweightDesc = nullptr; + + ConstData_t input = nullptr; + ConstData_t weight = nullptr; + ConstData_t doutput = nullptr; + Data_t dinput = nullptr; + Data_t dweight = nullptr; + Data_t workspace = nullptr; + std::size_t workspace_size = 0; + + std::size_t GetWorkspaceSize() const { return workspace_size; } + Data_t GetWorkspace() const { return workspace; } +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/problem_description.hpp b/src/include/miopen/prelu/problem_description.hpp new file mode 100644 index 0000000000..ce10d96046 --- /dev/null +++ b/src/include/miopen/prelu/problem_description.hpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y); +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +struct BackwardProblemDescription : ProblemDescriptionBase +{ + + BackwardProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& doutputDesc_, + const TensorDescriptor& dinputDesc_, + const TensorDescriptor& dweightDesc_) + : inputDesc(inputDesc_), + weightDesc(weightDesc_), + doutputDesc(doutputDesc_), + dinputDesc(dinputDesc_), + dweightDesc(dweightDesc_) + { + IsSameType(); + IsRightLength(); + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetWeightDesc() const { return weightDesc; } + const TensorDescriptor& GetdOuputDesc() const { return doutputDesc; } + const TensorDescriptor& GetdInputDesc() const { return dinputDesc; } + const TensorDescriptor& GetdWeightDesc() const { return dweightDesc; } + + bool IsSameType() const + { + if(!checkSameType(inputDesc, weightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Input and Weight tensor must have same type."); + if(!checkSameType(inputDesc, dinputDesc) || !checkSameType(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Gradient tensors (excluding Output gradient) must share a same " + "type with Input and Weight tensor."); + return true; + } + + bool IsRightLength() const + { + if(!checkSameLength(inputDesc, doutputDesc) || !checkSameLength(inputDesc, dinputDesc)) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Input and Output Gradient tensors sizes must match with Input tensor."); + if(weightDesc.GetNumDims() != 1) + MIOPEN_THROW(miopenStatusBadParm, "PReLU: Weight tensor must have 1 dimension."); + 
if(weightDesc.GetElementSize() != 1 && + (inputDesc.GetNumDims() == 1 || + weightDesc.GetElementSize() != inputDesc.GetLengths()[1])) + MIOPEN_THROW( + miopenStatusBadParm, + "PReLU: Weight size must be 1 or equal to the second dim of Input tensor."); + if(!checkSameLength(weightDesc, dweightDesc)) + MIOPEN_THROW(miopenStatusBadParm, + "PReLU: Weight Gradient tensors sizes must match with Weight tensor."); + return true; + } + + bool IsSingleWeight() const + { + if(weightDesc.GetElementSize() > 1) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor inputDesc; + TensorDescriptor weightDesc; + TensorDescriptor doutputDesc; + TensorDescriptor dinputDesc; + TensorDescriptor dweightDesc; +}; + +} // namespace prelu + +} // namespace miopen diff --git a/src/include/miopen/prelu/solvers.hpp b/src/include/miopen/prelu/solvers.hpp new file mode 100644 index 0000000000..a2bec477f0 --- /dev/null +++ b/src/include/miopen/prelu/solvers.hpp @@ -0,0 +1,77 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +using BackwardSolverBase = + NonTunableSolverBase; + +struct Backward : BackwardSolverBase +{ + bool MayNeedWorkspace() const override { return true; } +}; + +struct SingleWeightBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +struct MultiWeightsBackward final : Backward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const override; +}; + +} // namespace prelu + +} // namespace solver + +} // namespace miopen 
diff --git a/src/include/miopen/prelu/utils.hpp b/src/include/miopen/prelu/utils.hpp new file mode 100644 index 0000000000..b7551a4135 --- /dev/null +++ b/src/include/miopen/prelu/utils.hpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace miopen { +namespace solver { +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params); + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 485b7a24d8..fdfb857319 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,6 +60,7 @@ enum class Primitive Adam, Item, RoPE, + ReLU, MultiMarginLoss }; diff --git a/src/kernels/MIOpenPReLU.cpp b/src/kernels/MIOpenPReLU.cpp new file mode 100644 index 0000000000..750b226e7f --- /dev/null +++ b/src/kernels/MIOpenPReLU.cpp @@ -0,0 +1,115 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void PReLUBackward(const TI* __restrict__ input, + const TI* __restrict__ weight, + const TO* __restrict__ output_grad, + TI* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + uint64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + if(gid >= N) + return; + + auto tensor_layout = tensor_layout_t(input_tv, gid); + FLOAT_ACCUM input_v = CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx(tensor_layout)]); + FLOAT_ACCUM grad_v = + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(input_grad != nullptr) + { + FLOAT_ACCUM weight_v = CVT_FLOAT2ACCUM( + weight[SingleWeight ? 0 : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + FLOAT_ACCUM input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + CVT_ACCUM2FLOAT(input_grad_v); + } + if(weight_grad_collector != nullptr) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } +} + +extern "C" __global__ void PReLUSWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} + +extern "C" __global__ void PReLUMWBackward(const INPUT_TYPE* __restrict__ input, + const INPUT_TYPE* __restrict__ weight, + const OUTPUT_TYPE* __restrict__ output_grad, + INPUT_TYPE* __restrict__ input_grad, + FLOAT_ACCUM* __restrict__ weight_grad_collector, + uint64_t N, + tensor_view_t input_tv, + tensor_view_t<1> weight_tv, + tensor_view_t output_grad_tv, + tensor_view_t input_grad_tv) +{ + // instantiate the kernel + PReLUBackward(input, + weight, + output_grad, + input_grad, + weight_grad_collector, + N, + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d7294d8992..8cdab4b942 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -77,6 +79,15 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(std::initializer_list layout_) + { + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } + } + uint64_t layout[N]; }; diff --git a/src/prelu.cpp b/src/prelu.cpp new file mode 100644 index 0000000000..3966c1f45f --- /dev/null +++ b/src/prelu.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +namespace miopen { + +size_t GetPReLUBackwardWorkspaceSize(Handle& handle, + const TensorDescriptor& inputDesc, + const TensorDescriptor& weightDesc) +{ + auto ctx = ExecutionContext{&handle}; + const auto problem = + prelu::BackwardProblemDescription{inputDesc, weightDesc, inputDesc, inputDesc, weightDesc}; + + const auto solvers = solver::SolverContainer{}; + + auto pair_size_vector = solvers.GetWorkspaceSizes(ctx, problem); + + return pair_size_vector.empty() ? 
static_cast(-1) : pair_size_vector.front().second; +} + +miopenStatus_t PReLUBackward(Handle& handle, + Data_t workspace, + size_t workspaceSizeInBytes, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& weightDesc, + ConstData_t weight, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& dweightDesc, + Data_t dweight) +{ + const auto problem = prelu::BackwardProblemDescription{ + inputDesc, weightDesc, doutputDesc, inputDesc, dweightDesc}; + + const auto invoke_params = [&]() { + auto tmp = prelu::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.weightDesc = &weightDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinputDesc = &dinputDesc; + tmp.dweightDesc = &dweightDesc; + tmp.input = input; + tmp.weight = weight; + tmp.doutput = doutput; + tmp.dinput = dinput; + tmp.dweight = dweight; + tmp.workspace = workspace; + tmp.workspace_size = workspaceSizeInBytes; + return tmp; + }(); + + const auto algo = AlgorithmName{"PReLUBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/prelu/problem_description.cpp b/src/prelu/problem_description.cpp new file mode 100644 index 0000000000..f364c60d19 --- /dev/null +++ b/src/prelu/problem_description.cpp @@ -0,0 +1,76 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace prelu { + +bool checkSameType(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetType() != y.GetType()) + return false; + return true; +} + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y) +{ + if(x.GetNumDims() != y.GetNumDims()) + return false; + for(int32_t i = 0; i < x.GetNumDims(); ++i) + { + if(x.GetLengths()[i] != y.GetLengths()[i]) + return false; + } + return true; +} + +NetworkConfig BackwardProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto num_params = weightDesc.GetElementSize(); + + std::ostringstream ss; + + ss << "prelu_bwd"; + ss << "idtype" << input_dtype; + ss << "odtype" << output_dtype; + ss << "ndim" << inputDesc.GetNumDims(); + ss << "size" << size; + ss << "num_params" << num_params; + + return NetworkConfig{ss.str()}; +} + +} // namespace prelu + +} // namespace miopen diff --git a/src/prelu_api.cpp b/src/prelu_api.cpp new file mode 100644 index 0000000000..5e670a0a2b --- /dev/null +++ b/src/prelu_api.cpp @@ -0,0 +1,133 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdPReLU(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(inputDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "prelufp16"; + } + else if(dtype == miopenFloat) + { + ss << "prelufp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "prelubfp16"; + } + + MIOPEN_LOG_FUNCTION(inputDesc, weightDesc); + ss << " --input " << miopen::deref(inputDesc).GetLengths(); + ss << " --weight " << miopen::deref(weightDesc).GetLengths(); + ss << " -F " << ((is_fwd) ? "1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes) +{ + + MIOPEN_LOG_FUNCTION(handle, inputDesc, weightDesc, sizeInBytes); + + return miopen::try_([&] { + miopen::deref(sizeInBytes) = miopen::GetPReLUBackwardWorkspaceSize( + miopen::deref(handle), miopen::deref(inputDesc), miopen::deref(weightDesc)); + }); +} + +extern "C" miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + const size_t workspaceSizeInBytes, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t weightDesc, + const void* weight, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t dweightDesc, + void* dweight) +{ + MIOPEN_LOG_FUNCTION(handle, + workspace, + workspaceSizeInBytes, + 
inputDesc, + input, + weightDesc, + weight, + doutputDesc, + doutput, + dinputDesc, + dinput, + dweightDesc, + dweight); + + LogCmdPReLU(inputDesc, weightDesc, false); + return miopen::try_([&] { + miopen::PReLUBackward(miopen::deref(handle), + DataCast(workspace), + workspaceSizeInBytes, + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(weightDesc), + DataCast(weight), + miopen::deref(doutputDesc), + DataCast(doutput), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(dweightDesc), + DataCast(dweight)); + }); +} diff --git a/src/solver.cpp b/src/solver.cpp index 16d7ed92e4..11ada34965 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -677,6 +678,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::ReLU, prelu::SingleWeightBackward{}.SolverDbId()); + Register(registry, ++id, Primitive::MultiMarginLoss, diff --git a/src/solver/prelu/backward_prelu_multi_weights.cpp b/src/solver/prelu/backward_prelu_multi_weights.cpp new file mode 100644 index 0000000000..5fed375a2b --- /dev/null +++ b/src/solver/prelu/backward_prelu_multi_weights.cpp @@ -0,0 +1,183 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define warpSizeCTX (context.GetStream().GetWavefrontWidth()) +#define LOCAL_SIZE_MW_BWD 256 +#define LOCAL_SIZE_MW_REDUCE_BWD warpSizeCTX + +namespace miopen { + +namespace solver { + +namespace prelu { + +bool MultiWeightsBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +MultiWeightsBackward::GetSolution(const ExecutionContext& context, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_MW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUMWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. 
*/ + { + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_MW_REDUCE_BWD}, + }; + result.construction_params.push_back( + make_hip_kernel({LOCAL_SIZE_MW_REDUCE_BWD}, + {problem.GetdWeightDesc().GetElementSize() * LOCAL_SIZE_MW_REDUCE_BWD}, + "MIOpenReduceSum.cpp", + "Reduce1dSum", + build_params)); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. */ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + params.workspace, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. 
*/ + { + uint64_t output_numel = deref(params.weightDesc).GetElementSize(); + uint64_t outer_size = deref(params.inputDesc).GetLengths()[0]; + uint64_t inner_size = + deref(params.inputDesc).GetElementSize() / outer_size / output_numel; + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.workspace, + params.dweight, + output_numel, + inner_size, + outer_size, + weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t MultiWeightsBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto size = problem.GetdInputDesc().GetElementSize(); + size *= get_data_size(miopenFloat); + return size; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/backward_prelu_single_weight.cpp b/src/solver/prelu/backward_prelu_single_weight.cpp new file mode 100644 index 0000000000..2f8ba825f4 --- /dev/null +++ b/src/solver/prelu/backward_prelu_single_weight.cpp @@ -0,0 +1,202 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#define VIEW_DIMS 5 + +#define LOCAL_SIZE_SW_BWD 256 +#define LOCAL_SIZE_SW_REDUCE_BWD 256 + +namespace miopen { + +namespace solver { + +namespace prelu { + +namespace { +MultiBufferWorkspaceTraits GetMultiBufferWorkspaceTraits(const TensorDescriptor& inputDesc) +{ + auto size = inputDesc.GetElementSize(); + return MultiBufferWorkspaceTraits{size * get_data_size(miopenFloat), + AlignUp(size, LOCAL_SIZE_SW_REDUCE_BWD) / + LOCAL_SIZE_SW_REDUCE_BWD * get_data_size(miopenFloat)}; +} +} // namespace + +bool SingleWeightBackward::IsApplicable( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + if(problem.GetdInputDesc().GetVectorLength() > VIEW_DIMS) + return false; + if(!problem.IsSingleWeight()) + return false; + return true; +} + +ConvSolution +SingleWeightBackward::GetSolution(const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetdInputDesc().GetType(); + auto input_dtype = miopen::GetDataType(problem.GetdInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetdOuputDesc().GetType()); + + /* Phase 1: Calc gradient for each elements. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"VIEW_DIMS", VIEW_DIMS}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + }; + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_BWD}, {size}, "MIOpenPReLU.cpp", "PReLUSWBackward", build_params)); + } + + /* Phase 2: Reduce gradient. */ + { + auto size = problem.GetdInputDesc().GetElementSize(); + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"OUTPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"REDUCE_SIZE", LOCAL_SIZE_SW_REDUCE_BWD}, + }; + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + result.construction_params.push_back(make_hip_kernel({LOCAL_SIZE_SW_REDUCE_BWD}, + {size}, + "MIOpenReduceSum.cpp", + "ReduceSumFLOATACCUM", + build_params)); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + } + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_SW_REDUCE_BWD}, {size}, "MIOpenReduceSum.cpp", "ReduceSum", build_params)); + } + + auto getBuffPart = [ws = GetMultiBufferWorkspaceTraits(problem.GetdInputDesc())]( + void* buffer, size_t part_idx) { + return static_cast(static_cast(buffer) + ws.GetOffset(part_idx)); + }; + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + HipEventPtr start, stop; + bool profiling = handle_.IsProfilingEnabled(); + if(profiling) + { + handle_.EnableProfiling(false); + hipStreamSynchronize(handle_.GetStream()); + start = miopen::make_hip_event(); + stop = miopen::make_hip_event(); + hipEventRecord(start.get(), handle_.GetStream()); + } + + auto work_a = getBuffPart(params.GetWorkspace(), 0); + auto work_b = getBuffPart(params.GetWorkspace(), 1); + + int kernelCnt = 0; + + /* Phase 1: Calc gradient for each elements. 
*/ + { + auto input_tv = get_inner_expanded_tv(deref(params.inputDesc)); + auto weight_tv = get_inner_expanded_tv<1>(deref(params.weightDesc)); + auto output_grad_tv = get_inner_expanded_tv(deref(params.doutputDesc)); + auto input_grad_tv = get_inner_expanded_tv(deref(params.dinputDesc)); + decltype(auto) kernel = handle_.Run(kernels[kernelCnt++]); + kernel(params.input, + params.weight, + params.doutput, + params.dinput, + work_a, + static_cast(deref(params.inputDesc).GetElementSize()), + input_tv, + weight_tv, + output_grad_tv, + input_grad_tv); + } + + /* Phase 2: Reduce gradient. */ + { + uint64_t size = deref(params.inputDesc).GetElementSize(); + while(size > LOCAL_SIZE_SW_REDUCE_BWD) + { + auto kernel = handle_.Run(kernels[kernelCnt++]); + kernel(work_a, work_b, size); + size = (size + LOCAL_SIZE_SW_REDUCE_BWD - 1) / LOCAL_SIZE_SW_REDUCE_BWD; + std::swap(work_a, work_b); + } + auto weight_grad_tv = get_inner_expanded_tv<1>(deref(params.dweightDesc)); + handle_.Run(kernels[kernelCnt++])(work_a, params.dweight, size, weight_grad_tv); + } + + if(profiling) + { + float elapsed = 0.0f; + hipEventRecord(stop.get(), handle_.GetStream()); + handle_.EnableProfiling(true); + hipEventSynchronize(stop.get()); + hipEventElapsedTime(&elapsed, start.get(), stop.get()); + // Clean up + hipEventDestroy(start.get()); + hipEventDestroy(stop.get()); + handle_.ResetKernelTime(); + handle_.AccumKernelTime(elapsed); + }; + }; + }; + + return result; +} + +std::size_t SingleWeightBackward::GetWorkspaceSize( + const ExecutionContext& /*context*/, + const miopen::prelu::BackwardProblemDescription& problem) const +{ + return GetMultiBufferWorkspaceTraits(problem.GetdInputDesc()).GetSize(); +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/src/solver/prelu/utils.cpp b/src/solver/prelu/utils.cpp new file mode 100644 index 0000000000..d67e0bdac5 --- /dev/null +++ b/src/solver/prelu/utils.cpp @@ -0,0 +1,53 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +namespace miopen { + +namespace solver { + +namespace prelu { + +KernelInfo make_hip_kernel(std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) +{ + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +} + +} // namespace prelu +} // namespace solver +} // namespace miopen diff --git a/test/cpu_prelu.hpp b/test/cpu_prelu.hpp new file mode 100644 index 0000000000..952099c3f6 --- /dev/null +++ b/test/cpu_prelu.hpp @@ -0,0 +1,100 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "tensor_holder.hpp" +#include + +template +void cpu_prelu_backward(const tensor input, + const tensor weight, + const tensor output_grad, + tensor& ref_input_grad, + tensor& ref_weight_grad, + const bool has_dinput = true, + const bool has_dweight = true) +{ + auto N = input.desc.GetElementSize(); + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto weight_tv = miopen::get_inner_expanded_tv<1>(weight.desc); + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(ref_input_grad.desc); + auto weight_grad_tv = miopen::get_inner_expanded_tv<1>(ref_weight_grad.desc); + + auto weight_grad_collector = std::vector(N); + + par_ford(N)([&](int gid) { + auto tensor_layout = tensor_layout_t<5>(input_tv, gid); + float input_v = static_cast(input[input_tv.get_tensor_view_idx(tensor_layout)]); + float grad_v = + static_cast(output_grad[output_grad_tv.get_tensor_view_idx(tensor_layout)]); + + if(has_dinput) + { + float weight_v; + weight_v = static_cast( + weight[weight.desc.GetElementSize() == 1 + ? 0 + : weight_tv.get_tensor_view_idx({tensor_layout.layout[1]})]); + + float input_grad_v = input_v > 0 ? grad_v : weight_v * grad_v; + ref_input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout)] = + static_cast(input_grad_v); + } + if(has_dweight) + { + weight_grad_collector[gid] = input_v > 0 ? 
0 : input_v * grad_v; + } + }); + + if(has_dweight) + { + if(weight.desc.GetElementSize() == 1) + { + double sum = 0; + for(int i = 0; i < N; ++i) + sum += static_cast(weight_grad_collector[i]); + ref_weight_grad[0] = static_cast(sum); + } + else + { + size_t inner_size = std::accumulate( + &input_tv.size[2], &input_tv.size[4], 1ul, std::multiplies()); + size_t outer_size = inner_size * input_tv.size[1]; + par_ford(weight.desc.GetElementSize())([&](int i) { + double sum = 0; + ford(input_tv.size[0])([&](int j) { + ford(inner_size)([&](int k) { + sum += static_cast( + weight_grad_collector[j * outer_size + i * inner_size + k]); + }); + }); + ref_weight_grad[weight_grad_tv.get_tensor_view_idx({i})] = static_cast(sum); + }); + } + } +} diff --git a/test/gtest/graphapi_mha_bwd.cpp b/test/gtest/graphapi_mha_bwd.cpp index 35bf97f3d0..13d78cd424 100644 --- a/test/gtest/graphapi_mha_bwd.cpp +++ b/test/gtest/graphapi_mha_bwd.cpp @@ -39,6 +39,7 @@ class MhaBwdGraphTest : public MhaGraphTestBase mGraphBuilder = std::make_unique(); std::vector nhsd = {n, h, s, d}; + std::vector nhds = {n, h, d, s}; std::vector nhss = {n, h, s, s}; std::vector nhs1 = {n, h, s, 1}; std::vector all1s = {1, 1, 1, 1}; @@ -49,8 +50,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase MAKE_TENSOR_F(dO, nhsd, false); MAKE_TENSOR_F(O, nhsd, false); + MAKE_TENSOR_F(K_T, nhds, true); MAKE_TENSOR_F(T_MM_0, nhss, true); - addNode("OP_MATMUL", {Q, K}, {T_MM_0}); + + addNode("OP_RESHAPE", {K}, {K_T}); + addNode("OP_MATMUL", {Q, K_T}, {T_MM_0}); MAKE_TENSOR_F(T_SCL_0, nhss, true); // MAKE_TENSOR_F(ATN_SCL, all1s, false); @@ -101,9 +105,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase MAKE_TENSOR_F(SCL_S, all1s, false); addNode("OP_POINTWISE:MUL", {T_SCL_3, SCL_S}, {T_SCL_4}); - // NOTE(Amber): omitting the Reshape transpose node here + MAKE_TENSOR_F(SCL_4T, nhss, true); + addNode("OP_RESHAPE", {T_SCL_4}, {SCL_4T}); + MAKE_TENSOR_F(T_MM_1, nhsd, true); - addNode("OP_MATMUL", {T_SCL_4, dO}, {T_MM_1}); + 
addNode("OP_MATMUL", {SCL_4T, dO}, {T_MM_1}); MAKE_TENSOR_F(T_SCL_5, nhsd, true); MAKE_TENSOR_F(DSCL_S, all1s, false); @@ -122,8 +128,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase ////////////////// Center-top ////////////////////////////////// + MAKE_TENSOR_F(V_T, nhds, true); + addNode("OP_RESHAPE", {V}, {V_T}); + MAKE_TENSOR_F(T_MM_2, nhss, true); - addNode("OP_MATMUL", {dO, V}, {T_MM_2}); + addNode("OP_MATMUL", {dO, V_T}, {T_MM_2}); MAKE_TENSOR_F(T_SCL_7, nhss, true); addNode("OP_POINTWISE:MUL", {T_MM_2, DSCL_dO}, {T_SCL_7}); @@ -192,9 +201,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase addNode("OP_REDUCTION:MAX", {T_SCL_17}, {AMax_dQ}); ////////////////// Right Bottom ////////////////////////////////// - // XXX(Amber): Reshape transpose node goes here + MAKE_TENSOR_F(SCL_15T, nhss, true); + addNode("OP_RESHAPE", {T_SCL_15}, {SCL_15T}); + MAKE_TENSOR_F(T_MM_4, nhsd, true); - addNode("OP_MATMUL", {T_SCL_15, Q}, {T_MM_4}); + addNode("OP_MATMUL", {SCL_15T, Q}, {T_MM_4}); MAKE_TENSOR_F(T_SCL_18, nhsd, true); addNode("OP_POINTWISE:MUL", {T_MM_4, DSCL_dS}, {T_SCL_18}); @@ -295,6 +306,11 @@ class MhaBwdGraphTest : public MhaGraphTestBase // softmax needed for calling cpu backward mha mSoftMax = std::move(out.mSoftMax); + + // Remove "SCL_O" here so that it doesn't pollute the variant pack used for + // execution + size_t cnt = mFilledTensors.erase("SCL_O"); + ASSERT_EQ(cnt, 1); } void runCpuVerify(size_t n, size_t h, size_t s, size_t d) override @@ -350,16 +366,13 @@ class MhaBwdGraphTest : public MhaGraphTestBase checkTensor("dK", dK_ref); checkTensor("dV", dV_ref); } - - /// \todo remove once backward mha is ready to execute - void executeMhaGraph() override {} }; } // end namespace mha_graph_test // using namespace mha_graph_test; -TEST_P(MhaBwdGraphTest, MhaBwdGraph) { Run(); } +TEST_P(MhaBwdGraphTest, MhaBwdGraph) { Run(MhaDir::Bwd); } INSTANTIATE_TEST_SUITE_P(MhaGraphBwdSuite, MhaBwdGraphTest, diff --git a/test/gtest/graphapi_mha_cpp_common.hpp 
b/test/gtest/graphapi_mha_cpp_common.hpp index 5f35855bd4..ec385ab360 100644 --- a/test/gtest/graphapi_mha_cpp_common.hpp +++ b/test/gtest/graphapi_mha_cpp_common.hpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -271,6 +272,13 @@ class MhaGraphTestBase .setOutput(out_tensors[0]) .build())); } + else if(name == "OP_RESHAPE") + { + assert(in_tensors.size() == 1); + assert(out_tensors.size() == 1); + mGraphBuilder->addNode(mAlloc.allocate( + gr::OperationReshapeBuilder{}.setX(in_tensors[0]).setY(out_tensors[0]).build())); + } else { FAIL() << "Unknown graph node type: " << name; @@ -423,7 +431,13 @@ class MhaGraphTestBase } public: - void Run() + enum class MhaDir + { + Fwd, + Bwd + }; + + void Run(MhaDir direction) { auto [n, h, s, d, p] = GetParam(); std::cout << "n:" << n << ", h:" << h << ", s:" << s << ", d:" << d << ", p:" << p @@ -431,10 +445,17 @@ class MhaGraphTestBase mProbDropout = p; auto& handle = get_handle(); - if((p > 0.0f) && (s % handle.GetWavefrontWidth() != 0)) + if(direction == MhaDir::Fwd && (p > 0.0f) && (s % handle.GetWavefrontWidth() != 0)) { - GTEST_SKIP() << "CPU Dropout currently supprorts only fully occupied warps"; + GTEST_SKIP() + << "CPU Fwd pass with Dropout currently supports only fully occupied warps"; } + + if(direction == MhaDir::Bwd && p > 0.0f) + { + GTEST_SKIP() << "CPU backward pass with Dropout is not supported currently"; + } + createMhaGraph(n, h, s, d); initInputs(n, h, s, d); executeMhaGraph(); diff --git a/test/gtest/graphapi_mha_fwd_f8.cpp b/test/gtest/graphapi_mha_fwd.cpp similarity index 99% rename from test/gtest/graphapi_mha_fwd_f8.cpp rename to test/gtest/graphapi_mha_fwd.cpp index a1897feae9..d64477e557 100644 --- a/test/gtest/graphapi_mha_fwd_f8.cpp +++ b/test/gtest/graphapi_mha_fwd.cpp @@ -201,7 +201,7 @@ class GPU_MhaFwdGraph_FP32 : public MhaGraphTestBase using namespace mha_graph_test; -TEST_P(GPU_MhaFwdGraph_FP32, MhaFwdGraph) { Run(); } +TEST_P(GPU_MhaFwdGraph_FP32,
MhaFwdGraph) { Run(MhaDir::Fwd); } INSTANTIATE_TEST_SUITE_P(Unit, GPU_MhaFwdGraph_FP32, diff --git a/test/gtest/prelu.cpp b/test/gtest/prelu.cpp new file mode 100644 index 0000000000..1d0cd4d431 --- /dev/null +++ b/test/gtest/prelu.cpp @@ -0,0 +1,118 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "prelu.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace prelu { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +bool CheckFloatArg(std::string arg) +{ + if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && GetFloatArg() == arg)) + { + return true; + } + return false; +} + +struct GPU_PReLU_bwd_FP32 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_FP16 : PReLUTest +{ +}; + +struct GPU_PReLU_bwd_BFP16 : PReLUTest +{ +}; + +} // namespace prelu +using namespace prelu; + +TEST_P(GPU_PReLU_bwd_FP32, Test) +{ + if(CheckFloatArg("--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_FP16, Test) +{ + if(CheckFloatArg("--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_PReLU_bwd_BFP16, Test) +{ + if(CheckFloatArg("--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUSmokeTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUSmokeTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUFullTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_PReLU_bwd_BFP16, testing::ValuesIn(PReLUFullTestConfigs())); + +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP32, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_FP16, testing::ValuesIn(PReLUPerfTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Perf, GPU_PReLU_bwd_BFP16, 
testing::ValuesIn(PReLUPerfTestConfigs())); diff --git a/test/gtest/prelu.hpp b/test/gtest/prelu.hpp new file mode 100644 index 0000000000..510cfb85f4 --- /dev/null +++ b/test/gtest/prelu.hpp @@ -0,0 +1,247 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "cpu_prelu.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" + +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct PReLUTestCase +{ + std::vector lengths; + bool full_params; + bool contiguous; + + friend std::ostream& operator<<(std::ostream& os, const PReLUTestCase& tc) + { + return os << " Lengths:" << tc.lengths + << " Full_num_params:" << (tc.full_params ? "True" : "False") + << " Contiguous:" << (tc.contiguous ? "True" : "False"); + } +}; + +inline std::vector PReLUSmokeTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 4}, true, true}); + tcs.push_back({{64, 4}, true, false}); + tcs.push_back({{64, 4}, false, true}); + tcs.push_back({{64, 4}, false, false}); + + tcs.push_back({{64, 112}, true, true}); + tcs.push_back({{64, 112}, true, false}); + tcs.push_back({{64, 112}, false, true}); + tcs.push_back({{64, 112}, false, false}); + + return tcs; +} + +inline std::vector PReLUPerfTestConfigs() +{ + std::vector tcs; + + tcs.push_back({{64, 112, 50}, true, true}); + tcs.push_back({{64, 112, 50}, true, false}); + tcs.push_back({{64, 112, 50}, false, true}); + tcs.push_back({{64, 112, 50}, false, false}); + + return tcs; +} + +inline std::vector PReLUFullTestConfigs() +{ + std::vector tcs; + + auto smoke_test = PReLUSmokeTestConfigs(); + auto perf_test = PReLUPerfTestConfigs(); + + tcs.reserve(smoke_test.size() + perf_test.size()); + for(const auto& test : smoke_test) + tcs.push_back(test); + for(const auto& test : perf_test) + tcs.push_back(test); + + return tcs; +} + +inline std::vector GetStrides(std::vector lengths, bool contiguous) +{ + if(!contiguous) + std::swap(lengths.front(), lengths.back()); + 
std::vector strides(lengths.size()); + strides.back() = 1; + for(int i = lengths.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * lengths[i + 1]; + if(!contiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + +template +struct PReLUTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + prelu_config = GetParam(); + auto gen_value1 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 100); }; + auto gen_value2 = [](auto...) { return prng::gen_descreet_uniform_sign(1e-6, 99); }; + + auto lengths = prelu_config.lengths; + + auto input_strides = GetStrides(lengths, prelu_config.contiguous); + input = tensor{lengths, input_strides}.generate(gen_value1); + + std::vector weight_length = {1}; + if(prelu_config.full_params) + weight_length = {lengths[1]}; + std::vector weight_strides = {prelu_config.contiguous ? 1 : 2}; + weight = tensor{weight_length, weight_strides}.generate(gen_value2); + + ws_sizeInBytes = miopen::GetPReLUBackwardWorkspaceSize(handle, input.desc, weight.desc); + if(ws_sizeInBytes == static_cast(-1)) + GTEST_SKIP(); + + if(ws_sizeInBytes != 0) + { + std::vector workspace_dims; + workspace_dims.push_back(ws_sizeInBytes / sizeof(float)); + + workspace = tensor{workspace_dims}; + std::fill(workspace.begin(), workspace.end(), 0.0f); + + workspace_dev = handle.Write(workspace.data); + } + + dinput = tensor{lengths, input_strides}; + ref_dinput = tensor{lengths, input_strides}; + std::fill(dinput.begin(), dinput.end(), static_cast(0.0f)); + std::fill(ref_dinput.begin(), ref_dinput.end(), static_cast(0.0f)); + + dweight = tensor{weight_length, weight_strides}; + ref_dweight = tensor{weight_length, weight_strides}; + std::fill(dweight.begin(), dweight.end(), static_cast(0.0f)); + std::fill(ref_dweight.begin(), ref_dweight.end(), static_cast(0.0f)); + + auto out_strides = GetStrides(lengths, true); + doutput = tensor{lengths, out_strides}; + 
std::fill(doutput.begin(), doutput.end(), static_cast(1.0f)); + + input_dev = handle.Write(input.data); + weight_dev = handle.Write(weight.data); + doutput_dev = handle.Write(doutput.data); + dinput_dev = handle.Write(dinput.data); + dweight_dev = handle.Write(dweight.data); + } + + void RunTest() + { + cpu_prelu_backward(input, weight, doutput, ref_dinput, ref_dweight); + + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::PReLUBackward(handle, + workspace_dev.get(), + ws_sizeInBytes, + input.desc, + input_dev.get(), + weight.desc, + weight_dev.get(), + doutput.desc, + doutput_dev.get(), + dinput.desc, + dinput_dev.get(), + dweight.desc, + dweight_dev.get()); + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + dweight.data = handle.Read(dweight_dev, dweight.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + + auto error_dinput = miopen::rms_range(ref_dinput, dinput); + auto error_dweight = miopen::rms_range(ref_dweight, dweight); + ASSERT_EQ(miopen::range_distance(ref_dinput), miopen::range_distance(dinput)); + ASSERT_EQ(miopen::range_distance(ref_dweight), miopen::range_distance(dweight)); + EXPECT_LT(error_dinput, tolerance) + << "Error backward Input Gradient beyond tolerance Error: " << error_dinput + << ", Tolerance: " << tolerance; + EXPECT_LT(error_dweight, tolerance) + << "Error backward Weight Gradient beyond tolerance Error: " << error_dweight + << ", Tolerance: " << tolerance; + } + + PReLUTestCase prelu_config; + + tensor input; + tensor weight; + tensor doutput; + tensor dinput; + tensor dweight; + tensor workspace; + + tensor ref_dinput; + tensor ref_dweight; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr weight_dev; + miopen::Allocator::ManageDataPtr doutput_dev; + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr dweight_dev; + miopen::Allocator::ManageDataPtr workspace_dev; + + size_t ws_sizeInBytes; +}; diff --git a/test/nightlies/JenkinsfileNightlyAux b/test/nightlies/JenkinsfileNightlyAux index d1b7e70951..feb6ab5dc7 100644 --- a/test/nightlies/JenkinsfileNightlyAux +++ b/test/nightlies/JenkinsfileNightlyAux @@ -46,7 +46,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. 
+ sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyConv2D b/test/nightlies/JenkinsfileNightlyConv2D index 5b67507376..809cfff95c 100644 --- a/test/nightlies/JenkinsfileNightlyConv2D +++ b/test/nightlies/JenkinsfileNightlyConv2D @@ -47,7 +47,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=6 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyConv2Daux b/test/nightlies/JenkinsfileNightlyConv2Daux index e6439313f7..f967352b21 100644 --- a/test/nightlies/JenkinsfileNightlyConv2Daux +++ b/test/nightlies/JenkinsfileNightlyConv2Daux @@ -48,7 +48,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=6 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyFastFullConv2D b/test/nightlies/JenkinsfileNightlyFastFullConv2D index eaa703b3b1..a94992002b 100644 --- a/test/nightlies/JenkinsfileNightlyFastFullConv2D +++ b/test/nightlies/JenkinsfileNightlyFastFullConv2D @@ -43,7 +43,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 12, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_ENABLE_LOGGING_CMD=1 MIOPEN_LOG_LEVEL=6 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyFusions b/test/nightlies/JenkinsfileNightlyFusions index 84d8329c6a..269480baf7 100644 --- a/test/nightlies/JenkinsfileNightlyFusions +++ b/test/nightlies/JenkinsfileNightlyFusions @@ -46,7 +46,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyRNN b/test/nightlies/JenkinsfileNightlyRNN index b299f832cc..33ac860053 100644 --- a/test/nightlies/JenkinsfileNightlyRNN +++ b/test/nightlies/JenkinsfileNightlyRNN @@ -47,7 +47,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 8, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=Off -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyReduce b/test/nightlies/JenkinsfileNightlyReduce index a7d6c6becb..c2b14e8f79 100644 --- a/test/nightlies/JenkinsfileNightlyReduce +++ b/test/nightlies/JenkinsfileNightlyReduce @@ -47,7 +47,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 6, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=OFF -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." 
sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}" diff --git a/test/nightlies/JenkinsfileNightlyTensorOps b/test/nightlies/JenkinsfileNightlyTensorOps index e19a0a9f18..d326c64f9f 100644 --- a/test/nightlies/JenkinsfileNightlyTensorOps +++ b/test/nightlies/JenkinsfileNightlyTensorOps @@ -48,7 +48,8 @@ def buildJob(compiler, flags, image, test, testargs){ timeout(time: 6, unit: 'HOURS') { sh "echo \$HSA_ENABLE_SDMA" - sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." + // WORKAROUND_ISSUE_3192 Disabling MLIR for debug builds since MLIR generates sanitizer errors. + sh "rm -rf build; mkdir build; cd build; export PATH=/opt/rocm/bin:$PATH; CXX=${compiler} CXXFLAGS='-Werror' cmake -DMIOPEN_GPU_SYNC=On -DMIOPEN_USE_MLIR=Off -DMIOPEN_TEST_FLAGS='--disable-verification-cache' -DCMAKE_CXX_FLAGS_DEBUG='-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined' ${flags} .." sh "cd build; CTEST_PARALLEL_LEVEL=4 MIOPEN_VERIFY_CACHE_PATH=/var/jenkins/.cache/miopen/vcache dumb-init make -j\$(nproc) ${test}" sh "MIOPEN_LOG_LEVEL=5 ./build/bin/${test} ${testargs}"