From 255b4c3ffe81e3d5efd4ebc7b40a36dedd512574 Mon Sep 17 00:00:00 2001
From: Duong Le
Date: Sun, 14 Apr 2024 17:58:58 +0000
Subject: [PATCH 01/29] Add Pad Reflection implementation and test

---
 include/miopen/miopen.h | 27 +++
 src/CMakeLists.txt | 5 +
 src/include/miopen/pad_reflection.hpp | 45 +++++
 .../miopen/pad_reflection/invoke_params.hpp | 53 +++++
 .../pad_reflection/problem_description.hpp | 89 +++++++++
 src/include/miopen/pad_reflection/solvers.hpp | 55 ++++++
 src/kernels/MIOpenPadReflection.cpp | 146 ++++++++++++++
 src/pad_reflection.cpp | 67 +++++++
 src/pad_reflection/problem_description.cpp | 55 ++++++
 src/pad_reflection_api.cpp | 54 +++++
 src/solver/pad_reflection/pad_reflection.cpp | 176 +++++++++++++++++
 test/cpu_pad_reflection.hpp | 91 +++++++++
 test/gtest/pad_reflection.cpp | 65 ++++++
 test/gtest/pad_reflection.hpp | 186 ++++++++++++++++++
 14 files changed, 1114 insertions(+)
 create mode 100644 src/include/miopen/pad_reflection.hpp
 create mode 100644 src/include/miopen/pad_reflection/invoke_params.hpp
 create mode 100644 src/include/miopen/pad_reflection/problem_description.hpp
 create mode 100644 src/include/miopen/pad_reflection/solvers.hpp
 create mode 100644 src/kernels/MIOpenPadReflection.cpp
 create mode 100644 src/pad_reflection.cpp
 create mode 100644 src/pad_reflection/problem_description.cpp
 create mode 100644 src/pad_reflection_api.cpp
 create mode 100644 src/solver/pad_reflection/pad_reflection.cpp
 create mode 100644 test/cpu_pad_reflection.hpp
 create mode 100644 test/gtest/pad_reflection.cpp
 create mode 100644 test/gtest/pad_reflection.hpp

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index e768c7b349..bf69585717 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -5795,6 +5795,33 @@ MIOPEN_EXPORT miopenStatus_t miopenSumForward(miopenHandle_t handle,
 
 #ifdef MIOPEN_BETA_API
 
+
+/*! @brief Execute a 2D reflection padding forward layer
+ *
+ * Pads the input tensor x by reflecting its values across the tensor borders
+ * and writes the result to the output tensor y.
+ *
+ * @param handle   MIOpen handle (input)
+ * @param xDesc    Tensor descriptor for data input tensor x (input)
+ * @param x        Data tensor x (input)
+ * @param yDesc    Tensor descriptor for output data tensor y (input)
+ * @param y        Data tensor y (output)
+ * @param padding  Padding sizes; padding[0] = left (W) pad, padding[2] = top (H) pad (input)
+ * @return         miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle,
+                                                 const miopenTensorDescriptor_t xDesc,
+                                                 const void* x,
+                                                 const miopenTensorDescriptor_t yDesc,
+                                                 void* y,
+                                                 const int * padding);
+
+/** @} */
+// CLOSEOUT SUM DOXYGEN GROUP
+#endif
+
+#ifdef MIOPEN_BETA_API
+
 /*! @ingroup argmax
  * @brief Find the index of the maximum value of a tensor across dimensions.
* diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1c7f8f7a8e..9550c878e1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -152,6 +152,7 @@ set( MIOpen_Source reducetensor.cpp reducetensor_api.cpp reduce/problem_description.cpp + pad_reflection/problem_description.cpp rnn.cpp rnn_api.cpp rnn/rnn_util.cpp @@ -269,10 +270,12 @@ set( MIOpen_Source solver/pooling/backwardNd.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_sum.cpp + solver/pad_reflection/pad_reflection.cpp solver/softmax/attn_softmax.cpp solver/softmax/softmax.cpp subbuffers.cpp sum_api.cpp + pad_reflection_api.cpp target_properties.cpp temp_file.cpp tensor.cpp @@ -468,6 +471,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/MIOpenSoftmax.cl kernels/MIOpenSoftmaxAttn.cpp kernels/MIOpenSum.cpp + kernels/MIOpenPadReflection.cpp kernels/MIOpenUtilKernels3.cl kernels/MIOpenUtilKernels4.cl kernels/MIOpenUtilKernels5.cl @@ -602,6 +606,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ocl/fusionopconvocl.cpp ocl/fusionopbiasbnactivocl.cpp sum.cpp + pad_reflection.cpp ${PROJECT_BINARY_DIR}/db_path.cpp ) diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp new file mode 100644 index 0000000000..5fa22935ae --- /dev/null +++ b/src/include/miopen/pad_reflection.hpp @@ -0,0 +1,45 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef MIOPEN_SUM_HPP_ +#define MIOPEN_SUM_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +miopenStatus_t PadReflection(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const int* padding + ); + +} // namespace miopen +#endif // _MIOPEN_SUM_HPP_ diff --git a/src/include/miopen/pad_reflection/invoke_params.hpp b/src/include/miopen/pad_reflection/invoke_params.hpp new file mode 100644 index 0000000000..db2b434d06 --- /dev/null +++ b/src/include/miopen/pad_reflection/invoke_params.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include +#include + +namespace miopen { +namespace pad_reflection { + +struct InvokeParams : public miopen::InvokeParams +{ + InvokeParams() = default; + + const TensorDescriptor* xDesc = nullptr; + const TensorDescriptor* yDesc = nullptr; + + ConstData_t x = nullptr; + Data_t y = nullptr; + + const int* padding = nullptr; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace pad_reflection + +} // namespace miopen diff --git a/src/include/miopen/pad_reflection/problem_description.hpp b/src/include/miopen/pad_reflection/problem_description.hpp new file mode 100644 index 0000000000..87c6521017 --- /dev/null +++ b/src/include/miopen/pad_reflection/problem_description.hpp @@ -0,0 +1,89 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace pad_reflection { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& xDesc_, + const TensorDescriptor& yDesc_) + : xDesc(xDesc_), yDesc(yDesc_) + { + } + + const TensorDescriptor& GetXDesc() const { return xDesc; } + const TensorDescriptor& GetYDesc() const { return yDesc; } + + + bool IsSameType() const + { + if(xDesc.GetType() != yDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, "Pad Reflection: Tensor types do not match."); +#else + return false; +#endif + } + return true; + } + + bool IsAllPacked() const + { + if(!(xDesc.IsPacked() && yDesc.IsPacked())) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, "Pad Reflection: Unpacked tensors not supported."); +#else + return false; +#endif + } + return true; + } + + + + NetworkConfig MakeNetworkConfig() const override; + +private: + TensorDescriptor xDesc; + TensorDescriptor yDesc; +}; + +} // namespace pad_reflection + +} // namespace miopen diff --git a/src/include/miopen/pad_reflection/solvers.hpp b/src/include/miopen/pad_reflection/solvers.hpp new file mode 100644 index 0000000000..3becfd36b1 --- /dev/null +++ b/src/include/miopen/pad_reflection/solvers.hpp @@ -0,0 +1,55 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace pad_reflection { + +using PadReflectionSolver = NonTunableSolverBase; + +struct PadReflection final : PadReflectionSolver +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const override; + bool MayNeedWorkspace() const override { return true; } +}; + +} // namespace pad_reflection + +} // namespace solver + +} // namespace miopen diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp new file mode 100644 index 0000000000..11175485c3 --- /dev/null +++ b/src/kernels/MIOpenPadReflection.cpp @@ -0,0 +1,146 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" + +// extern "C" __global__ void PadReflection2dFwdContiguous(const FLOAT* __restrict__ input, +// FLOAT* __restrict__ output, +// uint64_t output_size, +// long padding_l, long padding_t, +// const size_t * input_tv_size, +// const size_t * output_tv_size, +// const size_t * input_tv_stride +// ) +// { +// const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; +// if(gid >= output_size) +// return; + +// // gid to output n, c, h, w +// size_t in_H = input_tv_size[2]; +// size_t in_W = input_tv_size[3]; + +// long n, c, h, w; +// // GET_NCHW(n, c, h, w, gid, output); +// ulong nch = (gid) / output_tv_size[3]; +// w = (gid) % output_tv_size[3]; +// ulong nc = nch / output_tv_size[2]; +// h = nch % output_tv_size[2]; +// n = nc / output_tv_size[1]; +// c = nc % output_tv_size[1]; + +// long in_start_x = max(0L, -padding_l); +// long in_start_y = max(0L, -padding_t); +// long out_start_x = max(0L, padding_l); +// long out_start_y = max(0L, padding_t); + +// if (w < padding_l) { +// w = padding_l * 2 - w; +// } else if (padding_l <= w && w < in_W + padding_l) { +// w = w; +// } else { +// w = (in_W + padding_l - 1) * 2 - w; +// } +// w = w - out_start_x + in_start_x; + +// if (h < padding_t) { +// h = padding_t * 2 - h; +// } else if (padding_t <= h && h < in_H + padding_t) { +// h = h; +// } else { +// h = (in_H + padding_t - 1) * 2 - h; +// } +// h = h - out_start_y + in_start_y; + +// output[gid] = input[(input_tv_stride[3] * (w)) + +// (input_tv_stride[2] * (h)) + +// (input_tv_stride[1] * (c)) + +// (input_tv_stride[0] * (n)) + +// 0]; +// } + +extern "C" __global__ void PadReflection2dFwdContiguous(const FLOAT* __restrict__ input, + FLOAT* __restrict__ output, + uint64_t output_size, + long padding_l, long padding_t, + const size_t in_H, + const size_t in_W, + const size_t output_size_1, + const size_t output_size_2, + const size_t output_size_3, + const size_t input_stride_0, + const size_t input_stride_1, + const size_t input_stride_2, + const size_t input_stride_3 + ) +{ + const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; + if(gid >= output_size) + return; + + long n, c, h, w; + // GET_NCHW(n, c, h, w, gid, output); + ulong nch = (gid) / output_size_3; + w = (gid) % output_size_3; + ulong nc = nch / output_size_2; + h = nch % output_size_2; + n = nc / output_size_1; + c = nc % output_size_1; + + long in_start_x = max(0L, -padding_l); + long in_start_y = max(0L, -padding_t); + long out_start_x = max(0L, padding_l); + long out_start_y = max(0L, padding_t); + + if (w < padding_l) { + w = padding_l * 2 - w; + } else if (padding_l <= w && w < in_W + padding_l) { + w = w; + } else { + w = (in_W + padding_l - 1) * 2 - w; + } + w = w - out_start_x + in_start_x; + + if (h < padding_t) { + h = padding_t * 2 - h; + } else if (padding_t <= h && h < in_H + padding_t) { + h = h; + } else { + h = (in_H + padding_t - 1) * 2 - h; + } + h = h - out_start_y + in_start_y; + + output[gid] = input[(input_stride_3 * (w)) + + (input_stride_2 * (h)) + + (input_stride_1 * (c)) + + (input_stride_0 * (n)) + + 0]; +} \ No newline at end of file diff --git a/src/pad_reflection.cpp b/src/pad_reflection.cpp new file mode 100644 index 0000000000..2f2a2b4241 --- /dev/null +++ b/src/pad_reflection.cpp @@ -0,0 +1,67 @@ +/******************************************************************************* + * + * MIT 
License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t PadReflection(Handle& handle, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const int* padding + ) +{ + const auto problem = pad_reflection::ProblemDescription{xDesc, yDesc}; + + const auto invoke_params = [&]() { + auto tmp = pad_reflection::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.yDesc = &yDesc; + tmp.x = x; + tmp.y = y; + tmp.padding = padding; + return tmp; + }(); + + const auto algo = AlgorithmName{"PadReflection"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/pad_reflection/problem_description.cpp b/src/pad_reflection/problem_description.cpp new file mode 100644 index 0000000000..e0b3545854 --- /dev/null +++ b/src/pad_reflection/problem_description.cpp @@ -0,0 +1,55 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace pad_reflection { + +NetworkConfig ProblemDescription::MakeNetworkConfig() const +{ + auto xlength = xDesc.GetLengths(); + auto ylength = yDesc.GetLengths(); + + auto output_size = std::accumulate( + ylength.begin(), ylength.end(), static_cast(1), std::multiplies()); + auto dtype = xDesc.GetType(); + + std::ostringstream ss; + + ss << "dtype" << dtype; + ss << "output_size" << output_size; + + return NetworkConfig{ss.str()}; +} + +} // namespace pad_reflection + +} // namespace miopen diff --git a/src/pad_reflection_api.cpp b/src/pad_reflection_api.cpp new file mode 100644 index 0000000000..26c99c25ea --- /dev/null +++ b/src/pad_reflection_api.cpp @@ -0,0 +1,54 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include + + +extern "C" miopenStatus_t miopenPadReflection(miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const int * padding + ) +{ + MIOPEN_LOG_FUNCTION( + handle, xDesc, x, yDesc, y, padding); + + return miopen::try_([&] { + miopen::PadReflection(miopen::deref(handle), + miopen::deref(xDesc), + DataCast(x), + miopen::deref(yDesc), + DataCast(y), + padding + ); + }); +} diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp new file mode 100644 index 0000000000..b73aa4e7b7 --- /dev/null +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -0,0 +1,176 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace pad_reflection { + +bool PadReflection::IsApplicable(const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const +{ + return true; +} + +ConvSolution +PadReflection::GetSolution(const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const +{ + auto result = ConvSolution{miopenStatusSuccess}; + + auto dtype = problem.GetXDesc().GetType(); + auto xdims = problem.GetXDesc().GetLengths(); + auto ydims = problem.GetYDesc().GetLengths(); + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPadReflection.cpp"; + kernel.kernel_name = "PadReflection2dFwdContiguous"; + auto output_numel = + std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(output_numel, xlocalsize); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels[0]); + decltype(auto) params = raw_params.CastTo(); + + auto xdims = params.xDesc->GetLengths(); + auto ydims = params.yDesc->GetLengths(); + + auto xstrides = params.xDesc->GetStrides(); + + auto output_size = + std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); + + // const size_t* xdims_data; + // const size_t* ydims_data; + // const size_t* xstrides_data; + + // hipMalloc(&xdims_data, sizeof(size_t) * xdims.size()); + // hipMalloc(&ydims_data, 
sizeof(size_t) * ydims.size()); + // hipMalloc(&xstrides_data, sizeof(size_t) * xstrides.size()); + + // hipMemcpy((void*)xdims_data, + // xdims.data(), + // sizeof(size_t) * xdims.size(), + // hipMemcpyHostToDevice); + // hipMemcpy((void*)ydims_data, + // ydims.data(), + // sizeof(size_t) * ydims.size(), + // hipMemcpyHostToDevice); + // hipMemcpy((void*)xstrides_data, + // xstrides.data(), + // sizeof(size_t) * xstrides.size(), + // hipMemcpyHostToDevice); + + // long padding_l = params.padding[0]; + // long padding_t = params.padding[2]; + // kernel(params.x, + // params.y, + // output_size, + // padding_l, + // padding_t, + // xdims_data, + // ydims_data, + // xstrides_data); + + // hipFree((void*)xdims_data); + // hipFree((void*)ydims_data); + // hipFree((void*)xstrides_data); + + + long padding_l = params.padding[0]; + long padding_t = params.padding[2]; + size_t in_H = xdims[2]; + size_t in_W = xdims[3]; + size_t output_size_1 = ydims[1]; + size_t output_size_2 = ydims[2]; + size_t output_size_3 = ydims[3]; + size_t input_stride_0 = xstrides[0]; + size_t input_stride_1 = xstrides[1]; + size_t input_stride_2 = xstrides[2]; + size_t input_stride_3 = xstrides[3]; + kernel(params.x, + params.y, + output_size, + padding_l, + padding_t, + in_H, + in_W, + output_size_1, + output_size_2, + output_size_3, + input_stride_0, + input_stride_1, + input_stride_2, + input_stride_3); + }; + }; + + return result; +} + +} // namespace pad_reflection + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_pad_reflection.hpp b/test/cpu_pad_reflection.hpp new file mode 100644 index 0000000000..a64069a5c6 --- /dev/null +++ b/test/cpu_pad_reflection.hpp @@ -0,0 +1,91 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_SUM_HPP +#define GUARD_CPU_SUM_HPP + +#include "tensor_holder.hpp" + +template +void cpu_pad_reflection(tensor input_tensor, + tensor& ref_output_tensor, + const int * padding_gpu + ) +{ + long padding_l = padding_gpu[0]; + long padding_t = padding_gpu[2]; + auto input_dims = input_tensor.desc.GetLengths(); + auto output_dims = ref_output_tensor.desc.GetLengths(); + auto input = input_tensor.data.data(); + auto output = ref_output_tensor.data.data(); + auto input_strides = input_tensor.desc.GetStrides(); + auto output_size = + std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); + + size_t in_H = input_dims[2]; + size_t in_W = input_dims[3]; + + for (size_t gid = 0; gid < output_size; ++gid) { + + long n, c, h, w; + // GET_NCHW(n, c, h, w, gid, output); + ulong nch = (gid) / output_dims[3]; + w = (gid) % output_dims[3]; + ulong nc = nch / output_dims[2]; + h = nch % output_dims[2]; + n = nc / output_dims[1]; + c = nc % output_dims[1]; + + long in_start_x = max(0L, -padding_l); + long in_start_y = max(0L, -padding_t); + long out_start_x = max(0L, padding_l); + long out_start_y = max(0L, padding_t); + + if (w < padding_l) { + w = padding_l * 2 - w; + } else if (padding_l <= w && w < in_W + padding_l) { + w = w; + } else { + w = (in_W + padding_l - 1) * 2 - w; + } + w = w - out_start_x + in_start_x; + + if (h < padding_t) { + h = padding_t * 2 - h; + } else if (padding_t <= h && h < in_H + padding_t) { + h = h; + } else { + h = (in_H + padding_t - 1) * 2 - h; + } + h = h - out_start_y + in_start_y; + + output[gid] = input[(input_strides[3] * (w)) + + (input_strides[2] * (h)) + + (input_strides[1] * (c)) + + (input_strides[0] * (n)) + + 0]; + } +} +#endif diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp new file mode 100644 index 0000000000..90f09eee41 --- /dev/null +++ b/test/gtest/pad_reflection.cpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "pad_reflection.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace pad_reflection { + +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct PadReflectionTestFloat : PadReflectionTest +{ +}; + +} // namespace sum_duong +using namespace pad_reflection; + +TEST_P(PadReflectionTestFloat, SumDuongTestFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(SumDuongTestSet, PadReflectionTestFloat, testing::ValuesIn(PadReflectionTestFloatConfigs())); diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp new file mode 100644 index 0000000000..9dcc60a4db --- /dev/null +++ b/test/gtest/pad_reflection.hpp @@ -0,0 +1,186 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "../driver/tensor_driver.hpp" +#include "cpu_pad_reflection.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include + +struct PadReflectionCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + int padding[4]; + friend std::ostream& operator<<(std::ostream& os, const PadReflectionCase& tc) + { + return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H + << " W:" << tc.W; + } + + std::vector GetInput() + { + if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, D, H, W}); + } + else if((N != 0) && (C != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, H, W}); + } + else if((N != 0) && (C != 0) && (W != 0)) + { + return std::vector({N, C, W}); + } + else if((N != 0) && (W != 0)) + { + return std::vector({N, W}); + } + else if((N != 0)) + { + return std::vector({N}); + } + else + { + std::cout << "Error Input Tensor Lengths\n" << std::endl; + return std::vector({0}); + } + } + + int * GetPadding() { + return padding; + } +}; + +std::vector PadReflectionTestFloatConfigs() +{ // n c d h w dim nanPropagation + // clang-format off + return { + // { 8, 120, 0, 0, 1}, //bart + // { 8, 120, 0, 0, 1}, + // { 8, 1023, 0, 0, 1}, //gpt_neo + // { 8, 1024, 0, 0, 768}, + // { 8, 1023, 0, 0, 1}, + // { 8, 1024, 0, 0, 768}, + // { 16, 1024, 0, 0, 768}, //gpt2 + // { 16, 1024, 0, 0, 768}, + { 1, 1, 0, 3, 3, {2, 2, 2, 2}}, + { 48, 8, 0, 512, 512, {1, 1, 1, 1}}, + { 48, 8, 0, 512, 512, {1, 1, 3, 3}}, + { 48, 8, 0, 512, 512, {0, 0, 2, 2}}, + { 16, 311, 0, 98, 512, {1, 1, 1, 1}}, + { 16, 311, 0, 98, 512, {1, 1, 3, 3}}, + { 16, 311, 0, 98, 512, {0, 0, 2, 2}}, + }; + // clang-format on +} + +template +struct PadReflectionTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + pad_reflection_config = GetParam(); + auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+
+        auto in_dims = pad_reflection_config.GetInput();
+        auto padding = pad_reflection_config.GetPadding();
+        input        = tensor<T>{in_dims}.generate(gen_value);
+        std::vector<size_t> out_dims;
+
+        for(int i = 0; i < in_dims.size(); i++)
+        {
+            if(i == 2)
+            {
+                out_dims.push_back(in_dims[i] + 2*padding[2]);
+            } else if (i == 3) {
+                out_dims.push_back(in_dims[i] + 2*padding[0]);
+            }
+            else {
+                out_dims.push_back(in_dims[i]);
+            }
+        }
+
+        output = tensor<T>{out_dims};
+        std::fill(output.begin(), output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        ref_output = tensor<T>{out_dims};
+        std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits<T>::quiet_NaN());
+
+        input_dev  = handle.Write(input.data);
+        output_dev = handle.Write(output.data);
+    }
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        auto padding  = pad_reflection_config.GetPadding();
+
+        cpu_pad_reflection(input, ref_output, padding);
+        miopenStatus_t status;
+        const int * padding_gpu;
+        hipMalloc(&padding_gpu, sizeof(int) * 4);
+        hipMemcpy((void*)padding_gpu, padding, sizeof(int) * 4, hipMemcpyHostToDevice);
+
+        status = miopen::PadReflection(handle,
+                                       input.desc,
+                                       input_dev.get(),
+                                       output.desc,
+                                       output_dev.get(),
+                                       padding_gpu
+                                       );
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        for(int i = 0; i < output.data.size(); ++i)
+        {
+            EXPECT_EQ(output.data[i], ref_output.data[i]);
+        }
+    }
+    PadReflectionCase pad_reflection_config;
+
+    tensor<T> input;
+    tensor<T> output;
+
+    tensor<T> ref_output;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+
+};

From ef7e9a5ec5f74250850bfc80ac2e3079eea42c65 Mon Sep 17 00:00:00 2001
From: Duong Le
Date: Sun, 14 Apr 2024 18:17:23 +0000
Subject: [PATCH 02/29] Change some define variable

---
 src/include/miopen/pad_reflection.hpp | 6 +++---
 test/cpu_pad_reflection.hpp | 4 ++--
 test/gtest/pad_reflection.cpp | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp
index 5fa22935ae..bbbcc8a1f8 100644
--- a/src/include/miopen/pad_reflection.hpp
+++ b/src/include/miopen/pad_reflection.hpp
@@ -23,8 +23,8 @@
  * SOFTWARE.
  *
  *******************************************************************************/
-#ifndef MIOPEN_SUM_HPP_
-#define MIOPEN_SUM_HPP_
+#ifndef MIOPEN_PAD_REFLECTION_HPP_
+#define MIOPEN_PAD_REFLECTION_HPP_
 
 #include 
 
@@ -42,4 +42,4 @@ miopenStatus_t PadReflection(Handle& handle,
                              );
 
 } // namespace miopen
-#endif // _MIOPEN_SUM_HPP_
+#endif // MIOPEN_PAD_REFLECTION_HPP_
diff --git a/test/cpu_pad_reflection.hpp b/test/cpu_pad_reflection.hpp
index a64069a5c6..0fd774ec71 100644
--- a/test/cpu_pad_reflection.hpp
+++ b/test/cpu_pad_reflection.hpp
@@ -23,8 +23,8 @@
  * SOFTWARE.
* *******************************************************************************/ -#ifndef GUARD_CPU_SUM_HPP -#define GUARD_CPU_SUM_HPP +#ifndef GUARD_CPU_PAD_REFLECTION_HPP +#define GUARD_CPU_PAD_REFLECTION_HPP #include "tensor_holder.hpp" diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp index 90f09eee41..b4986b94ae 100644 --- a/test/gtest/pad_reflection.cpp +++ b/test/gtest/pad_reflection.cpp @@ -46,10 +46,10 @@ struct PadReflectionTestFloat : PadReflectionTest { }; -} // namespace sum_duong +} // namespace pad_reflection using namespace pad_reflection; -TEST_P(PadReflectionTestFloat, SumDuongTestFw) +TEST_P(PadReflectionTestFloat, PadReflectionFw) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) { @@ -62,4 +62,4 @@ TEST_P(PadReflectionTestFloat, SumDuongTestFw) } }; -INSTANTIATE_TEST_SUITE_P(SumDuongTestSet, PadReflectionTestFloat, testing::ValuesIn(PadReflectionTestFloatConfigs())); +INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, PadReflectionTestFloat, testing::ValuesIn(PadReflectionTestFloatConfigs())); From c70a5ce445a99b52584465b2ee4b8dbd95161293 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Apr 2024 09:32:38 +0000 Subject: [PATCH 03/29] Change padding to std::vector, update kernel new format, add MIOpenDriver for pad_reflection --- driver/CMakeLists.txt | 1 + driver/dm_pad_reflection.cpp | 40 ++ driver/driver.hpp | 5 +- driver/pad_reflection_driver.hpp | 394 ++++++++++++++++++ include/miopen/miopen.h | 3 +- src/include/miopen/pad_reflection.hpp | 2 +- .../miopen/pad_reflection/invoke_params.hpp | 2 +- .../pad_reflection/problem_description.hpp | 8 +- src/kernels/MIOpenPadReflection.cpp | 160 ++++--- src/pad_reflection.cpp | 6 +- src/pad_reflection_api.cpp | 2 +- src/solver/pad_reflection/pad_reflection.cpp | 106 ++--- test/cpu_pad_reflection.hpp | 6 +- test/gtest/pad_reflection.hpp | 16 +- 14 files changed, 571 insertions(+), 180 deletions(-) create mode 100644 driver/dm_pad_reflection.cpp create mode 100644 driver/pad_reflection_driver.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 224e550fed..17a6c20628 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -51,6 +51,7 @@ add_executable(MIOpenDriver dm_softmax.cpp dm_sum.cpp dm_tensorop.cpp + dm_pad_reflection.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/dm_pad_reflection.cpp b/driver/dm_pad_reflection.cpp new file mode 100644 index 0000000000..295634d38c --- /dev/null +++ b/driver/dm_pad_reflection.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "pad_reflection_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "padreflection") + return new PadReflectionDriver(); + if(base_arg == "padreflectionfp16") + return new PadReflectionDriver(); + if(base_arg == "padreflectionbfp16") + return new PadReflectionDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index 4cfc2b544e..01eea79570 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -151,7 +151,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16]\n"); + "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], padreflection[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -176,7 +176,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && - arg != "catfp16" && arg != "catbfp16" && arg != "--version") + arg != "catfp16" && arg != "catbfp16" && arg != "padreflection" && arg != "padreflectionfp16" && + arg != "padreflectionbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/pad_reflection_driver.hpp b/driver/pad_reflection_driver.hpp new file mode 100644 index 0000000000..7cba644381 --- /dev/null +++ b/driver/pad_reflection_driver.hpp @@ -0,0 +1,394 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_PAD_REFLECTION_DRIVER_HPP +#define GUARD_MIOPEN_PAD_REFLECTION_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include "random.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#ifndef MLO_PADREFLECTIONHOST_H_ +#define MLO_PADREFLECTIONHOST_H_ + +template +void mloPadReflectionRunHost(miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* outputhost, + std::vector padding) +{ + auto input_dims = miopen::deref(inputDesc).GetLengths(); + auto output_dims = miopen::deref(outputDesc).GetLengths(); + + auto output_numel = + std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); + + long padding_l = padding[0]; + long padding_t = padding[2]; + auto input_strides = miopen::deref(inputDesc).GetStrides(); + size_t in_H = input_dims[2]; + size_t in_W = input_dims[3]; + + for(size_t gid = 0; gid < output_numel; gid++) + { + long n, c, h, w; + // GET_NCHW(n, c, h, w, gid, output); + ulong nch = (gid) / output_dims[3]; + w = (gid) % output_dims[3]; + ulong nc = nch / output_dims[2]; + h = nch % output_dims[2]; + n = nc / output_dims[1]; + c = nc % output_dims[1]; + + long in_start_x = max(0L, -padding_l); + long in_start_y = max(0L, -padding_t); + long out_start_x = max(0L, padding_l); + long out_start_y = max(0L, padding_t); + + if (w < padding_l) { + w = padding_l * 2 - w; + } else if (padding_l <= w && w < in_W + padding_l) { + w = w; + } else { + w = (in_W + padding_l - 1) * 2 - w; + } + w = w - out_start_x + in_start_x; + + if (h < padding_t) { + h = padding_t * 2 - h; + } else if (padding_t <= h && h < in_H + padding_t) { + h = h; + } else { + h = (in_H + padding_t - 1) * 2 - h; + } + h = h - out_start_y + in_start_y; + + outputhost[gid] = input[(input_strides[3] * (w)) + + (input_strides[2] * (h)) + + (input_strides[1] * (c)) + + (input_strides[0] * (n)) + + 0]; + } +} +#endif + +template +class PadReflectionDriver : public Driver +{ +public: + PadReflectionDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetInputTensorLengthsFromCmdLine(); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~PadReflectionDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + + std::unique_ptr in_dev; + std::unique_ptr out_dev; + + std::vector in; + std::vector out; + std::vector outhost; + + std::vector padding; +}; + +template +int PadReflectionDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int PadReflectionDriver::GetandSetData() +{ + 
std::string padding_str = inflags.GetValueStr("padding"); + std::stringstream ss(padding_str); + std::string token; + while (std::getline(ss, token, ',')) { + padding.push_back(std::stoul(token)); + } + + std::vector in_len = GetInputTensorLengthsFromCmdLine(); + + SetTensorNd(inputDesc, in_len, data_type); + + std::vector out_len; + + for(int i = 0; i < in_len.size(); i++) + { + //If H + if(i == 2) + { + out_len.push_back(in_len[i] + 2*padding[2]); + } + //If W + else if(i == 3) + { + out_len.push_back(in_len[i] + 2*padding[0]); + } + else { + out_len.push_back(in_len[i]); + } + } + + if(out_len.empty()) + out_len.push_back(1); + + SetTensorNd(outputDesc, out_len, data_type); + + return 0; +} + +template +int PadReflectionDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Pad Reflection (Default=1)", "int"); + inflags.AddInputFlag("batchsize", 'n', "256", "Mini-batch size (Default=100)", "int"); + inflags.AddInputFlag("in_channels", 'c', "4", "Number of Input Channels (Default=3)", "int"); + inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); + inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); + inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); + + inflags.AddInputFlag("padding", 'P', "1,1,1,1", "Padding array (Default=1,1,1,1)", "str"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +std::vector PadReflectionDriver::GetInputTensorLengthsFromCmdLine() +{ + int in_n = inflags.GetValueInt("batchsize"); + int in_c = inflags.GetValueInt("in_channels"); + int in_w = inflags.GetValueInt("in_w"); + int in_h = inflags.GetValueInt("in_h"); + int in_d = inflags.GetValueInt("in_d"); + + if((in_n != 0) && (in_c != 0) && (in_d != 0) && (in_h != 0) && (in_w != 0)) + { + return std::vector({in_n, in_c, in_d, in_h, in_w}); + } + else if((in_n != 0) && (in_c != 0) && (in_h != 0) && (in_w != 0)) + { + return std::vector({in_n, in_c, in_h, in_w}); + } + else if((in_n != 0) && (in_c != 0) && (in_w != 0)) + { + return std::vector({in_n, in_c, in_w}); + } + else if((in_n != 0) && (in_w != 0)) + { + return std::vector({in_n, in_w}); + } + else if(in_n != 0) + { + return std::vector({in_n}); + } + else + { + std::cerr << "Error Input Tensor Lengths\n" << std::endl; + return std::vector({0}); + } +} + +template +int PadReflectionDriver::AllocateBuffersAndCopy() +{ + size_t in_sz = GetTensorSize(inputDesc); + size_t out_sz = GetTensorSize(outputDesc); + + uint32_t ctx = 0; + + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + + in = std::vector(in_sz, static_cast(0)); + out = std::vector(out_sz, static_cast(0)); + outhost = std::vector(out_sz, static_cast(0)); + + for(int i = 0; i < in_sz; i++) + { + in[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + } + + if(in_dev->ToGPU(GetStream(), in.data()) != 0) + std::cerr << "Error copying (in) to GPU, size: " << in_dev->GetSize() << std::endl; + + if(out_dev->ToGPU(GetStream(), out.data()) != 0) + std::cerr << "Error copying (out) to GPU, size: " << out_dev->GetSize() << std::endl; + + return 
miopenStatusSuccess; +} + +template +int PadReflectionDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenPadReflection(GetHandle(), + inputDesc, + in_dev->GetMem(), + outputDesc, + out_dev->GetMem(), + padding); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Forward Pad Reflection Elapsed: " << t.gettime_ms() / iter + << " ms\n"; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Forward Pad Reflection Elapsed: " << kernel_average_time << " ms\n"; + } + + if(out_dev->FromGPU(GetStream(), out.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << out_dev->GetSize() << std::endl; + + return miopenStatusSuccess; +} + +template +int PadReflectionDriver::RunForwardCPU() +{ + mloPadReflectionRunHost( + inputDesc, outputDesc, in.data(), outhost.data(), padding); + + return miopenStatusSuccess; +} + +template +int PadReflectionDriver::RunBackwardGPU() +{ + return miopenStatusSuccess; +} + +template +Tref PadReflectionDriver::GetTolerance() +{ + return 0; +} + +template +int PadReflectionDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(outhost, out); + + if(!std::isfinite(error) || error > 0) + { + std::cout << "Pad Reflection FAILED: " << error << " > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Pad Reflection Verifies OK on CPU reference (" << error << " == " << tolerance + << ')' << std::endl; + } + + return miopenStatusSuccess; +} + +template +int PadReflectionDriver::VerifyBackward() +{ + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_PAD_REFLECTION_DRIVER_HPP diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index bf69585717..7965edb26b 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -26,6 +26,7 @@ #ifndef MIOPEN_GUARD_MIOPEN_H_ #define MIOPEN_GUARD_MIOPEN_H_ +#include #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wextern-c-compat" @@ -5814,7 +5815,7 @@ MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle, const void* x, const miopenTensorDescriptor_t yDesc, void* y, - const int * padding); + const std::vector padding); /** @} */ // CLOSEOUT SUM DOXYGEN GROUP diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp index bbbcc8a1f8..9b08a6eeaf 100644 --- a/src/include/miopen/pad_reflection.hpp +++ b/src/include/miopen/pad_reflection.hpp @@ -38,7 +38,7 @@ miopenStatus_t PadReflection(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const int* padding + const std::vector padding ); } // namespace miopen diff --git a/src/include/miopen/pad_reflection/invoke_params.hpp b/src/include/miopen/pad_reflection/invoke_params.hpp index db2b434d06..afa9999d8d 100644 --- a/src/include/miopen/pad_reflection/invoke_params.hpp +++ b/src/include/miopen/pad_reflection/invoke_params.hpp @@ -42,7 +42,7 @@ struct InvokeParams : public miopen::InvokeParams ConstData_t x = nullptr; Data_t y = nullptr; - const int* padding = nullptr; + const std::vector* padding = 
nullptr; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/pad_reflection/problem_description.hpp b/src/include/miopen/pad_reflection/problem_description.hpp index 87c6521017..3ffa4a4284 100644 --- a/src/include/miopen/pad_reflection/problem_description.hpp +++ b/src/include/miopen/pad_reflection/problem_description.hpp @@ -40,13 +40,16 @@ namespace pad_reflection { struct ProblemDescription : ProblemDescriptionBase { ProblemDescription(const TensorDescriptor& xDesc_, - const TensorDescriptor& yDesc_) - : xDesc(xDesc_), yDesc(yDesc_) + const TensorDescriptor& yDesc_, + const std::vector padding_ + ) + : xDesc(xDesc_), yDesc(yDesc_), padding(padding_) { } const TensorDescriptor& GetXDesc() const { return xDesc; } const TensorDescriptor& GetYDesc() const { return yDesc; } + const std::vector& GetPadding() const { return padding; } bool IsSameType() const @@ -82,6 +85,7 @@ struct ProblemDescription : ProblemDescriptionBase private: TensorDescriptor xDesc; TensorDescriptor yDesc; + std::vector padding; }; } // namespace pad_reflection diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp index 11175485c3..2d360d45f2 100644 --- a/src/kernels/MIOpenPadReflection.cpp +++ b/src/kernels/MIOpenPadReflection.cpp @@ -31,116 +31,98 @@ #include "float_types.h" -// extern "C" __global__ void PadReflection2dFwdContiguous(const FLOAT* __restrict__ input, -// FLOAT* __restrict__ output, -// uint64_t output_size, -// long padding_l, long padding_t, -// const size_t * input_tv_size, -// const size_t * output_tv_size, -// const size_t * input_tv_stride -// ) -// { -// const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; -// if(gid >= output_size) -// return; - -// // gid to output n, c, h, w -// size_t in_H = input_tv_size[2]; -// size_t in_W = input_tv_size[3]; - -// long n, c, h, w; -// // GET_NCHW(n, c, h, w, gid, output); -// ulong nch = (gid) / output_tv_size[3]; -// w = (gid) % output_tv_size[3]; -// ulong nc = nch / output_tv_size[2]; -// h = nch % output_tv_size[2]; -// n = nc / output_tv_size[1]; -// c = nc % output_tv_size[1]; - -// long in_start_x = max(0L, -padding_l); -// long in_start_y = max(0L, -padding_t); -// long out_start_x = max(0L, padding_l); -// long out_start_y = max(0L, padding_t); - -// if (w < padding_l) { -// w = padding_l * 2 - w; -// } else if (padding_l <= w && w < in_W + padding_l) { -// w = w; -// } else { -// w = (in_W + padding_l - 1) * 2 - w; -// } -// w = w - out_start_x + in_start_x; - -// if (h < padding_t) { -// h = padding_t * 2 - h; -// } else if (padding_t <= h && h < in_H + padding_t) { -// h = h; -// } else { -// h = (in_H + padding_t - 1) * 2 - h; -// } -// h = h - out_start_y + in_start_y; - -// output[gid] = input[(input_tv_stride[3] * (w)) + -// (input_tv_stride[2] * (h)) + -// (input_tv_stride[1] * (c)) + -// (input_tv_stride[0] * (n)) + -// 0]; -// } - -extern "C" __global__ void PadReflection2dFwdContiguous(const FLOAT* __restrict__ input, - FLOAT* __restrict__ output, - uint64_t output_size, - long padding_l, long padding_t, - const size_t in_H, - const size_t in_W, - const size_t output_size_1, - const size_t output_size_2, - const size_t output_size_3, - const size_t input_stride_0, - const size_t input_stride_1, - const size_t input_stride_2, - const size_t input_stride_3 - ) +template +__device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, + TO* __restrict__ output, + uint64_t output_size, + long padding_l, + long 
padding_t, + const size_t in_H, + const size_t in_W, + const size_t output_size_1, + const size_t output_size_2, + const size_t output_size_3, + const size_t input_stride_0, + const size_t input_stride_1, + const size_t input_stride_2, + const size_t input_stride_3) { const size_t gid = threadIdx.x + blockIdx.x * blockDim.x; if(gid >= output_size) return; long n, c, h, w; - // GET_NCHW(n, c, h, w, gid, output); ulong nch = (gid) / output_size_3; - w = (gid) % output_size_3; - ulong nc = nch / output_size_2; - h = nch % output_size_2; - n = nc / output_size_1; - c = nc % output_size_1; + w = (gid) % output_size_3; + ulong nc = nch / output_size_2; + h = nch % output_size_2; + n = nc / output_size_1; + c = nc % output_size_1; - long in_start_x = max(0L, -padding_l); - long in_start_y = max(0L, -padding_t); + long in_start_x = max(0L, -padding_l); + long in_start_y = max(0L, -padding_t); long out_start_x = max(0L, padding_l); long out_start_y = max(0L, padding_t); - if (w < padding_l) { + if(w < padding_l) + { w = padding_l * 2 - w; - } else if (padding_l <= w && w < in_W + padding_l) { + } + else if(padding_l <= w && w < in_W + padding_l) + { w = w; - } else { + } + else + { w = (in_W + padding_l - 1) * 2 - w; } w = w - out_start_x + in_start_x; - if (h < padding_t) { + if(h < padding_t) + { h = padding_t * 2 - h; - } else if (padding_t <= h && h < in_H + padding_t) { + } + else if(padding_t <= h && h < in_H + padding_t) + { h = h; - } else { + } + else + { h = (in_H + padding_t - 1) * 2 - h; } h = h - out_start_y + in_start_y; - output[gid] = input[(input_stride_3 * (w)) + - (input_stride_2 * (h)) + - (input_stride_1 * (c)) + - (input_stride_0 * (n)) + - 0]; + output[gid] = input[(input_stride_3 * (w)) + (input_stride_2 * (h)) + (input_stride_1 * (c)) + + (input_stride_0 * (n)) + 0]; +} + +extern "C" __global__ void PadReflection2dFwdContiguous(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + uint64_t output_size, + long padding_l, + long padding_t, + const size_t in_H, + const size_t in_W, + const size_t output_size_1, + const size_t output_size_2, + const size_t output_size_3, + const size_t input_stride_0, + const size_t input_stride_1, + const size_t input_stride_2, + const size_t input_stride_3) +{ + padReflection2dFwdContiguous(input, + output, + output_size, + padding_l, + padding_t, + in_H, + in_W, + output_size_1, + output_size_2, + output_size_3, + input_stride_0, + input_stride_1, + input_stride_2, + input_stride_3); } \ No newline at end of file diff --git a/src/pad_reflection.cpp b/src/pad_reflection.cpp index 2f2a2b4241..2c3dee192d 100644 --- a/src/pad_reflection.cpp +++ b/src/pad_reflection.cpp @@ -40,10 +40,10 @@ miopenStatus_t PadReflection(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const int* padding + const std::vector padding ) { - const auto problem = pad_reflection::ProblemDescription{xDesc, yDesc}; + const auto problem = pad_reflection::ProblemDescription{xDesc, yDesc, padding}; const auto invoke_params = [&]() { auto tmp = pad_reflection::InvokeParams{}; @@ -52,7 +52,7 @@ miopenStatus_t PadReflection(Handle& handle, tmp.yDesc = &yDesc; tmp.x = x; tmp.y = y; - tmp.padding = padding; + tmp.padding = &padding; return tmp; }(); diff --git a/src/pad_reflection_api.cpp b/src/pad_reflection_api.cpp index 26c99c25ea..cfefee3dcb 100644 --- a/src/pad_reflection_api.cpp +++ b/src/pad_reflection_api.cpp @@ -36,7 +36,7 @@ extern "C" miopenStatus_t miopenPadReflection(miopenHandle_t handle, const void* x, const 
miopenTensorDescriptor_t yDesc, void* y, - const int * padding + const std::vector padding ) { MIOPEN_LOG_FUNCTION( diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index b73aa4e7b7..e3540f7562 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -52,42 +52,42 @@ PadReflection::GetSolution(const ExecutionContext& context, { auto result = ConvSolution{miopenStatusSuccess}; - auto dtype = problem.GetXDesc().GetType(); - auto xdims = problem.GetXDesc().GetLengths(); - auto ydims = problem.GetYDesc().GetLengths(); - - auto kernel = KernelInfo{}; - - kernel.kernel_file = "MIOpenPadReflection.cpp"; - kernel.kernel_name = "PadReflection2dFwdContiguous"; - auto output_numel = - std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); - - const auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, - {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, - {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, - {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + auto input_dtype = miopen::GetDataType(problem.GetXDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetYDesc().GetType()); + auto xdims = problem.GetXDesc().GetLengths(); + auto ydims = problem.GetYDesc().GetLengths(); + + { + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenPadReflection.cpp"; + kernel.kernel_name = "PadReflection2dFwdContiguous"; + auto output_numel = + std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); + + const auto build_params = KernelBuildParameters{ + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}, + }; - size_t xlocalsize = LOCAL_SIZE; - size_t xgridsize = AlignUp(output_numel, xlocalsize); - size_t ylocalsize = 1; - size_t ygridsize = 1; - size_t zlocalsize = 1; - size_t zgridsize = 1; - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(output_numel, xlocalsize); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); - result.construction_params.push_back(kernel); + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + result.construction_params.push_back(kernel); + } result.invoker_factory = [](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels[0]); @@ -101,45 +101,9 @@ PadReflection::GetSolution(const ExecutionContext& context, auto output_size = std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); - // const size_t* xdims_data; - // const size_t* ydims_data; - // const size_t* xstrides_data; - - // hipMalloc(&xdims_data, sizeof(size_t) * xdims.size()); - // hipMalloc(&ydims_data, sizeof(size_t) * ydims.size()); - // hipMalloc(&xstrides_data, sizeof(size_t) * xstrides.size()); - - // hipMemcpy((void*)xdims_data, - // xdims.data(), - // sizeof(size_t) * xdims.size(), - // hipMemcpyHostToDevice); - // hipMemcpy((void*)ydims_data, - // ydims.data(), - // sizeof(size_t) * ydims.size(), - // hipMemcpyHostToDevice); - // hipMemcpy((void*)xstrides_data, - // xstrides.data(), - // sizeof(size_t) * xstrides.size(), - // hipMemcpyHostToDevice); - - // long padding_l = params.padding[0]; - // long padding_t = params.padding[2]; - // kernel(params.x, - // params.y, - // output_size, - // padding_l, - // padding_t, - // xdims_data, - // ydims_data, - // xstrides_data); - - // hipFree((void*)xdims_data); - // hipFree((void*)ydims_data); - // hipFree((void*)xstrides_data); - - - long padding_l = params.padding[0]; - long padding_t = params.padding[2]; + auto padding = params.padding; + long padding_l = (*padding)[0]; + long padding_t = (*padding)[2]; size_t in_H = xdims[2]; size_t in_W = xdims[3]; size_t output_size_1 = ydims[1]; diff --git a/test/cpu_pad_reflection.hpp b/test/cpu_pad_reflection.hpp index 0fd774ec71..befa63f08b 100644 --- a/test/cpu_pad_reflection.hpp +++ b/test/cpu_pad_reflection.hpp @@ -31,11 +31,11 @@ template void cpu_pad_reflection(tensor input_tensor, tensor& ref_output_tensor, - const int * padding_gpu + const std::vector padding ) { - long padding_l = padding_gpu[0]; - long padding_t = padding_gpu[2]; + long padding_l = padding[0]; + long padding_t = padding[2]; auto input_dims = input_tensor.desc.GetLengths(); auto output_dims = ref_output_tensor.desc.GetLengths(); auto input = input_tensor.data.data(); diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp index 9dcc60a4db..b4d6ba2755 100644 --- a/test/gtest/pad_reflection.hpp +++ b/test/gtest/pad_reflection.hpp @@ -77,8 +77,12 @@ struct PadReflectionCase } } - int * GetPadding() { - return padding; + std::vector GetPadding() { + std::vector paddingVector; + for (int i = 0; 
i < 4; ++i) { + paddingVector.push_back(static_cast(padding[i])); + } + return paddingVector; } }; @@ -149,16 +153,16 @@ struct PadReflectionTest : public ::testing::TestWithParam cpu_pad_reflection(input, ref_output, padding); miopenStatus_t status; - const int * padding_gpu; - hipMalloc(&padding_gpu, sizeof(int) * 4); - hipMemcpy((void*)padding_gpu, padding, sizeof(int) * 4, hipMemcpyHostToDevice); + // const int * padding_gpu; + // hipMalloc(&padding_gpu, sizeof(int) * 4); + // hipMemcpy((void*)padding_gpu, padding, sizeof(int) * 4, hipMemcpyHostToDevice); status = miopen::PadReflection(handle, input.desc, input_dev.get(), output.desc, output_dev.get(), - padding_gpu + padding ); EXPECT_EQ(status, miopenStatusSuccess); From 49e7a4687c816804e6d654cb2097dba6041b315c Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Apr 2024 10:15:38 +0000 Subject: [PATCH 04/29] add padreflection docs and update cmakelist to alphabetic ordering --- docs/reference/index.rst | 1 + driver/CMakeLists.txt | 2 +- include/miopen/miopen.h | 12 ++++++++---- src/CMakeLists.txt | 8 ++++---- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02bcb88622..60883f9a1b 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -31,4 +31,5 @@ The MIOpen API library is structured as follows: * :doc:`Sum <../doxygen/html/group__sum>` (experimental) * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) + * :doc:`PadReflection <../doxygen/html/group__padreflection>` (experimental) * :doc:`Argmax<./argmax>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 17a6c20628..bfd196f790 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -45,13 +45,13 @@ add_executable(MIOpenDriver dm_groupnorm.cpp dm_layernorm.cpp dm_lrn.cpp + dm_pad_reflection.cpp dm_pool.cpp dm_reduce.cpp dm_rnn.cpp dm_softmax.cpp dm_sum.cpp dm_tensorop.cpp - dm_pad_reflection.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 7965edb26b..003b989f9a 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -69,6 +69,7 @@ * @defgroup argmax * @defgroup groupnorm * @defgroup cat + * @defgroup padreflection * */ @@ -5795,9 +5796,12 @@ MIOPEN_EXPORT miopenStatus_t miopenSumForward(miopenHandle_t handle, #endif #ifdef MIOPEN_BETA_API - - -/*! @brief Execute a sum forward layer +// Padreflection APIs +/** @addtogroup padreflection + * + * @{ + */ +/*! 
@brief Add padding by the reflection of the tensor * * @param handle MIOpen handle (input) * @param nanPropagation Nan number propagation mode (input) @@ -5818,7 +5822,7 @@ MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle, const std::vector padding); /** @} */ -// CLOSEOUT SUM DOXYGEN GROUP +// CLOSEOUT PAD REFLECTION DOXYGEN GROUP #endif #ifdef MIOPEN_BETA_API diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fd6157964c..06a3d3d2d0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -144,6 +144,8 @@ set( MIOpen_Source mha/problem_description.cpp op_args.cpp operator.cpp + pad_reflection/problem_description.cpp + pad_reflection_api.cpp performance_config.cpp pooling/problem_description.cpp pooling_api.cpp @@ -154,7 +156,6 @@ set( MIOpen_Source reducetensor.cpp reducetensor_api.cpp reduce/problem_description.cpp - pad_reflection/problem_description.cpp rnn.cpp rnn_api.cpp rnn/rnn_util.cpp @@ -265,6 +266,7 @@ set( MIOpen_Source solver/layernorm/forward_layernorm2d_ck.cpp solver/layernorm/forward_layernorm4d_ck.cpp solver/mha/mha_solver.cpp + solver/pad_reflection/pad_reflection.cpp solver/pooling/forward2d.cpp solver/pooling/forwardNaive.cpp solver/pooling/forwardNd.cpp @@ -272,12 +274,10 @@ set( MIOpen_Source solver/pooling/backwardNd.cpp solver/reduce/forward_argmax.cpp solver/reduce/forward_sum.cpp - solver/pad_reflection/pad_reflection.cpp solver/softmax/attn_softmax.cpp solver/softmax/softmax.cpp subbuffers.cpp sum_api.cpp - pad_reflection_api.cpp target_properties.cpp temp_file.cpp tensor.cpp @@ -604,11 +604,11 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN hip/hip_build_utils.cpp hip/batched_transpose_sol.cpp hip/general_tensor_reorder_sol.cpp + pad_reflection.cpp pooling.cpp ocl/fusionopconvocl.cpp ocl/fusionopbiasbnactivocl.cpp sum.cpp - pad_reflection.cpp ${PROJECT_BINARY_DIR}/db_path.cpp ) From 00905ef966ab250842837649db42d63e2376ee83 Mon Sep 17 00:00:00 2001 From: duong Date: Mon, 15 Apr 2024 15:28:45 +0000 Subject: [PATCH 05/29] Fix some warning error --- src/include/miopen/pad_reflection.hpp | 2 +- .../pad_reflection/problem_description.hpp | 2 -- src/include/miopen/pad_reflection/solvers.hpp | 7 ++++++- src/kernels/MIOpenPadReflection.cpp | 2 -- src/solver/pad_reflection/pad_reflection.cpp | 19 +++++++++++++++---- test/cpu_pad_reflection.hpp | 2 -- 6 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp index 9b08a6eeaf..66f7d9f07a 100644 --- a/src/include/miopen/pad_reflection.hpp +++ b/src/include/miopen/pad_reflection.hpp @@ -38,7 +38,7 @@ miopenStatus_t PadReflection(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const std::vector padding + std::vector padding ); } // namespace miopen diff --git a/src/include/miopen/pad_reflection/problem_description.hpp b/src/include/miopen/pad_reflection/problem_description.hpp index 3ffa4a4284..d5d488a3f1 100644 --- a/src/include/miopen/pad_reflection/problem_description.hpp +++ b/src/include/miopen/pad_reflection/problem_description.hpp @@ -78,8 +78,6 @@ struct ProblemDescription : ProblemDescriptionBase return true; } - - NetworkConfig MakeNetworkConfig() const override; private: diff --git a/src/include/miopen/pad_reflection/solvers.hpp b/src/include/miopen/pad_reflection/solvers.hpp index 3becfd36b1..df867a3776 100644 --- a/src/include/miopen/pad_reflection/solvers.hpp +++ b/src/include/miopen/pad_reflection/solvers.hpp @@ -43,9 +43,14 @@ struct 
PadReflection final : PadReflectionSolver bool IsApplicable(const ExecutionContext& context, const miopen::pad_reflection::ProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, const miopen::pad_reflection::ProblemDescription& problem) const override; - bool MayNeedWorkspace() const override { return true; } + + std::size_t GetWorkspaceSize(const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const override; + + bool MayNeedWorkspace() const override { return false; } }; } // namespace pad_reflection diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp index 2d360d45f2..7d67dc7019 100644 --- a/src/kernels/MIOpenPadReflection.cpp +++ b/src/kernels/MIOpenPadReflection.cpp @@ -70,7 +70,6 @@ __device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, } else if(padding_l <= w && w < in_W + padding_l) { - w = w; } else { @@ -84,7 +83,6 @@ __device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, } else if(padding_t <= h && h < in_H + padding_t) { - h = h; } else { diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index e3540f7562..9856ab98d6 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -40,15 +40,26 @@ namespace solver { namespace pad_reflection { -bool PadReflection::IsApplicable(const ExecutionContext& context, - const miopen::pad_reflection::ProblemDescription& problem) const +bool PadReflection::IsApplicable([[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const { + if(!problem.IsSameType()) + return false; + if(!problem.IsAllPacked()) + return false; return true; } +std::size_t +PadReflection::GetWorkspaceSize([[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const +{ + return 0; +} + ConvSolution -PadReflection::GetSolution(const ExecutionContext& context, - const miopen::pad_reflection::ProblemDescription& problem) const +PadReflection::GetSolution([[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; diff --git a/test/cpu_pad_reflection.hpp b/test/cpu_pad_reflection.hpp index befa63f08b..b5d6f413e9 100644 --- a/test/cpu_pad_reflection.hpp +++ b/test/cpu_pad_reflection.hpp @@ -66,7 +66,6 @@ void cpu_pad_reflection(tensor input_tensor, if (w < padding_l) { w = padding_l * 2 - w; } else if (padding_l <= w && w < in_W + padding_l) { - w = w; } else { w = (in_W + padding_l - 1) * 2 - w; } @@ -75,7 +74,6 @@ void cpu_pad_reflection(tensor input_tensor, if (h < padding_t) { h = padding_t * 2 - h; } else if (padding_t <= h && h < in_H + padding_t) { - h = h; } else { h = (in_H + padding_t - 1) * 2 - h; } From 6f8afd0cf81b889ed5457a7f09812e857e35ecb2 Mon Sep 17 00:00:00 2001 From: duong Date: Mon, 15 Apr 2024 15:30:43 +0000 Subject: [PATCH 06/29] githook indentation --- src/include/miopen/pad_reflection.hpp | 11 ++-- .../pad_reflection/problem_description.hpp | 4 +- src/include/miopen/pad_reflection/solvers.hpp | 15 +++-- src/solver/pad_reflection/pad_reflection.cpp | 23 +++---- test/cpu_pad_reflection.hpp | 63 ++++++++++--------- 5 files changed, 62 insertions(+), 54 deletions(-) diff --git a/src/include/miopen/pad_reflection.hpp 
b/src/include/miopen/pad_reflection.hpp index 66f7d9f07a..6bde02014a 100644 --- a/src/include/miopen/pad_reflection.hpp +++ b/src/include/miopen/pad_reflection.hpp @@ -34,12 +34,11 @@ struct Handle; struct TensorDescriptor; miopenStatus_t PadReflection(Handle& handle, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - std::vector padding - ); + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + std::vector padding); } // namespace miopen #endif // MIOPEN_PAD_REFLECTION_HPP_ diff --git a/src/include/miopen/pad_reflection/problem_description.hpp b/src/include/miopen/pad_reflection/problem_description.hpp index d5d488a3f1..c81ea198eb 100644 --- a/src/include/miopen/pad_reflection/problem_description.hpp +++ b/src/include/miopen/pad_reflection/problem_description.hpp @@ -41,8 +41,7 @@ struct ProblemDescription : ProblemDescriptionBase { ProblemDescription(const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const std::vector padding_ - ) + const std::vector padding_) : xDesc(xDesc_), yDesc(yDesc_), padding(padding_) { } @@ -51,7 +50,6 @@ struct ProblemDescription : ProblemDescriptionBase const TensorDescriptor& GetYDesc() const { return yDesc; } const std::vector& GetPadding() const { return padding; } - bool IsSameType() const { if(xDesc.GetType() != yDesc.GetType()) diff --git a/src/include/miopen/pad_reflection/solvers.hpp b/src/include/miopen/pad_reflection/solvers.hpp index df867a3776..44668f886d 100644 --- a/src/include/miopen/pad_reflection/solvers.hpp +++ b/src/include/miopen/pad_reflection/solvers.hpp @@ -35,7 +35,8 @@ namespace solver { namespace pad_reflection { -using PadReflectionSolver = NonTunableSolverBase; +using PadReflectionSolver = + NonTunableSolverBase; struct PadReflection final : PadReflectionSolver { @@ -44,11 +45,13 @@ struct PadReflection final : PadReflectionSolver bool IsApplicable(const ExecutionContext& context, const miopen::pad_reflection::ProblemDescription& problem) const override; - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::pad_reflection::ProblemDescription& problem) const override; - - std::size_t GetWorkspaceSize(const ExecutionContext& context, - const miopen::pad_reflection::ProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const override; + + std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const override; bool MayNeedWorkspace() const override { return false; } }; diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index 9856ab98d6..a75feadfe0 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -40,8 +40,9 @@ namespace solver { namespace pad_reflection { -bool PadReflection::IsApplicable([[maybe_unused]] const ExecutionContext& context, - [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const +bool PadReflection::IsApplicable( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const { if(!problem.IsSameType()) return false; @@ -50,16 +51,16 @@ bool PadReflection::IsApplicable([[maybe_unused]] const ExecutionContext& contex return true; } -std::size_t -PadReflection::GetWorkspaceSize([[maybe_unused]] const 
ExecutionContext& context, - [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const +std::size_t PadReflection::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const { return 0; } -ConvSolution -PadReflection::GetSolution([[maybe_unused]] const ExecutionContext& context, - [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const +ConvSolution PadReflection::GetSolution( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; @@ -112,9 +113,9 @@ PadReflection::GetSolution([[maybe_unused]] const ExecutionContext& context, auto output_size = std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); - auto padding = params.padding; - long padding_l = (*padding)[0]; - long padding_t = (*padding)[2]; + auto padding = params.padding; + long padding_l = (*padding)[0]; + long padding_t = (*padding)[2]; size_t in_H = xdims[2]; size_t in_W = xdims[3]; size_t output_size_1 = ydims[1]; diff --git a/test/cpu_pad_reflection.hpp b/test/cpu_pad_reflection.hpp index b5d6f413e9..ef15909e7c 100644 --- a/test/cpu_pad_reflection.hpp +++ b/test/cpu_pad_reflection.hpp @@ -30,16 +30,15 @@ template void cpu_pad_reflection(tensor input_tensor, - tensor& ref_output_tensor, - const std::vector padding - ) + tensor& ref_output_tensor, + const std::vector padding) { - long padding_l = padding[0]; - long padding_t = padding[2]; - auto input_dims = input_tensor.desc.GetLengths(); - auto output_dims = ref_output_tensor.desc.GetLengths(); - auto input = input_tensor.data.data(); - auto output = ref_output_tensor.data.data(); + long padding_l = padding[0]; + long padding_t = padding[2]; + auto input_dims = input_tensor.desc.GetLengths(); + auto output_dims = ref_output_tensor.desc.GetLengths(); + auto input = input_tensor.data.data(); + auto output = ref_output_tensor.data.data(); auto input_strides = input_tensor.desc.GetStrides(); auto output_size = std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); @@ -47,43 +46,51 @@ void cpu_pad_reflection(tensor input_tensor, size_t in_H = input_dims[2]; size_t in_W = input_dims[3]; - for (size_t gid = 0; gid < output_size; ++gid) { + for(size_t gid = 0; gid < output_size; ++gid) + { long n, c, h, w; // GET_NCHW(n, c, h, w, gid, output); ulong nch = (gid) / output_dims[3]; - w = (gid) % output_dims[3]; - ulong nc = nch / output_dims[2]; - h = nch % output_dims[2]; - n = nc / output_dims[1]; - c = nc % output_dims[1]; + w = (gid) % output_dims[3]; + ulong nc = nch / output_dims[2]; + h = nch % output_dims[2]; + n = nc / output_dims[1]; + c = nc % output_dims[1]; - long in_start_x = max(0L, -padding_l); - long in_start_y = max(0L, -padding_t); + long in_start_x = max(0L, -padding_l); + long in_start_y = max(0L, -padding_t); long out_start_x = max(0L, padding_l); long out_start_y = max(0L, padding_t); - if (w < padding_l) { + if(w < padding_l) + { w = padding_l * 2 - w; - } else if (padding_l <= w && w < in_W + padding_l) { - } else { + } + else if(padding_l <= w && w < in_W + padding_l) + { + } + else + { w = (in_W + padding_l - 1) * 2 - w; } w = w - out_start_x + in_start_x; - if (h < padding_t) { + if(h < padding_t) + { h = padding_t * 2 - h; - } else if (padding_t <= h && h < in_H + padding_t) { - } else { + } + else if(padding_t <= h && h < in_H + padding_t) + { + } 
+ else + { h = (in_H + padding_t - 1) * 2 - h; } h = h - out_start_y + in_start_y; - output[gid] = input[(input_strides[3] * (w)) + - (input_strides[2] * (h)) + - (input_strides[1] * (c)) + - (input_strides[0] * (n)) + - 0]; + output[gid] = input[(input_strides[3] * (w)) + (input_strides[2] * (h)) + + (input_strides[1] * (c)) + (input_strides[0] * (n)) + 0]; } } #endif From 508fabdfb2cb686c3bcc8815853ae4ee232c8988 Mon Sep 17 00:00:00 2001 From: duong Date: Mon, 15 Apr 2024 15:38:45 +0000 Subject: [PATCH 07/29] Code clean up --- driver/dm_pad_reflection.cpp | 1 - driver/pad_reflection_driver.hpp | 2 -- src/include/miopen/pad_reflection.hpp | 1 - src/include/miopen/pad_reflection/invoke_params.hpp | 1 - .../miopen/pad_reflection/problem_description.hpp | 1 - src/include/miopen/pad_reflection/solvers.hpp | 1 - src/pad_reflection.cpp | 1 - src/pad_reflection/problem_description.cpp | 1 - src/pad_reflection_api.cpp | 1 - test/gtest/pad_reflection.cpp | 1 + test/gtest/pad_reflection.hpp | 13 +------------ 11 files changed, 2 insertions(+), 22 deletions(-) diff --git a/driver/dm_pad_reflection.cpp b/driver/dm_pad_reflection.cpp index 295634d38c..b036d5ccb6 100644 --- a/driver/dm_pad_reflection.cpp +++ b/driver/dm_pad_reflection.cpp @@ -25,7 +25,6 @@ *******************************************************************************/ #include "registry_driver_maker.hpp" #include "pad_reflection_driver.hpp" - static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "padreflection") diff --git a/driver/pad_reflection_driver.hpp b/driver/pad_reflection_driver.hpp index 7cba644381..8e7c2634e8 100644 --- a/driver/pad_reflection_driver.hpp +++ b/driver/pad_reflection_driver.hpp @@ -54,7 +54,6 @@ void mloPadReflectionRunHost(miopenTensorDescriptor_t inputDesc, { auto input_dims = miopen::deref(inputDesc).GetLengths(); auto output_dims = miopen::deref(outputDesc).GetLengths(); - auto output_numel = std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); @@ -187,7 +186,6 @@ int PadReflectionDriver::GetandSetData() SetTensorNd(inputDesc, in_len, data_type); std::vector out_len; - for(int i = 0; i < in_len.size(); i++) { //If H diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp index 6bde02014a..fcd265922a 100644 --- a/src/include/miopen/pad_reflection.hpp +++ b/src/include/miopen/pad_reflection.hpp @@ -39,6 +39,5 @@ miopenStatus_t PadReflection(Handle& handle, const TensorDescriptor& yDesc, Data_t y, std::vector padding); - } // namespace miopen #endif // MIOPEN_PAD_REFLECTION_HPP_ diff --git a/src/include/miopen/pad_reflection/invoke_params.hpp b/src/include/miopen/pad_reflection/invoke_params.hpp index afa9999d8d..7121364f7f 100644 --- a/src/include/miopen/pad_reflection/invoke_params.hpp +++ b/src/include/miopen/pad_reflection/invoke_params.hpp @@ -43,7 +43,6 @@ struct InvokeParams : public miopen::InvokeParams Data_t y = nullptr; const std::vector* padding = nullptr; - std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } }; diff --git a/src/include/miopen/pad_reflection/problem_description.hpp b/src/include/miopen/pad_reflection/problem_description.hpp index c81ea198eb..19b70ca476 100644 --- a/src/include/miopen/pad_reflection/problem_description.hpp +++ b/src/include/miopen/pad_reflection/problem_description.hpp @@ -45,7 +45,6 @@ struct ProblemDescription : ProblemDescriptionBase : xDesc(xDesc_), yDesc(yDesc_), padding(padding_) { } - const TensorDescriptor& GetXDesc() const { return 
xDesc; } const TensorDescriptor& GetYDesc() const { return yDesc; } const std::vector& GetPadding() const { return padding; } diff --git a/src/include/miopen/pad_reflection/solvers.hpp b/src/include/miopen/pad_reflection/solvers.hpp index 44668f886d..e6913f5192 100644 --- a/src/include/miopen/pad_reflection/solvers.hpp +++ b/src/include/miopen/pad_reflection/solvers.hpp @@ -48,7 +48,6 @@ struct PadReflection final : PadReflectionSolver ConvSolution GetSolution(const ExecutionContext& context, const miopen::pad_reflection::ProblemDescription& problem) const override; - std::size_t GetWorkspaceSize(const ExecutionContext& context, const miopen::pad_reflection::ProblemDescription& problem) const override; diff --git a/src/pad_reflection.cpp b/src/pad_reflection.cpp index 2c3dee192d..eacd2605ef 100644 --- a/src/pad_reflection.cpp +++ b/src/pad_reflection.cpp @@ -58,7 +58,6 @@ miopenStatus_t PadReflection(Handle& handle, const auto algo = AlgorithmName{"PadReflection"}; const auto solvers = solver::SolverContainer{}; - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); return miopenStatusSuccess; diff --git a/src/pad_reflection/problem_description.cpp b/src/pad_reflection/problem_description.cpp index e0b3545854..98cc727c17 100644 --- a/src/pad_reflection/problem_description.cpp +++ b/src/pad_reflection/problem_description.cpp @@ -46,7 +46,6 @@ NetworkConfig ProblemDescription::MakeNetworkConfig() const ss << "dtype" << dtype; ss << "output_size" << output_size; - return NetworkConfig{ss.str()}; } diff --git a/src/pad_reflection_api.cpp b/src/pad_reflection_api.cpp index cfefee3dcb..3b67d5a2d6 100644 --- a/src/pad_reflection_api.cpp +++ b/src/pad_reflection_api.cpp @@ -30,7 +30,6 @@ #include #include - extern "C" miopenStatus_t miopenPadReflection(miopenHandle_t handle, const miopenTensorDescriptor_t xDesc, const void* x, diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp index b4986b94ae..d2dd368ef1 100644 --- a/test/gtest/pad_reflection.cpp +++ b/test/gtest/pad_reflection.cpp @@ -44,6 +44,7 @@ std::string GetFloatArg() struct PadReflectionTestFloat : PadReflectionTest { + }; } // namespace pad_reflection diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp index b4d6ba2755..c02522e17b 100644 --- a/test/gtest/pad_reflection.hpp +++ b/test/gtest/pad_reflection.hpp @@ -87,17 +87,9 @@ struct PadReflectionCase }; std::vector PadReflectionTestFloatConfigs() -{ // n c d h w dim nanPropagation +{ // n c d h w padding // clang-format off return { - // { 8, 120, 0, 0, 1}, //bart - // { 8, 120, 0, 0, 1}, - // { 8, 1023, 0, 0, 1}, //gpt_neo - // { 8, 1024, 0, 0, 768}, - // { 8, 1023, 0, 0, 1}, - // { 8, 1024, 0, 0, 768}, - // { 16, 1024, 0, 0, 768}, //gpt2 - // { 16, 1024, 0, 0, 768}, { 1, 1, 0, 3, 3, {2, 2, 2, 2}}, { 48, 8, 0, 512, 512, {1, 1, 1, 1}}, { 48, 8, 0, 512, 512, {1, 1, 3, 3}}, @@ -153,9 +145,6 @@ struct PadReflectionTest : public ::testing::TestWithParam cpu_pad_reflection(input, ref_output, padding); miopenStatus_t status; - // const int * padding_gpu; - // hipMalloc(&padding_gpu, sizeof(int) * 4); - // hipMemcpy((void*)padding_gpu, padding, sizeof(int) * 4, hipMemcpyHostToDevice); status = miopen::PadReflection(handle, input.desc, From a4a249b6a11b07e3e35532a9ccc5165f59b17982 Mon Sep 17 00:00:00 2001 From: duong Date: Mon, 15 Apr 2024 15:39:21 +0000 Subject: [PATCH 08/29] githook cleanup --- driver/pad_reflection_driver.hpp | 87 ++++++++++--------- .../miopen/pad_reflection/invoke_params.hpp | 4 +- src/pad_reflection.cpp | 25 +++--- 
src/pad_reflection_api.cpp | 25 +++--- test/gtest/pad_reflection.cpp | 5 +- test/gtest/pad_reflection.hpp | 37 ++++---- 6 files changed, 92 insertions(+), 91 deletions(-) diff --git a/driver/pad_reflection_driver.hpp b/driver/pad_reflection_driver.hpp index 8e7c2634e8..edb35fee26 100644 --- a/driver/pad_reflection_driver.hpp +++ b/driver/pad_reflection_driver.hpp @@ -57,51 +57,58 @@ void mloPadReflectionRunHost(miopenTensorDescriptor_t inputDesc, auto output_numel = std::accumulate(output_dims.begin(), output_dims.end(), 1L, std::multiplies()); - long padding_l = padding[0]; - long padding_t = padding[2]; - auto input_strides = miopen::deref(inputDesc).GetStrides(); - size_t in_H = input_dims[2]; - size_t in_W = input_dims[3]; + long padding_l = padding[0]; + long padding_t = padding[2]; + auto input_strides = miopen::deref(inputDesc).GetStrides(); + size_t in_H = input_dims[2]; + size_t in_W = input_dims[3]; for(size_t gid = 0; gid < output_numel; gid++) { long n, c, h, w; // GET_NCHW(n, c, h, w, gid, output); ulong nch = (gid) / output_dims[3]; - w = (gid) % output_dims[3]; - ulong nc = nch / output_dims[2]; - h = nch % output_dims[2]; - n = nc / output_dims[1]; - c = nc % output_dims[1]; - - long in_start_x = max(0L, -padding_l); - long in_start_y = max(0L, -padding_t); + w = (gid) % output_dims[3]; + ulong nc = nch / output_dims[2]; + h = nch % output_dims[2]; + n = nc / output_dims[1]; + c = nc % output_dims[1]; + + long in_start_x = max(0L, -padding_l); + long in_start_y = max(0L, -padding_t); long out_start_x = max(0L, padding_l); long out_start_y = max(0L, padding_t); - if (w < padding_l) { + if(w < padding_l) + { w = padding_l * 2 - w; - } else if (padding_l <= w && w < in_W + padding_l) { + } + else if(padding_l <= w && w < in_W + padding_l) + { w = w; - } else { + } + else + { w = (in_W + padding_l - 1) * 2 - w; } w = w - out_start_x + in_start_x; - if (h < padding_t) { + if(h < padding_t) + { h = padding_t * 2 - h; - } else if (padding_t <= h && h < in_H + padding_t) { + } + else if(padding_t <= h && h < in_H + padding_t) + { h = h; - } else { + } + else + { h = (in_H + padding_t - 1) * 2 - h; } h = h - out_start_y + in_start_y; - outputhost[gid] = input[(input_strides[3] * (w)) + - (input_strides[2] * (h)) + - (input_strides[1] * (c)) + - (input_strides[0] * (n)) + - 0]; + outputhost[gid] = input[(input_strides[3] * (w)) + (input_strides[2] * (h)) + + (input_strides[1] * (c)) + (input_strides[0] * (n)) + 0]; } } #endif @@ -177,7 +184,8 @@ int PadReflectionDriver::GetandSetData() std::string padding_str = inflags.GetValueStr("padding"); std::stringstream ss(padding_str); std::string token; - while (std::getline(ss, token, ',')) { + while(std::getline(ss, token, ',')) + { padding.push_back(std::stoul(token)); } @@ -188,17 +196,18 @@ int PadReflectionDriver::GetandSetData() std::vector out_len; for(int i = 0; i < in_len.size(); i++) { - //If H + // If H if(i == 2) { - out_len.push_back(in_len[i] + 2*padding[2]); + out_len.push_back(in_len[i] + 2 * padding[2]); } - //If W + // If W else if(i == 3) { - out_len.push_back(in_len[i] + 2*padding[0]); + out_len.push_back(in_len[i] + 2 * padding[0]); } - else { + else + { out_len.push_back(in_len[i]); } } @@ -276,8 +285,8 @@ int PadReflectionDriver::AllocateBuffersAndCopy() uint32_t ctx = 0; - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); + in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); + out_dev = std::unique_ptr(new GPUMem(ctx, 
out_sz, sizeof(Tgpu))); in = std::vector(in_sz, static_cast(0)); out = std::vector(out_sz, static_cast(0)); @@ -308,12 +317,8 @@ int PadReflectionDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenPadReflection(GetHandle(), - inputDesc, - in_dev->GetMem(), - outputDesc, - out_dev->GetMem(), - padding); + miopenPadReflection( + GetHandle(), inputDesc, in_dev->GetMem(), outputDesc, out_dev->GetMem(), padding); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -332,7 +337,8 @@ int PadReflectionDriver::RunForwardGPU() float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - std::cout << "GPU Kernel Time Forward Pad Reflection Elapsed: " << kernel_average_time << " ms\n"; + std::cout << "GPU Kernel Time Forward Pad Reflection Elapsed: " << kernel_average_time + << " ms\n"; } if(out_dev->FromGPU(GetStream(), out.data()) != 0) @@ -344,8 +350,7 @@ int PadReflectionDriver::RunForwardGPU() template int PadReflectionDriver::RunForwardCPU() { - mloPadReflectionRunHost( - inputDesc, outputDesc, in.data(), outhost.data(), padding); + mloPadReflectionRunHost(inputDesc, outputDesc, in.data(), outhost.data(), padding); return miopenStatusSuccess; } diff --git a/src/include/miopen/pad_reflection/invoke_params.hpp b/src/include/miopen/pad_reflection/invoke_params.hpp index 7121364f7f..c7b847676c 100644 --- a/src/include/miopen/pad_reflection/invoke_params.hpp +++ b/src/include/miopen/pad_reflection/invoke_params.hpp @@ -39,8 +39,8 @@ struct InvokeParams : public miopen::InvokeParams const TensorDescriptor* xDesc = nullptr; const TensorDescriptor* yDesc = nullptr; - ConstData_t x = nullptr; - Data_t y = nullptr; + ConstData_t x = nullptr; + Data_t y = nullptr; const std::vector* padding = nullptr; std::size_t GetWorkspaceSize() const { return 0; } diff --git a/src/pad_reflection.cpp b/src/pad_reflection.cpp index eacd2605ef..e3ac64d2be 100644 --- a/src/pad_reflection.cpp +++ b/src/pad_reflection.cpp @@ -36,23 +36,22 @@ namespace miopen { miopenStatus_t PadReflection(Handle& handle, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const std::vector padding - ) + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const std::vector padding) { const auto problem = pad_reflection::ProblemDescription{xDesc, yDesc, padding}; const auto invoke_params = [&]() { - auto tmp = pad_reflection::InvokeParams{}; - tmp.type = InvokeType::Run; - tmp.xDesc = &xDesc; - tmp.yDesc = &yDesc; - tmp.x = x; - tmp.y = y; - tmp.padding = &padding; + auto tmp = pad_reflection::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.yDesc = &yDesc; + tmp.x = x; + tmp.y = y; + tmp.padding = &padding; return tmp; }(); diff --git a/src/pad_reflection_api.cpp b/src/pad_reflection_api.cpp index 3b67d5a2d6..97fe6a7fe6 100644 --- a/src/pad_reflection_api.cpp +++ b/src/pad_reflection_api.cpp @@ -31,23 +31,20 @@ #include extern "C" miopenStatus_t miopenPadReflection(miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t yDesc, - void* y, - const std::vector padding - ) + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const std::vector padding) { - MIOPEN_LOG_FUNCTION( - handle, xDesc, x, yDesc, y, padding); + MIOPEN_LOG_FUNCTION(handle, xDesc, x, yDesc, y, padding); return miopen::try_([&] { 
miopen::PadReflection(miopen::deref(handle), - miopen::deref(xDesc), - DataCast(x), - miopen::deref(yDesc), - DataCast(y), - padding - ); + miopen::deref(xDesc), + DataCast(x), + miopen::deref(yDesc), + DataCast(y), + padding); }); } diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp index d2dd368ef1..1eefd15d92 100644 --- a/test/gtest/pad_reflection.cpp +++ b/test/gtest/pad_reflection.cpp @@ -44,7 +44,6 @@ std::string GetFloatArg() struct PadReflectionTestFloat : PadReflectionTest { - }; } // namespace pad_reflection @@ -63,4 +62,6 @@ TEST_P(PadReflectionTestFloat, PadReflectionFw) } }; -INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, PadReflectionTestFloat, testing::ValuesIn(PadReflectionTestFloatConfigs())); +INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, + PadReflectionTestFloat, + testing::ValuesIn(PadReflectionTestFloatConfigs())); diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp index c02522e17b..b755639f08 100644 --- a/test/gtest/pad_reflection.hpp +++ b/test/gtest/pad_reflection.hpp @@ -77,9 +77,11 @@ struct PadReflectionCase } } - std::vector GetPadding() { + std::vector GetPadding() + { std::vector paddingVector; - for (int i = 0; i < 4; ++i) { + for(int i = 0; i < 4; ++i) + { paddingVector.push_back(static_cast(padding[i])); } return paddingVector; @@ -107,24 +109,27 @@ struct PadReflectionTest : public ::testing::TestWithParam protected: void SetUp() override { - auto&& handle = get_handle(); - pad_reflection_config = GetParam(); + auto&& handle = get_handle(); + pad_reflection_config = GetParam(); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; auto in_dims = pad_reflection_config.GetInput(); auto padding = pad_reflection_config.GetPadding(); - input = tensor{in_dims}.generate(gen_value); + input = tensor{in_dims}.generate(gen_value); std::vector out_dims; for(int i = 0; i < in_dims.size(); i++) { if(i == 2) { - out_dims.push_back(in_dims[i] + 2*padding[2]); - } else if (i == 3) { - out_dims.push_back(in_dims[i] + 2*padding[0]); - } - else { + out_dims.push_back(in_dims[i] + 2 * padding[2]); + } + else if(i == 3) + { + out_dims.push_back(in_dims[i] + 2 * padding[0]); + } + else + { out_dims.push_back(in_dims[i]); } } @@ -141,18 +146,13 @@ struct PadReflectionTest : public ::testing::TestWithParam void RunTest() { auto&& handle = get_handle(); - auto padding = pad_reflection_config.GetPadding(); + auto padding = pad_reflection_config.GetPadding(); cpu_pad_reflection(input, ref_output, padding); miopenStatus_t status; - status = miopen::PadReflection(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - padding - ); + status = miopen::PadReflection( + handle, input.desc, input_dev.get(), output.desc, output_dev.get(), padding); EXPECT_EQ(status, miopenStatusSuccess); @@ -175,5 +175,4 @@ struct PadReflectionTest : public ::testing::TestWithParam miopen::Allocator::ManageDataPtr input_dev; miopen::Allocator::ManageDataPtr output_dev; - }; From a0ef46c20a734c34e1342c87b9cfb9f6d40e5098 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 16 Apr 2024 08:40:34 +0000 Subject: [PATCH 09/29] Change copyright 2023->2024, change padding from vector to pointer, add padding test half and bf16 --- driver/pad_reflection_driver.hpp | 14 +++--- include/miopen/miopen.h | 6 +-- src/include/miopen/pad_reflection.hpp | 5 ++- .../miopen/pad_reflection/invoke_params.hpp | 6 ++- .../pad_reflection/problem_description.hpp | 24 +++++++--- src/include/miopen/pad_reflection/solvers.hpp | 7 +-- 
src/kernels/MIOpenPadReflection.cpp | 2 +- src/pad_reflection.cpp | 10 +++-- src/pad_reflection/problem_description.cpp | 2 +- src/pad_reflection_api.cpp | 8 ++-- src/solver/pad_reflection/pad_reflection.cpp | 29 +++++++----- test/cpu_pad_reflection.hpp | 2 +- test/gtest/pad_reflection.cpp | 44 ++++++++++++++++++- test/gtest/pad_reflection.hpp | 16 +++---- 14 files changed, 121 insertions(+), 54 deletions(-) diff --git a/driver/pad_reflection_driver.hpp b/driver/pad_reflection_driver.hpp index edb35fee26..f964b47e6f 100644 --- a/driver/pad_reflection_driver.hpp +++ b/driver/pad_reflection_driver.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -85,7 +85,6 @@ void mloPadReflectionRunHost(miopenTensorDescriptor_t inputDesc, } else if(padding_l <= w && w < in_W + padding_l) { - w = w; } else { @@ -99,7 +98,6 @@ void mloPadReflectionRunHost(miopenTensorDescriptor_t inputDesc, } else if(padding_t <= h && h < in_H + padding_t) { - h = h; } else { @@ -189,6 +187,10 @@ int PadReflectionDriver::GetandSetData() padding.push_back(std::stoul(token)); } + if (!(padding.size() == 1 or padding.size() == 4)) { + std::cerr << "Error Padding Lengths\n" << std::endl; + } + std::vector in_len = GetInputTensorLengthsFromCmdLine(); SetTensorNd(inputDesc, in_len, data_type); @@ -230,7 +232,7 @@ int PadReflectionDriver::AddCmdLineArgs() inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); - inflags.AddInputFlag("padding", 'P', "1,1,1,1", "Padding array (Default=1,1,1,1)", "str"); + inflags.AddInputFlag("padding", 'P', "1,1,1,1", "Padding array (Default=1 or 1,1,1,1)", "str"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); @@ -318,7 +320,7 @@ int PadReflectionDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenPadReflection( - GetHandle(), inputDesc, in_dev->GetMem(), outputDesc, out_dev->GetMem(), padding); + GetHandle(), inputDesc, in_dev->GetMem(), outputDesc, out_dev->GetMem(), padding.data(), padding.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -374,7 +376,7 @@ int PadReflectionDriver::VerifyForward() const Tref tolerance = GetTolerance(); auto error = miopen::rms_range(outhost, out); - if(!std::isfinite(error) || error > 0) + if(std::abs(static_cast(error)) != 0.0f) { std::cout << "Pad Reflection FAILED: " << error << " > " << tolerance << std::endl; return EC_VerifyFwd; diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 003b989f9a..a5c92ad65f 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -26,7 +26,6 @@ #ifndef MIOPEN_GUARD_MIOPEN_H_ #define MIOPEN_GUARD_MIOPEN_H_ -#include #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wextern-c-compat" @@ -5819,7 +5818,8 @@ MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle, const void* x, const miopenTensorDescriptor_t yDesc, void* y, - const std::vector padding); + const size_t * padding, + const size_t num_padding); /** @} */ // CLOSEOUT PAD REFLECTION DOXYGEN GROUP diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp index fcd265922a..19d39b040b 100644 --- a/src/include/miopen/pad_reflection.hpp +++ b/src/include/miopen/pad_reflection.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -38,6 +38,7 @@ miopenStatus_t PadReflection(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - std::vector padding); + const size_t * padding, + const size_t num_padding); } // namespace miopen #endif // MIOPEN_PAD_REFLECTION_HPP_ diff --git a/src/include/miopen/pad_reflection/invoke_params.hpp b/src/include/miopen/pad_reflection/invoke_params.hpp index c7b847676c..2ff8bad10c 100644 --- a/src/include/miopen/pad_reflection/invoke_params.hpp +++ b/src/include/miopen/pad_reflection/invoke_params.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,7 +42,9 @@ struct InvokeParams : public miopen::InvokeParams ConstData_t x = nullptr; Data_t y = nullptr; - const std::vector* padding = nullptr; + const size_t * padding = nullptr; + size_t num_padding = 0; + std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } }; diff --git a/src/include/miopen/pad_reflection/problem_description.hpp b/src/include/miopen/pad_reflection/problem_description.hpp index 19b70ca476..65c258970d 100644 --- a/src/include/miopen/pad_reflection/problem_description.hpp +++ b/src/include/miopen/pad_reflection/problem_description.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -41,13 +41,13 @@ struct ProblemDescription : ProblemDescriptionBase { ProblemDescription(const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const std::vector padding_) - : xDesc(xDesc_), yDesc(yDesc_), padding(padding_) + const size_t num_padding_) + : xDesc(xDesc_), yDesc(yDesc_), num_padding(num_padding_) { } const TensorDescriptor& GetXDesc() const { return xDesc; } const TensorDescriptor& GetYDesc() const { return yDesc; } - const std::vector& GetPadding() const { return padding; } + size_t GetNumPadding() const { return num_padding; } bool IsSameType() const { @@ -75,12 +75,26 @@ struct ProblemDescription : ProblemDescriptionBase return true; } + + bool IsRightNumPadding() const + { + if(!(num_padding == 1 || num_padding == 4)) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, "Pad Reflection: Padding input needs to have 1 or 4 elements only."); +#else + return false; +#endif + } + return true; + } + NetworkConfig MakeNetworkConfig() const override; private: TensorDescriptor xDesc; TensorDescriptor yDesc; - std::vector padding; + size_t num_padding; }; } // namespace pad_reflection diff --git a/src/include/miopen/pad_reflection/solvers.hpp b/src/include/miopen/pad_reflection/solvers.hpp index e6913f5192..e2da94bc4b 100644 --- a/src/include/miopen/pad_reflection/solvers.hpp +++ b/src/include/miopen/pad_reflection/solvers.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -48,11 +48,6 @@ struct PadReflection final : PadReflectionSolver ConvSolution GetSolution(const ExecutionContext& context, const miopen::pad_reflection::ProblemDescription& problem) const override; - std::size_t - GetWorkspaceSize(const ExecutionContext& context, - const miopen::pad_reflection::ProblemDescription& problem) const override; - - bool MayNeedWorkspace() const override { return false; } }; } // namespace pad_reflection diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp index 7d67dc7019..fa70b00add 100644 --- a/src/kernels/MIOpenPadReflection.cpp +++ b/src/kernels/MIOpenPadReflection.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/pad_reflection.cpp b/src/pad_reflection.cpp index e3ac64d2be..5ddc10c3df 100644 --- a/src/pad_reflection.cpp +++ b/src/pad_reflection.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -40,9 +40,10 @@ miopenStatus_t PadReflection(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const std::vector padding) + const size_t * padding, + const size_t num_padding) { - const auto problem = pad_reflection::ProblemDescription{xDesc, yDesc, padding}; + const auto problem = pad_reflection::ProblemDescription{xDesc, yDesc, num_padding}; const auto invoke_params = [&]() { auto tmp = pad_reflection::InvokeParams{}; @@ -51,7 +52,8 @@ miopenStatus_t PadReflection(Handle& handle, tmp.yDesc = &yDesc; tmp.x = x; tmp.y = y; - tmp.padding = &padding; + tmp.padding = padding; + tmp.num_padding = num_padding; return tmp; }(); diff --git a/src/pad_reflection/problem_description.cpp b/src/pad_reflection/problem_description.cpp index 98cc727c17..403b9e913b 100644 --- a/src/pad_reflection/problem_description.cpp +++ b/src/pad_reflection/problem_description.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/pad_reflection_api.cpp b/src/pad_reflection_api.cpp index 97fe6a7fe6..e8854b2aac 100644 --- a/src/pad_reflection_api.cpp +++ b/src/pad_reflection_api.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -35,7 +35,8 @@ extern "C" miopenStatus_t miopenPadReflection(miopenHandle_t handle, const void* x, const miopenTensorDescriptor_t yDesc, void* y, - const std::vector padding) + const size_t * padding, + const size_t num_padding) { MIOPEN_LOG_FUNCTION(handle, xDesc, x, yDesc, y, padding); @@ -45,6 +46,7 @@ extern "C" miopenStatus_t miopenPadReflection(miopenHandle_t handle, DataCast(x), miopen::deref(yDesc), DataCast(y), - padding); + padding, + num_padding); }); } diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index a75feadfe0..148cde5f3b 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -48,16 +48,11 @@ bool PadReflection::IsApplicable( return false; if(!problem.IsAllPacked()) return false; + if(!problem.IsRightNumPadding()) + return false; return true; } -std::size_t PadReflection::GetWorkspaceSize( - [[maybe_unused]] const ExecutionContext& context, - [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const -{ - return 0; -} - ConvSolution PadReflection::GetSolution( [[maybe_unused]] const ExecutionContext& context, [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const @@ -113,9 +108,21 @@ ConvSolution PadReflection::GetSolution( auto output_size = std::accumulate(ydims.begin(), ydims.end(), 1ULL, std::multiplies()); - auto padding = params.padding; - long padding_l = (*padding)[0]; - long padding_t = (*padding)[2]; + auto padding = params.padding; + auto num_padding = params.num_padding; + long padding_l, padding_t; + if(num_padding == 1) + { + padding_l = padding[0]; + padding_t = padding[0]; + } + else if(num_padding == 4) + { + padding_l = padding[0]; + padding_t = padding[2]; + } + // long padding_l = (*padding)[0]; + // long padding_t = (*padding)[2]; size_t in_H = xdims[2]; size_t in_W = xdims[3]; size_t output_size_1 = ydims[1]; diff --git a/test/cpu_pad_reflection.hpp b/test/cpu_pad_reflection.hpp index ef15909e7c..55e6935d8b 100644 --- a/test/cpu_pad_reflection.hpp +++ b/test/cpu_pad_reflection.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp index 1eefd15d92..93f93aecd4 100644 --- a/test/gtest/pad_reflection.cpp +++ b/test/gtest/pad_reflection.cpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -46,6 +46,14 @@ struct PadReflectionTestFloat : PadReflectionTest { }; +struct PadReflectionTestHalf : PadReflectionTest +{ +}; + +struct PadReflectionTestBF16 : PadReflectionTest +{ +}; + } // namespace pad_reflection using namespace pad_reflection; @@ -62,6 +70,40 @@ TEST_P(PadReflectionTestFloat, PadReflectionFw) } }; +TEST_P(PadReflectionTestHalf, PadReflectionFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(PadReflectionTestBF16, PadReflectionFw) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, PadReflectionTestFloat, testing::ValuesIn(PadReflectionTestFloatConfigs())); + +INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, + PadReflectionTestHalf, + testing::ValuesIn(PadReflectionTestFloatConfigs())); + +INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, + PadReflectionTestBF16, + testing::ValuesIn(PadReflectionTestFloatConfigs())); diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp index b755639f08..5f3ce15478 100644 --- a/test/gtest/pad_reflection.hpp +++ b/test/gtest/pad_reflection.hpp @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2023 Advanced Micro Devices, Inc. + * Copyright (c) 2024 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -41,11 +41,11 @@ struct PadReflectionCase size_t D; size_t H; size_t W; - int padding[4]; + size_t padding[4]; friend std::ostream& operator<<(std::ostream& os, const PadReflectionCase& tc) { return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H - << " W:" << tc.W; + << " W:" << tc.W << " Padding:" << tc.padding[0] << " " << tc.padding[1] << " " << tc.padding[2] << " " << tc.padding[3]; } std::vector GetInput() @@ -82,7 +82,7 @@ struct PadReflectionCase std::vector paddingVector; for(int i = 0; i < 4; ++i) { - paddingVector.push_back(static_cast(padding[i])); + paddingVector.push_back(padding[i]); } return paddingVector; } @@ -92,11 +92,11 @@ std::vector PadReflectionTestFloatConfigs() { // n c d h w padding // clang-format off return { - { 1, 1, 0, 3, 3, {2, 2, 2, 2}}, - { 48, 8, 0, 512, 512, {1, 1, 1, 1}}, + { 1, 1, 0, 3, 3, {2}}, + { 48, 8, 0, 512, 512, {1}}, { 48, 8, 0, 512, 512, {1, 1, 3, 3}}, { 48, 8, 0, 512, 512, {0, 0, 2, 2}}, - { 16, 311, 0, 98, 512, {1, 1, 1, 1}}, + { 16, 311, 0, 98, 512, {1}}, { 16, 311, 0, 98, 512, {1, 1, 3, 3}}, { 16, 311, 0, 98, 512, {0, 0, 2, 2}}, }; @@ -152,7 +152,7 @@ struct PadReflectionTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::PadReflection( - handle, input.desc, input_dev.get(), output.desc, output_dev.get(), padding); + handle, input.desc, input_dev.get(), output.desc, output_dev.get(), padding.data(), padding.size()); EXPECT_EQ(status, miopenStatusSuccess); From cd548ef582348ca90e53aaed40954ae0889efb72 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 16 Apr 2024 08:40:59 +0000 Subject: [PATCH 10/29] githook clean up --- driver/pad_reflection_driver.hpp | 12 +++++++++--- include/miopen/miopen.h | 12 ++++++------ src/include/miopen/pad_reflection.hpp | 2 +- 
.../miopen/pad_reflection/invoke_params.hpp | 4 ++-- .../pad_reflection/problem_description.hpp | 8 ++++---- src/pad_reflection.cpp | 16 ++++++++-------- src/pad_reflection_api.cpp | 2 +- test/gtest/pad_reflection.hpp | 12 +++++++++--- 8 files changed, 40 insertions(+), 28 deletions(-) diff --git a/driver/pad_reflection_driver.hpp b/driver/pad_reflection_driver.hpp index f964b47e6f..f53a1ffe7d 100644 --- a/driver/pad_reflection_driver.hpp +++ b/driver/pad_reflection_driver.hpp @@ -187,7 +187,8 @@ int PadReflectionDriver::GetandSetData() padding.push_back(std::stoul(token)); } - if (!(padding.size() == 1 or padding.size() == 4)) { + if(!(padding.size() == 1 or padding.size() == 4)) + { std::cerr << "Error Padding Lengths\n" << std::endl; } @@ -319,8 +320,13 @@ int PadReflectionDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenPadReflection( - GetHandle(), inputDesc, in_dev->GetMem(), outputDesc, out_dev->GetMem(), padding.data(), padding.size()); + miopenPadReflection(GetHandle(), + inputDesc, + in_dev->GetMem(), + outputDesc, + out_dev->GetMem(), + padding.data(), + padding.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a5c92ad65f..4626464794 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -5814,12 +5814,12 @@ MIOPEN_EXPORT miopenStatus_t miopenSumForward(miopenHandle_t handle, * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t yDesc, - void* y, - const size_t * padding, - const size_t num_padding); + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const size_t* padding, + const size_t num_padding); /** @} */ // CLOSEOUT PAD REFLECTION DOXYGEN GROUP diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp index 19d39b040b..ce65b16ec3 100644 --- a/src/include/miopen/pad_reflection.hpp +++ b/src/include/miopen/pad_reflection.hpp @@ -38,7 +38,7 @@ miopenStatus_t PadReflection(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const size_t * padding, + const size_t* padding, const size_t num_padding); } // namespace miopen #endif // MIOPEN_PAD_REFLECTION_HPP_ diff --git a/src/include/miopen/pad_reflection/invoke_params.hpp b/src/include/miopen/pad_reflection/invoke_params.hpp index 2ff8bad10c..b64438d83a 100644 --- a/src/include/miopen/pad_reflection/invoke_params.hpp +++ b/src/include/miopen/pad_reflection/invoke_params.hpp @@ -42,8 +42,8 @@ struct InvokeParams : public miopen::InvokeParams ConstData_t x = nullptr; Data_t y = nullptr; - const size_t * padding = nullptr; - size_t num_padding = 0; + const size_t* padding = nullptr; + size_t num_padding = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/pad_reflection/problem_description.hpp b/src/include/miopen/pad_reflection/problem_description.hpp index 65c258970d..3172512757 100644 --- a/src/include/miopen/pad_reflection/problem_description.hpp +++ b/src/include/miopen/pad_reflection/problem_description.hpp @@ -42,7 +42,7 @@ struct ProblemDescription : ProblemDescriptionBase ProblemDescription(const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, const size_t num_padding_) - : xDesc(xDesc_), yDesc(yDesc_), num_padding(num_padding_) + : xDesc(xDesc_), 
yDesc(yDesc_), num_padding(num_padding_) { } const TensorDescriptor& GetXDesc() const { return xDesc; } @@ -75,19 +75,19 @@ struct ProblemDescription : ProblemDescriptionBase return true; } - bool IsRightNumPadding() const { if(!(num_padding == 1 || num_padding == 4)) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, "Pad Reflection: Padding input needs to have 1 or 4 elements only."); + MIOPEN_THROW(miopenStatusBadParm, + "Pad Reflection: Padding input needs to have 1 or 4 elements only."); #else return false; #endif } return true; - } + } NetworkConfig MakeNetworkConfig() const override; diff --git a/src/pad_reflection.cpp b/src/pad_reflection.cpp index 5ddc10c3df..4605984f90 100644 --- a/src/pad_reflection.cpp +++ b/src/pad_reflection.cpp @@ -40,19 +40,19 @@ miopenStatus_t PadReflection(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const size_t * padding, + const size_t* padding, const size_t num_padding) { const auto problem = pad_reflection::ProblemDescription{xDesc, yDesc, num_padding}; const auto invoke_params = [&]() { - auto tmp = pad_reflection::InvokeParams{}; - tmp.type = InvokeType::Run; - tmp.xDesc = &xDesc; - tmp.yDesc = &yDesc; - tmp.x = x; - tmp.y = y; - tmp.padding = padding; + auto tmp = pad_reflection::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.xDesc = &xDesc; + tmp.yDesc = &yDesc; + tmp.x = x; + tmp.y = y; + tmp.padding = padding; tmp.num_padding = num_padding; return tmp; }(); diff --git a/src/pad_reflection_api.cpp b/src/pad_reflection_api.cpp index e8854b2aac..bbe94d3ec6 100644 --- a/src/pad_reflection_api.cpp +++ b/src/pad_reflection_api.cpp @@ -35,7 +35,7 @@ extern "C" miopenStatus_t miopenPadReflection(miopenHandle_t handle, const void* x, const miopenTensorDescriptor_t yDesc, void* y, - const size_t * padding, + const size_t* padding, const size_t num_padding) { MIOPEN_LOG_FUNCTION(handle, xDesc, x, yDesc, y, padding); diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp index 5f3ce15478..c164885baa 100644 --- a/test/gtest/pad_reflection.hpp +++ b/test/gtest/pad_reflection.hpp @@ -45,7 +45,8 @@ struct PadReflectionCase friend std::ostream& operator<<(std::ostream& os, const PadReflectionCase& tc) { return os << " N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H - << " W:" << tc.W << " Padding:" << tc.padding[0] << " " << tc.padding[1] << " " << tc.padding[2] << " " << tc.padding[3]; + << " W:" << tc.W << " Padding:" << tc.padding[0] << " " << tc.padding[1] << " " + << tc.padding[2] << " " << tc.padding[3]; } std::vector GetInput() @@ -151,8 +152,13 @@ struct PadReflectionTest : public ::testing::TestWithParam cpu_pad_reflection(input, ref_output, padding); miopenStatus_t status; - status = miopen::PadReflection( - handle, input.desc, input_dev.get(), output.desc, output_dev.get(), padding.data(), padding.size()); + status = miopen::PadReflection(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + padding.data(), + padding.size()); EXPECT_EQ(status, miopenStatusSuccess); From f21e7f57777fa48481c2d2de7e7fdfe5b29ffab1 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 16 Apr 2024 09:26:23 +0000 Subject: [PATCH 11/29] remove maybe_unused for problem in PadReflection::IsApplicable and PadReflection::GetSolution --- src/solver/pad_reflection/pad_reflection.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index 
148cde5f3b..b0b9fc16b2 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -42,7 +42,7 @@ namespace pad_reflection { bool PadReflection::IsApplicable( [[maybe_unused]] const ExecutionContext& context, - [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const + const miopen::pad_reflection::ProblemDescription& problem) const { if(!problem.IsSameType()) return false; @@ -55,7 +55,7 @@ bool PadReflection::IsApplicable( ConvSolution PadReflection::GetSolution( [[maybe_unused]] const ExecutionContext& context, - [[maybe_unused]] const miopen::pad_reflection::ProblemDescription& problem) const + const miopen::pad_reflection::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; From 825111e197f5f5e8ad84d35f01a6649188d56b0f Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 16 Apr 2024 09:26:39 +0000 Subject: [PATCH 12/29] githook cleanup --- src/solver/pad_reflection/pad_reflection.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index b0b9fc16b2..b2b03a2d91 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -40,9 +40,8 @@ namespace solver { namespace pad_reflection { -bool PadReflection::IsApplicable( - [[maybe_unused]] const ExecutionContext& context, - const miopen::pad_reflection::ProblemDescription& problem) const +bool PadReflection::IsApplicable([[maybe_unused]] const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const { if(!problem.IsSameType()) return false; @@ -53,9 +52,9 @@ bool PadReflection::IsApplicable( return true; } -ConvSolution PadReflection::GetSolution( - [[maybe_unused]] const ExecutionContext& context, - const miopen::pad_reflection::ProblemDescription& problem) const +ConvSolution +PadReflection::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::pad_reflection::ProblemDescription& problem) const { auto result = ConvSolution{miopenStatusSuccess}; From 6f4e1a0352dfe9444be3f7b7dd23b925ae5e2491 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 16 Apr 2024 15:09:12 +0000 Subject: [PATCH 13/29] update miopen.h padreflection description --- include/miopen/miopen.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 4626464794..c4d4c082c7 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -5803,14 +5803,12 @@ MIOPEN_EXPORT miopenStatus_t miopenSumForward(miopenHandle_t handle, /*! @brief Add padding by the reflection of the tensor * * @param handle MIOpen handle (input) - * @param nanPropagation Nan number propagation mode (input) - * @param workspace Address of the allocated workspace data (input) - * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) * @param xDesc Tensor descriptor for data input tensor x (input) * @param x Data tensor x (input) - * @param dim Dimensions to sum. 
(input) * @param yDesc Tensor descriptor for output data tensor y (input) * @param y Data tensor y (output) + * @param padding Padding array contain 1 or 4 elements (input) + * @param num_padding Number of elements in padding, equals 1 or 4 (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle, From c4aedd15ea3978ba8fd06c89d76486b1a8d886f7 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 16 Apr 2024 15:15:05 +0000 Subject: [PATCH 14/29] update "CLOSEOUT padreflection DOXYGEN GROUP" --- include/miopen/miopen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index c4d4c082c7..18c60f17b6 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -5820,7 +5820,7 @@ MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle, const size_t num_padding); /** @} */ -// CLOSEOUT PAD REFLECTION DOXYGEN GROUP +// CLOSEOUT padreflection DOXYGEN GROUP #endif #ifdef MIOPEN_BETA_API From a805d4258317c4103f2bd8ead6083cc1cd7f5e6e Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Apr 2024 03:12:41 +0000 Subject: [PATCH 15/29] Chang padding_l -> padding_left, padding_t -> padding_top for padreflection kernel --- src/kernels/MIOpenPadReflection.cpp | 36 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp index fa70b00add..3a4922399c 100644 --- a/src/kernels/MIOpenPadReflection.cpp +++ b/src/kernels/MIOpenPadReflection.cpp @@ -35,8 +35,8 @@ template __device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, TO* __restrict__ output, uint64_t output_size, - long padding_l, - long padding_t, + long padding_left, + long padding_top, const size_t in_H, const size_t in_W, const size_t output_size_1, @@ -59,34 +59,34 @@ __device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, n = nc / output_size_1; c = nc % output_size_1; - long in_start_x = max(0L, -padding_l); - long in_start_y = max(0L, -padding_t); - long out_start_x = max(0L, padding_l); - long out_start_y = max(0L, padding_t); + long in_start_x = max(0L, -padding_left); + long in_start_y = max(0L, -padding_top); + long out_start_x = max(0L, padding_left); + long out_start_y = max(0L, padding_top); - if(w < padding_l) + if(w < padding_left) { - w = padding_l * 2 - w; + w = padding_left * 2 - w; } - else if(padding_l <= w && w < in_W + padding_l) + else if(padding_left <= w && w < in_W + padding_left) { } else { - w = (in_W + padding_l - 1) * 2 - w; + w = (in_W + padding_left - 1) * 2 - w; } w = w - out_start_x + in_start_x; - if(h < padding_t) + if(h < padding_top) { - h = padding_t * 2 - h; + h = padding_top * 2 - h; } - else if(padding_t <= h && h < in_H + padding_t) + else if(padding_top <= h && h < in_H + padding_top) { } else { - h = (in_H + padding_t - 1) * 2 - h; + h = (in_H + padding_top - 1) * 2 - h; } h = h - out_start_y + in_start_y; @@ -97,8 +97,8 @@ __device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, extern "C" __global__ void PadReflection2dFwdContiguous(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, uint64_t output_size, - long padding_l, - long padding_t, + long padding_left, + long padding_top, const size_t in_H, const size_t in_W, const size_t output_size_1, @@ -112,8 +112,8 @@ extern "C" __global__ void PadReflection2dFwdContiguous(const INPUT_TYPE* __rest padReflection2dFwdContiguous(input, output, 
output_size, - padding_l, - padding_t, + padding_left, + padding_top, in_H, in_W, output_size_1, From c916b1d4d45978492def1503f2539eb7c1caec4c Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Apr 2024 03:22:15 +0000 Subject: [PATCH 16/29] remove unused condition line in padreflection kernel --- src/kernels/MIOpenPadReflection.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp index 3a4922399c..e269f168ff 100644 --- a/src/kernels/MIOpenPadReflection.cpp +++ b/src/kernels/MIOpenPadReflection.cpp @@ -68,10 +68,7 @@ __device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, { w = padding_left * 2 - w; } - else if(padding_left <= w && w < in_W + padding_left) - { - } - else + else if(!(padding_left <= w && w < in_W + padding_left)) { w = (in_W + padding_left - 1) * 2 - w; } @@ -81,10 +78,7 @@ __device__ void padReflection2dFwdContiguous(const TI* __restrict__ input, { h = padding_top * 2 - h; } - else if(padding_top <= h && h < in_H + padding_top) - { - } - else + else if(!(padding_top <= h && h < in_H + padding_top)) { h = (in_H + padding_top - 1) * 2 - h; } From a008c17039ef89a729d072c0f174df1312b932f9 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Apr 2024 03:22:38 +0000 Subject: [PATCH 17/29] remove unused line --- src/solver/pad_reflection/pad_reflection.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index b2b03a2d91..d0aebbd521 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -120,8 +120,6 @@ PadReflection::GetSolution([[maybe_unused]] const ExecutionContext& context, padding_l = padding[0]; padding_t = padding[2]; } - // long padding_l = (*padding)[0]; - // long padding_t = (*padding)[2]; size_t in_H = xdims[2]; size_t in_W = xdims[3]; size_t output_size_1 = ydims[1]; From c0eacdf799b045395f0fe0816d33c07a5aef635c Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Apr 2024 03:24:29 +0000 Subject: [PATCH 18/29] indent test_case --- test/gtest/pad_reflection.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp index c164885baa..482bc89a71 100644 --- a/test/gtest/pad_reflection.hpp +++ b/test/gtest/pad_reflection.hpp @@ -93,13 +93,13 @@ std::vector PadReflectionTestFloatConfigs() { // n c d h w padding // clang-format off return { - { 1, 1, 0, 3, 3, {2}}, - { 48, 8, 0, 512, 512, {1}}, - { 48, 8, 0, 512, 512, {1, 1, 3, 3}}, - { 48, 8, 0, 512, 512, {0, 0, 2, 2}}, - { 16, 311, 0, 98, 512, {1}}, - { 16, 311, 0, 98, 512, {1, 1, 3, 3}}, - { 16, 311, 0, 98, 512, {0, 0, 2, 2}}, + { 1, 1, 0, 3, 3, {2}}, + { 48, 8, 0, 512, 512, {1}}, + { 48, 8, 0, 512, 512, {1, 1, 3, 3}}, + { 48, 8, 0, 512, 512, {0, 0, 2, 2}}, + { 16, 311, 0, 98, 512, {1}}, + { 16, 311, 0, 98, 512, {1, 1, 3, 3}}, + { 16, 311, 0, 98, 512, {0, 0, 2, 2}}, }; // clang-format on } From e41d851c6c8ae49ece2299504d17ca18d81ba75a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 19 Apr 2024 02:52:32 +0000 Subject: [PATCH 19/29] Update padreflection group -> pad group for doxygen --- include/miopen/miopen.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 18c60f17b6..c4bb5f01e8 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -68,7 +68,7 @@ * @defgroup argmax * 
@defgroup groupnorm * @defgroup cat - * @defgroup padreflection + * @defgroup pad * */ @@ -5796,7 +5796,7 @@ MIOPEN_EXPORT miopenStatus_t miopenSumForward(miopenHandle_t handle, #ifdef MIOPEN_BETA_API // Padreflection APIs -/** @addtogroup padreflection +/** @addtogroup pad * * @{ */ @@ -5820,7 +5820,7 @@ MIOPEN_EXPORT miopenStatus_t miopenPadReflection(miopenHandle_t handle, const size_t num_padding); /** @} */ -// CLOSEOUT padreflection DOXYGEN GROUP +// CLOSEOUT pad DOXYGEN GROUP #endif #ifdef MIOPEN_BETA_API From b7d79aca073ed9fd60d239caaeb870d98fc9546c Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 19 Apr 2024 06:34:22 +0000 Subject: [PATCH 20/29] update docs index padreflection group -> pad group --- docs/reference/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 60883f9a1b..58828bdcce 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -31,5 +31,5 @@ The MIOpen API library is structured as follows: * :doc:`Sum <../doxygen/html/group__sum>` (experimental) * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) - * :doc:`PadReflection <../doxygen/html/group__padreflection>` (experimental) + * :doc:`Pad <../doxygen/html/group__pad>` (experimental) * :doc:`Argmax<./argmax>` (experimental) From 2d32b06bad4cbb74d141d589758a24f029d4e0e7 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 07:44:20 +0000 Subject: [PATCH 21/29] githook format --- driver/driver.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 677a4b0dc4..fbf4cd4e86 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -206,8 +206,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "padreflection" && - arg != "padreflectionfp16" && arg != "padreflectionbfp16" && arg != "--version") + arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && + arg != "padreflection" && arg != "padreflectionfp16" && arg != "padreflectionbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); From b427a78b9d74abc0f51a679214a5149aef2b223b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 09:58:05 +0000 Subject: [PATCH 22/29] update driver and gtest format --- driver/pad_reflection_driver.hpp | 48 ++------------------------------ test/gtest/pad_reflection.cpp | 14 ++++++---- 2 files changed, 12 insertions(+), 50 deletions(-) diff --git a/driver/pad_reflection_driver.hpp b/driver/pad_reflection_driver.hpp index f53a1ffe7d..881cb45c46 100644 --- a/driver/pad_reflection_driver.hpp +++ b/driver/pad_reflection_driver.hpp @@ -31,8 +31,6 @@ #include "tensor_driver.hpp" #include "timer.hpp" #include "random.hpp" -#include -#include #include #include #include @@ -128,7 +126,6 @@ class PadReflectionDriver : public Driver InputFlags& GetInputFlags() override { return inflags; } int GetandSetData() override; - std::vector GetInputTensorLengthsFromCmdLine(); int AllocateBuffersAndCopy() override; @@ -192,7 +189,7 @@ int PadReflectionDriver::GetandSetData() std::cerr << "Error Padding Lengths\n" << std::endl; } - std::vector 
in_len = GetInputTensorLengthsFromCmdLine(); + std::vector in_len = inflags.GetValueTensor("DimLengths").lengths; SetTensorNd(inputDesc, in_len, data_type); @@ -227,11 +224,8 @@ template int PadReflectionDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward Pad Reflection (Default=1)", "int"); - inflags.AddInputFlag("batchsize", 'n', "256", "Mini-batch size (Default=100)", "int"); - inflags.AddInputFlag("in_channels", 'c', "4", "Number of Input Channels (Default=3)", "int"); - inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); - inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); - inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); + inflags.AddTensorFlag( + "DimLengths", 'D', "256x4x32x32", "The dimensional lengths of the input tensor"); inflags.AddInputFlag("padding", 'P', "1,1,1,1", "Padding array (Default=1 or 1,1,1,1)", "str"); @@ -244,42 +238,6 @@ int PadReflectionDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -std::vector PadReflectionDriver::GetInputTensorLengthsFromCmdLine() -{ - int in_n = inflags.GetValueInt("batchsize"); - int in_c = inflags.GetValueInt("in_channels"); - int in_w = inflags.GetValueInt("in_w"); - int in_h = inflags.GetValueInt("in_h"); - int in_d = inflags.GetValueInt("in_d"); - - if((in_n != 0) && (in_c != 0) && (in_d != 0) && (in_h != 0) && (in_w != 0)) - { - return std::vector({in_n, in_c, in_d, in_h, in_w}); - } - else if((in_n != 0) && (in_c != 0) && (in_h != 0) && (in_w != 0)) - { - return std::vector({in_n, in_c, in_h, in_w}); - } - else if((in_n != 0) && (in_c != 0) && (in_w != 0)) - { - return std::vector({in_n, in_c, in_w}); - } - else if((in_n != 0) && (in_w != 0)) - { - return std::vector({in_n, in_w}); - } - else if(in_n != 0) - { - return std::vector({in_n}); - } - else - { - std::cerr << "Error Input Tensor Lengths\n" << std::endl; - return std::vector({0}); - } -} - template int PadReflectionDriver::AllocateBuffersAndCopy() { diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp index 93f93aecd4..38c57464a8 100644 --- a/test/gtest/pad_reflection.cpp +++ b/test/gtest/pad_reflection.cpp @@ -34,7 +34,7 @@ namespace pad_reflection { std::string GetFloatArg() { - const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); if(tmp.empty()) { return ""; @@ -46,7 +46,7 @@ struct PadReflectionTestFloat : PadReflectionTest { }; -struct PadReflectionTestHalf : PadReflectionTest +struct PadReflectionTestHalf : PadReflectionTest { }; @@ -55,11 +55,13 @@ struct PadReflectionTestBF16 : PadReflectionTest }; } // namespace pad_reflection + using namespace pad_reflection; TEST_P(PadReflectionTestFloat, PadReflectionFw) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -72,7 +74,8 @@ TEST_P(PadReflectionTestFloat, PadReflectionFw) TEST_P(PadReflectionTestHalf, PadReflectionFw) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--half")) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -85,7 +88,8 @@ TEST_P(PadReflectionTestHalf, PadReflectionFw) TEST_P(PadReflectionTestBF16, PadReflectionFw) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--bfloat16")) + if(!MIOPEN_TEST_ALL || + 
(env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); From 3314ea26a2c0414b781f6cd9af7550f43d86240b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 21:29:45 +0000 Subject: [PATCH 23/29] add pad reflection to solver --- src/include/miopen/solver_id.hpp | 3 ++- src/kernels/MIOpenPadReflection.cpp | 2 +- src/solver.cpp | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 81c15f6bea..8721a67a14 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -59,7 +59,8 @@ enum class Primitive Mha, Softmax, Adam, - Item + Item, + PadReflection }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp index e269f168ff..bfb4f2b00d 100644 --- a/src/kernels/MIOpenPadReflection.cpp +++ b/src/kernels/MIOpenPadReflection.cpp @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -#include + #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include diff --git a/src/solver.cpp b/src/solver.cpp index 6b451ca498..25ae51b562 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -673,6 +674,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); + Register(registry, ++id, Primitive::PadReflection, pad_reflection::PadReflection{}.SolverDbId()); + // IMPORTANT: New solvers should be added to the end of the function! } From 2a4e52ebe778496c5c8c5d632aeeb10009b81740 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 21:30:00 +0000 Subject: [PATCH 24/29] githook format --- src/kernels/MIOpenPadReflection.cpp | 2 +- src/solver.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/kernels/MIOpenPadReflection.cpp b/src/kernels/MIOpenPadReflection.cpp index bfb4f2b00d..9e2379ddfc 100644 --- a/src/kernels/MIOpenPadReflection.cpp +++ b/src/kernels/MIOpenPadReflection.cpp @@ -23,7 +23,7 @@ * SOFTWARE. * *******************************************************************************/ - + #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include diff --git a/src/solver.cpp b/src/solver.cpp index 25ae51b562..47d1ab33bd 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -674,7 +674,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); - Register(registry, ++id, Primitive::PadReflection, pad_reflection::PadReflection{}.SolverDbId()); + Register( + registry, ++id, Primitive::PadReflection, pad_reflection::PadReflection{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! 
} From ea86f83cd73538c266839cd2fde0e723e98a0297 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 7 Aug 2024 03:39:21 +0000 Subject: [PATCH 25/29] fix uninitialize variable --- src/solver/pad_reflection/pad_reflection.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index d0aebbd521..08b33e90e7 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -109,13 +109,9 @@ PadReflection::GetSolution([[maybe_unused]] const ExecutionContext& context, auto padding = params.padding; auto num_padding = params.num_padding; - long padding_l, padding_t; - if(num_padding == 1) - { - padding_l = padding[0]; - padding_t = padding[0]; - } - else if(num_padding == 4) + long padding_l = padding[0]; + long padding_t = padding[0]; + if(num_padding == 4) { padding_l = padding[0]; padding_t = padding[2]; From 871ad5709d4c7ff0939a4ef32c066feba0e38042 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 7 Aug 2024 03:40:00 +0000 Subject: [PATCH 26/29] githook format --- src/solver/pad_reflection/pad_reflection.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver/pad_reflection/pad_reflection.cpp b/src/solver/pad_reflection/pad_reflection.cpp index 08b33e90e7..b5e85b15bb 100644 --- a/src/solver/pad_reflection/pad_reflection.cpp +++ b/src/solver/pad_reflection/pad_reflection.cpp @@ -109,8 +109,8 @@ PadReflection::GetSolution([[maybe_unused]] const ExecutionContext& context, auto padding = params.padding; auto num_padding = params.num_padding; - long padding_l = padding[0]; - long padding_t = padding[0]; + long padding_l = padding[0]; + long padding_t = padding[0]; if(num_padding == 4) { padding_l = padding[0]; From afb2464d5af72a3cd3d7f492acf5e2a495972df4 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 7 Aug 2024 10:38:27 +0000 Subject: [PATCH 27/29] fix tidy --- src/include/miopen/pad_reflection.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/miopen/pad_reflection.hpp b/src/include/miopen/pad_reflection.hpp index ce65b16ec3..017d020340 100644 --- a/src/include/miopen/pad_reflection.hpp +++ b/src/include/miopen/pad_reflection.hpp @@ -39,6 +39,6 @@ miopenStatus_t PadReflection(Handle& handle, const TensorDescriptor& yDesc, Data_t y, const size_t* padding, - const size_t num_padding); + size_t num_padding); } // namespace miopen #endif // MIOPEN_PAD_REFLECTION_HPP_ From 07c7fe9a098ceb983509690cd68a12cd21728e5b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 9 Aug 2024 04:21:16 +0000 Subject: [PATCH 28/29] update gtest --- test/gtest/pad_reflection.cpp | 40 ++++++++++++++++++++++------------- test/gtest/pad_reflection.hpp | 30 +++++++++++++++++++------- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp index 38c57464a8..0f5b97840c 100644 --- a/test/gtest/pad_reflection.cpp +++ b/test/gtest/pad_reflection.cpp @@ -42,23 +42,25 @@ std::string GetFloatArg() return tmp; } -struct PadReflectionTestFloat : PadReflectionTest +struct GPU_PadReflection_FP32 : PadReflectionTest { }; -struct PadReflectionTestHalf : PadReflectionTest +struct GPU_PadReflection_FP16 : PadReflectionTest { }; -struct PadReflectionTestBF16 : PadReflectionTest +struct GPU_PadReflection_BFP16 : PadReflectionTest { }; } // namespace pad_reflection -using namespace pad_reflection; +using pad_reflection::GPU_PadReflection_FP32; +using 
pad_reflection::GPU_PadReflection_FP16; +using pad_reflection::GPU_PadReflection_BFP16; -TEST_P(PadReflectionTestFloat, PadReflectionFw) +TEST_P(GPU_PadReflection_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -72,7 +74,7 @@ TEST_P(PadReflectionTestFloat, PadReflectionFw) } }; -TEST_P(PadReflectionTestHalf, PadReflectionFw) +TEST_P(GPU_PadReflection_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -86,7 +88,7 @@ TEST_P(PadReflectionTestHalf, PadReflectionFw) } }; -TEST_P(PadReflectionTestBF16, PadReflectionFw) +TEST_P(GPU_PadReflection_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -100,14 +102,22 @@ TEST_P(PadReflectionTestBF16, PadReflectionFw) } }; -INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, - PadReflectionTestFloat, - testing::ValuesIn(PadReflectionTestFloatConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_PadReflection_FP32, + testing::ValuesIn(PadReflectionSmokeTestFloatConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_PadReflection_FP16, + testing::ValuesIn(PadReflectionSmokeTestFloatConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_PadReflection_BFP16, + testing::ValuesIn(PadReflectionSmokeTestFloatConfigs())); -INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, - PadReflectionTestHalf, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_PadReflection_FP32, testing::ValuesIn(PadReflectionTestFloatConfigs())); - -INSTANTIATE_TEST_SUITE_P(PadReflectionTestSet, - PadReflectionTestBF16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_PadReflection_FP16, + testing::ValuesIn(PadReflectionTestFloatConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, + GPU_PadReflection_BFP16, testing::ValuesIn(PadReflectionTestFloatConfigs())); diff --git a/test/gtest/pad_reflection.hpp b/test/gtest/pad_reflection.hpp index 482bc89a71..e1abac149d 100644 --- a/test/gtest/pad_reflection.hpp +++ b/test/gtest/pad_reflection.hpp @@ -93,17 +93,25 @@ std::vector PadReflectionTestFloatConfigs() { // n c d h w padding // clang-format off return { - { 1, 1, 0, 3, 3, {2}}, - { 48, 8, 0, 512, 512, {1}}, + { 48, 8, 0, 512, 512, {1, 1, 1, 1}}, { 48, 8, 0, 512, 512, {1, 1, 3, 3}}, { 48, 8, 0, 512, 512, {0, 0, 2, 2}}, - { 16, 311, 0, 98, 512, {1}}, + { 16, 311, 0, 98, 512, {1, 1, 1, 1}}, { 16, 311, 0, 98, 512, {1, 1, 3, 3}}, { 16, 311, 0, 98, 512, {0, 0, 2, 2}}, }; // clang-format on } +std::vector PadReflectionSmokeTestFloatConfigs() +{ // n c d h w padding + // clang-format off + return { + { 1, 1, 0, 3, 3, {2}}, + }; + // clang-format on +} + template struct PadReflectionTest : public ::testing::TestWithParam { @@ -160,17 +168,23 @@ struct PadReflectionTest : public ::testing::TestWithParam padding.data(), padding.size()); - EXPECT_EQ(status, miopenStatusSuccess); + ASSERT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); } void Verify() { - for(int i = 0; i < output.data.size() - 1; ++i) - { - EXPECT_EQ(output.data[i], ref_output.data[i]); - } + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+
+        auto error = miopen::rms_range(ref_output, output);
+        ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output));
+        EXPECT_LT(error, tolerance);
     }
 
     PadReflectionCase pad_reflection_config;

From 186974c5a34ecd90ad181dcfde46f9fcf70232c9 Mon Sep 17 00:00:00 2001
From: Duong Le
Date: Fri, 9 Aug 2024 04:23:03 +0000
Subject: [PATCH 29/29] githook format

---
 test/gtest/pad_reflection.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/gtest/pad_reflection.cpp b/test/gtest/pad_reflection.cpp
index 0f5b97840c..ad4563152c 100644
--- a/test/gtest/pad_reflection.cpp
+++ b/test/gtest/pad_reflection.cpp
@@ -56,9 +56,9 @@ struct GPU_PadReflection_BFP16 : PadReflectionTest<bfloat16>
 
 } // namespace pad_reflection
 
-using pad_reflection::GPU_PadReflection_FP32;
-using pad_reflection::GPU_PadReflection_FP16;
 using pad_reflection::GPU_PadReflection_BFP16;
+using pad_reflection::GPU_PadReflection_FP16;
+using pad_reflection::GPU_PadReflection_FP32;
 
 TEST_P(GPU_PadReflection_FP32, Test)
 {
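
A usage sketch of the miopenPadReflection entry point added by this series. Only the function signature and the "1 or 4 padding elements" rule come from the patches themselves; the descriptor setup, the NCHW shapes, the {left, right, top, bottom} ordering, and the output-shape rule (H grows by top+bottom, W by left+right) are assumptions inferred from the solver's use of padding[0] as left and padding[2] as top and from the test configurations.

// Hypothetical host-side caller; only miopenPadReflection's signature is taken
// from the patch series, everything else here is an illustrative assumption.
#include <miopen/miopen.h>
#include <cstddef>
#include <vector>

int RunPadReflectionExample(miopenHandle_t handle, void* x_dev, void* y_dev)
{
    // Input shape N=1, C=1, H=3, W=3 (the first smoke-test configuration).
    const int N = 1, C = 1, H = 3, W = 3;
    // Reflection padding, assumed {left, right, top, bottom}; a single element
    // would apply the same amount on every side (the num_padding == 1 path).
    std::vector<std::size_t> padding{2, 2, 2, 2};
    // Assumed output shape: H + top + bottom, W + left + right.
    const int outH = H + 2 + 2, outW = W + 2 + 2;

    miopenTensorDescriptor_t xDesc, yDesc;
    miopenCreateTensorDescriptor(&xDesc);
    miopenCreateTensorDescriptor(&yDesc);
    miopenSet4dTensorDescriptor(xDesc, miopenFloat, N, C, H, W);
    miopenSet4dTensorDescriptor(yDesc, miopenFloat, N, C, outH, outW);

    // x_dev and y_dev are device buffers of matching sizes, allocated by the caller.
    const miopenStatus_t status =
        miopenPadReflection(handle, xDesc, x_dev, yDesc, y_dev, padding.data(), padding.size());

    miopenDestroyTensorDescriptor(xDesc);
    miopenDestroyTensorDescriptor(yDesc);
    return status == miopenStatusSuccess ? 0 : 1;
}

Passing a raw pointer plus an element count, instead of the std::vector used in the first revision of the API, keeps the extern "C" entry point callable from C, which appears to be the motivation for the signature change in the later patches of the series.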