From 113620a6a2c13a66ef6677a9dbfbf355f8aa8ce6 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 2 Jul 2024 11:02:38 +0000 Subject: [PATCH 01/46] UnfoldFwd4d driver, test and api --- driver/CMakeLists.txt | 1 + driver/dm_fold.cpp | 0 driver/dm_unfold.cpp | 39 ++ driver/driver.hpp | 5 +- driver/fold_driver.hpp | 0 driver/mloUnfoldHost.hpp | 114 +++++ driver/unfold_driver.hpp | 422 ++++++++++++++++++ include/miopen/miopen.h | 72 +++ src/CMakeLists.txt | 6 + src/fold.cpp | 82 ++++ src/fold/problem_description.cpp | 79 ++++ src/fold_api.cpp | 63 +++ src/include/miopen/fold.hpp | 50 +++ src/include/miopen/fold/invoke_params.hpp | 63 +++ .../miopen/fold/problem_description.hpp | 163 +++++++ src/include/miopen/fold/solvers.hpp | 75 ++++ src/include/miopen/solver_id.hpp | 4 +- src/include/miopen/tensor_view_utils.hpp | 80 ++++ src/kernels/MIOpenUnfold.cpp | 227 ++++++++++ src/kernels/tensor_view.hpp | 78 ++++ src/solver.cpp | 3 + src/solver/fold/fold_forward.cpp | 168 +++++++ src/solver/fold/unfold_forward.cpp | 178 ++++++++ test/cpu_fold.hpp | 104 +++++ test/gtest/fold.cpp | 97 ++++ test/gtest/fold.hpp | 218 +++++++++ 26 files changed, 2388 insertions(+), 3 deletions(-) create mode 100644 driver/dm_fold.cpp create mode 100644 driver/dm_unfold.cpp create mode 100644 driver/fold_driver.hpp create mode 100644 driver/mloUnfoldHost.hpp create mode 100644 driver/unfold_driver.hpp create mode 100644 src/fold.cpp create mode 100644 src/fold/problem_description.cpp create mode 100644 src/fold_api.cpp create mode 100644 src/include/miopen/fold.hpp create mode 100644 src/include/miopen/fold/invoke_params.hpp create mode 100644 src/include/miopen/fold/problem_description.hpp create mode 100644 src/include/miopen/fold/solvers.hpp create mode 100644 src/include/miopen/tensor_view_utils.hpp create mode 100644 src/kernels/MIOpenUnfold.cpp create mode 100644 src/kernels/tensor_view.hpp create mode 100644 src/solver/fold/fold_forward.cpp create mode 100644 src/solver/fold/unfold_forward.cpp create mode 100644 test/cpu_fold.hpp create mode 100644 test/gtest/fold.cpp create mode 100644 test/gtest/fold.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 224e550fed..8ca4ccd5c1 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -51,6 +51,7 @@ add_executable(MIOpenDriver dm_softmax.cpp dm_sum.cpp dm_tensorop.cpp + dm_unfold.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/dm_fold.cpp b/driver/dm_fold.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/driver/dm_unfold.cpp b/driver/dm_unfold.cpp new file mode 100644 index 0000000000..3d7ed56a91 --- /dev/null +++ b/driver/dm_unfold.cpp @@ -0,0 +1,39 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "registry_driver_maker.hpp"
+#include "unfold_driver.hpp"
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "unfold")
+        return new UnfoldDriver<float, float>();
+    if(base_arg == "unfoldfp16")
+        return new UnfoldDriver<float16, float>();
+    if(base_arg == "unfoldbfp16")
+        return new UnfoldDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index 4cfc2b544e..a7396d272f 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -151,7 +151,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "pool[fp16], lrn[fp16], "
         "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
         "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
-        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16]\n");
+        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -176,7 +176,8 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" &&
        arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
        arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
-       arg != "catfp16" && arg != "catbfp16" && arg != "--version")
+       arg != "catfp16" && arg != "catbfp16" && arg != "unfold" && arg != "unfoldfp16" &&
+       arg != "unfoldbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp
new file mode 100644
index 0000000000..e69de29bb2
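The host reference introduced below (mloUnfoldHost.hpp) implements the im2col-style unfold: each output column gathers one sliding-window patch of the input. For spatial extent S_i, kernel K_i, padding p_i, dilation d_i, and stride s_i, the window count per dimension is L_i = floor((S_i + 2*p_i - d_i*(K_i - 1) - 1) / s_i) + 1, and an (N, C, H, W) input maps to an (N, C * prod_i K_i, prod_i L_i) output. With the driver defaults (input 2x5x3x4, kernel 2x3, stride 1x1, padding 0x0, dilation 1x1): L_H = (3 - 1 - 1)/1 + 1 = 2 and L_W = (4 - 2 - 1)/1 + 1 = 2, so the output shape is 2 x 30 x 4.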
diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp
new file mode 100644
index 0000000000..465bfa7b4f
--- /dev/null
+++ b/driver/mloUnfoldHost.hpp
@@ -0,0 +1,114 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include <../test/ford.hpp>
+#include "tensor_view.hpp"
+#include "miopen/tensor_view_utils.hpp"
+#include <miopen/miopen.h>
+#include <cstdint>
+#include <vector>
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloUnFoldFwd4DRunHost(Tgpu* input,
+                              const miopenTensorDescriptor_t inputDesc,
+                              Tcheck* ref_output,
+                              const miopenTensorDescriptor_t ref_outputDesc,
+                              const std::vector<int32_t> kernel_size,
+                              const std::vector<int32_t> stride,
+                              const std::vector<int32_t> padding,
+                              const std::vector<int32_t> dilation)
+{
+    auto input_tv   = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc));
+    auto output_tv  = miopen::get_inner_expanded_tv<3>(miopen::deref(ref_outputDesc));
+    auto input_dims = miopen::deref(inputDesc).GetLengths();
+    auto input_size = miopen::deref(inputDesc).GetSize();
+
+    const int LOCAL_SIZE = 256;
+    int spatial_dim_size = input_size - 2;
+    const int32_t N      = static_cast<int32_t>(input_dims[0]);
+    const int32_t C      = static_cast<int32_t>(input_dims[1]);
+    int32_t P = 1, L = 1;
+    std::vector<int32_t> ls;
+    for(int i = 0; i < spatial_dim_size; ++i)
+    {
+        P *= kernel_size[i];
+        int32_t l = (static_cast<int32_t>(input_dims[i + 2]) + 2 * padding[i] -
+                     dilation[i] * (kernel_size[i] - 1) - 1) /
+                        stride[i] +
+                    1;
+        L *= l;
+        ls.push_back(l);
+    }
+    [[maybe_unused]] int32_t kernel_size_h = kernel_size[0];
+    int32_t kernel_size_w                  = kernel_size[1];
+    int32_t stride_h                       = stride[0];
+    int32_t stride_w                       = stride[1];
+    int32_t padding_h                      = padding[0];
+    int32_t padding_w                      = padding[1];
+    int32_t dilation_h                     = dilation[0];
+    int32_t dilation_w                     = dilation[1];
+    [[maybe_unused]] int32_t LH            = ls[0];
+    int32_t LW                             = ls[1];
+    int32_t H = static_cast<int32_t>(input_dims[2]);
+    int32_t W = static_cast<int32_t>(input_dims[3]);
+    int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE;
+    par_ford(work_size)([&](int gid) {
+        int ncp = gid / L, l = gid % L;
+        int nc = ncp / P, p = ncp % P;
+        int n = nc / C, c = nc % C;
+        if(n >= N)
+            return;
+
+        int lh = l / LW, lw = l % LW;                       // sliding window position
+        int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel
+        int h = lh * stride_h - padding_h + ph * dilation_h;
+        int w = lw * stride_w - padding_w + pw * dilation_w;
+
+        Tgpu x = static_cast<Tgpu>(0.0f);
+        if(0 <= h && h < H && 0 <= w && w < W)
+        {
+            long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h +
+                             input_tv.stride[1] * c + input_tv.stride[0] * n;
+            x = input[input_idx];
+        }
+
+        long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) +
+                          output_tv.stride[0] * n;
+        ref_output[output_idx] = static_cast<Tcheck>(x);
+    });
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tcheck>
+int32_t mloUnFoldBwd4DRunHost(Tgpu* input,
+                              const miopenTensorDescriptor_t inputDesc,
+                              Tcheck* ref_output,
+                              const miopenTensorDescriptor_t ref_outputDesc,
+                              const std::vector<int32_t> kernel_size,
+                              const std::vector<int32_t> stride,
+                              const std::vector<int32_t> padding,
+                              const std::vector<int32_t> dilation)
+{
+    return miopenStatusSuccess;
+}
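The output-shape computation above reappears in the driver's GetandSetData and in the solver invokers; a shared helper would keep them from drifting. A minimal standalone sketch of that computation (UnfoldOutputLengths is a hypothetical name, not part of the patch):

#include <cstdint>
#include <vector>

// Unfold output lengths {N, C * P, L} for an NCHW input, using the same
// window-count formula as mloUnFoldFwd4DRunHost above.
inline std::vector<int32_t> UnfoldOutputLengths(const std::vector<int32_t>& in, // {N, C, H, W}
                                                const std::vector<int32_t>& kernel,
                                                const std::vector<int32_t>& stride,
                                                const std::vector<int32_t>& padding,
                                                const std::vector<int32_t>& dilation)
{
    int32_t P = 1, L = 1;
    for(std::size_t i = 0; i < kernel.size(); ++i)
    {
        P *= kernel[i];
        // number of window positions along spatial dimension i
        L *= (in[i + 2] + 2 * padding[i] - dilation[i] * (kernel[i] - 1) - 1) / stride[i] + 1;
    }
    return {in[0], in[1] * P, L};
}

// e.g. UnfoldOutputLengths({2, 5, 3, 4}, {2, 3}, {1, 1}, {0, 0}, {1, 1}) yields {2, 30, 4}.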
diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp
new file mode 100644
index 0000000000..da835d4f3a
--- /dev/null
+++ b/driver/unfold_driver.hpp
@@ -0,0 +1,422 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_UNFOLD_DRIVER_HPP
+#define GUARD_MIOPEN_UNFOLD_DRIVER_HPP
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloUnfoldHost.hpp"
+#include "random.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "util_driver.hpp"
+
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+template <typename Tgpu, typename Tref>
+class UnfoldDriver : public Driver
+{
+public:
+    UnfoldDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&dinputDesc);
+        miopenCreateTensorDescriptor(&doutputDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+    std::vector<int> GetTensorLengthsFromCmdLine();
+    std::vector<int32_t> GetVectorInt32tFromCmdLine(std::string long_name);
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~UnfoldDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(dinputDesc);
+        miopenDestroyTensorDescriptor(doutputDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+
+    miopenTensorDescriptor_t doutputDesc;
+    miopenTensorDescriptor_t dinputDesc;
+
+    std::unique_ptr<GPUMem> input_dev;
+    std::unique_ptr<GPUMem> output_dev;
+
+    std::unique_ptr<GPUMem> doutput_dev;
+    std::unique_ptr<GPUMem> dinput_dev;
+
+    std::vector<Tgpu> input;
+    std::vector<Tgpu> output;
+
+    std::vector<Tgpu> doutput;
+    std::vector<Tgpu> dinput;
+
+    std::vector<Tref> output_host;
+
+    std::vector<Tref> doutput_host;
+
+    std::vector<int32_t> kernel_size;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+};
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::GetandSetData()
+{
+    std::vector<int> input_length = GetTensorLengthsFromCmdLine();
+
+    kernel_size = GetVectorInt32tFromCmdLine("kernelSize");
+    stride      = GetVectorInt32tFromCmdLine("stride");
+    padding     = GetVectorInt32tFromCmdLine("padding");
+    dilation    = GetVectorInt32tFromCmdLine("dilation");
+
+    int spatial_dim_size = input_length.size() - 2;
+
+    const int N = input_length[0];
+    const int C = input_length[1];
+
+    int P = 1, L = 1;
+    std::vector<int> ls;
+    for(int i = 0; i < spatial_dim_size; ++i)
+    {
+        P *= kernel_size[i];
+        int l = (input_length[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) /
+                    stride[i] +
+                1;
+        L *= l;
+        ls.push_back(l);
+    }
+
+    std::vector<int> output_length = {N, (C * P), L};
+    SetTensorNd(inputDesc, input_length, data_type);
+    SetTensorNd(outputDesc, output_length, data_type);
+
+    SetTensorNd(doutputDesc, output_length, data_type);
+    SetTensorNd(dinputDesc, input_length, data_type);
+
+    return miopenStatusSuccess;
+}
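+// With the flags registered in AddCmdLineArgs() below, the base args from
+// dm_unfold.cpp would be exercised in the usual MIOpenDriver fashion, e.g.
+// (the binary path and the second, non-default shape are illustrative only):
+//     ./bin/MIOpenDriver unfold -D 2,5,3,4 -k 2,3 -s 1,1 -p 0,0 -d 1,1 -V 1 -t 1
+//     ./bin/MIOpenDriver unfoldfp16 -D 16,32,16,16 -k 3,3 -V 1
+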
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag(
+        "forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int");
+    inflags.AddInputFlag("DimLengths",
+                         'D',
+                         "2,5,3,4",
+                         "The dimensional lengths of the input tensor",
+                         "string");
+    inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str");
+    inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str");
+    inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str");
+    inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int> UnfoldDriver<Tgpu, Tref>::GetTensorLengthsFromCmdLine()
+{
+    std::string lengthsStr = inflags.GetValueStr("DimLengths");
+
+    std::vector<int> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(len);
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(len);
+
+    return (lengths);
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int32_t> UnfoldDriver<Tgpu, Tref>::GetVectorInt32tFromCmdLine(std::string long_name)
+{
+    std::string lengthsStr = inflags.GetValueStr(long_name);
+
+    std::vector<int32_t> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(static_cast<int32_t>(len));
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(static_cast<int32_t>(len));
+
+    return (lengths);
+}
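+// GetTensorLengthsFromCmdLine() and GetVectorInt32tFromCmdLine() above differ
+// only in element type; a single templated parser could serve both. A sketch
+// (ParseCsvInts is a hypothetical helper, not used elsewhere in this patch):
+//
+//     template <typename T>
+//     std::vector<T> ParseCsvInts(const std::string& s)
+//     {
+//         std::vector<T> out;
+//         std::stringstream ss(s); // requires <sstream>
+//         for(std::string tok; std::getline(ss, tok, ',');)
+//             out.push_back(static_cast<T>(std::stoi(tok)));
+//         return out;
+//     }
+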
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t input_sz  = GetTensorSize(inputDesc);
+    size_t output_sz = GetTensorSize(outputDesc);
+
+    size_t doutput_sz = GetTensorSize(doutputDesc);
+    size_t dinput_sz  = GetTensorSize(dinputDesc);
+
+    uint32_t ctx = 0;
+
+    input_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, input_sz, sizeof(Tgpu)));
+    output_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, output_sz, sizeof(Tgpu)));
+
+    doutput_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, doutput_sz, sizeof(Tgpu)));
+    dinput_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, dinput_sz, sizeof(Tgpu)));
+
+    input  = std::vector<Tgpu>(input_sz, static_cast<Tgpu>(0.0f));
+    output = std::vector<Tgpu>(output_sz, static_cast<Tgpu>(0.0f));
+
+    doutput = std::vector<Tgpu>(doutput_sz, static_cast<Tgpu>(1.0f));
+    dinput  = std::vector<Tgpu>(dinput_sz, static_cast<Tgpu>(0.0f));
+
+    output_host = std::vector<Tref>(output_sz, static_cast<Tref>(0.0f));
+
+    doutput_host = std::vector<Tref>(doutput_sz, static_cast<Tref>(0.0f));
+
+    int status;
+
+    for(int i = 0; i < input_sz; i++)
+        input[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    status = input_dev->ToGPU(GetStream(), input.data());
+
+    for(int i = 0; i < doutput_sz; i++)
+    {
+        doutput[i] = prng::gen_A_to_B<Tgpu>(static_cast<Tgpu>(0.0), static_cast<Tgpu>(1.0));
+    }
+    status |= doutput_dev->ToGPU(GetStream(), doutput.data());
+    status |= dinput_dev->ToGPU(GetStream(), dinput.data());
+
+    if(status != 0)
+        std::cout << "Unfold Driver Error copying data to GPU\n" << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunForwardGPU()
+{
+    float kernel_total_time = 0;
+    float kernel_first_time = 0;
+
+    Timer t;
+    START_TIME
+
+    for(int i = 0; i < inflags.GetValueInt("iter"); i++)
+    {
+        miopenUnfoldForward(GetHandle(),
+                            inputDesc,
+                            input_dev->GetMem(),
+                            outputDesc,
+                            output_dev->GetMem(),
+                            kernel_size.data(),
+                            kernel_size.size(),
+                            stride.data(),
+                            stride.size(),
+                            padding.data(),
+                            padding.size(),
+                            dilation.data(),
+                            dilation.size());
+
+        float time = 0.0;
+        miopenGetKernelTime(GetHandle(), &time);
+        kernel_total_time += time;
+        if(i == 0)
+            kernel_first_time = time;
+    }
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        STOP_TIME
+        int iter = inflags.GetValueInt("iter");
+        if(WALL_CLOCK)
+            std::cout << "Wall-clock Time Unfold Forward Elapsed: " << t.gettime_ms() / iter
+                      << " ms" << std::endl;
+
+        float kernel_average_time =
+            iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time;
+        std::cout << "GPU Kernel Time Unfold Forward Elapsed: " << kernel_average_time << " ms"
+                  << std::endl;
+    }
+
+    if(output_dev->FromGPU(GetStream(), output.data()) != 0)
+        std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize()
+                  << std::endl;
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunForwardCPU()
+{
+    mloUnFoldFwd4DRunHost<Tgpu, Tref>(input.data(),
+                                      inputDesc,
+                                      output_host.data(),
+                                      outputDesc,
+                                      kernel_size,
+                                      stride,
+                                      padding,
+                                      dilation);
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunBackwardGPU()
+{
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::RunBackwardCPU()
+{
+    return miopenStatusSuccess;
+}
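+// Tolerance rationale for GetTolerance() below: fp32 keeps the usual 1.5e-6
+// RMS threshold; fp16 gets 8.2e-3 because its mantissa is 13 bits shorter
+// (roughly 2^13 larger rounding error); bfloat16 is 3 bits shorter still, so
+// the fp16 threshold is scaled by 2^3 = 8, giving 6.56e-2.
+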
+template <typename Tgpu, typename Tref>
+Tref UnfoldDriver<Tgpu, Tref>::GetTolerance()
+{
+    // Computation error of fp16 is ~2^13 (=8192) bigger than
+    // the one of fp32 because mantissa is shorter by 13 bits.
+    auto tolerance = std::is_same<Tgpu, float>::value ? 1.5e-6 : 8.2e-3;
+
+    // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+    if(std::is_same<Tgpu, bfloat16>::value)
+        tolerance *= 8.0;
+    return tolerance;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::VerifyForward()
+{
+    RunForwardCPU();
+    const Tref tolerance = GetTolerance();
+    auto error_output    = miopen::rms_range(output_host, output);
+
+    if(!std::isfinite(error_output) || error_output > tolerance)
+    {
+        std::cout << "Forward Unfold FAILED: {" << error_output << "} > " << tolerance
+                  << std::endl;
+        return EC_VerifyFwd;
+    }
+    else
+    {
+        std::cout << "Forward Unfold Verifies OK on CPU reference ({" << error_output << "} < "
+                  << tolerance << ')' << std::endl;
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int UnfoldDriver<Tgpu, Tref>::VerifyBackward()
+{
+    return miopenStatusSuccess;
+}
+
+#endif // GUARD_MIOPEN_UNFOLD_DRIVER_HPP
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index e768c7b349..ac4e08b63e 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -6582,6 +6582,78 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d
 // CLOSEOUT BackendAPI DOXYGEN GROUP
 #endif // MIOPEN_BETA_API
 
+#ifdef MIOPEN_BETA_API
+// Fold APIs
+/** @addtogroup FOLD
+ *
+ *  @{
+ */
+/*! @brief Execute an unfold forward layer
+ *
+ * @param handle           MIOpen handle (input)
+ * @param inputDesc        Tensor descriptor for the data input tensor (input)
+ * @param input            Data tensor input (input)
+ * @param outputDesc       Tensor descriptor for the data output tensor (output)
+ * @param output           Data tensor output (output)
+ * @param kernel_size      Size of the sliding window in each spatial dimension (input)
+ * @param kernel_size_size Size of the kernel_size array (input)
+ * @param stride           Stride array of the sliding window (input)
+ * @param stride_size      Size of the stride array (input)
+ * @param padding          Padding array to be added on both sides of the input (input)
+ * @param padding_size     Size of the padding array (input)
+ * @param dilation         Dilation array controlling the stride of the elements within the
+ * neighborhood (input)
+ * @param dilation_size    Size of the dilation array (input)
+ * @return                 miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle,
+                                                 const miopenTensorDescriptor_t inputDesc,
+                                                 const void* input,
+                                                 const miopenTensorDescriptor_t outputDesc,
+                                                 void* output,
+                                                 const int32_t* kernel_size,
+                                                 const int kernel_size_size,
+                                                 const int32_t* stride,
+                                                 const int stride_size,
+                                                 const int32_t* padding,
+                                                 const int padding_size,
+                                                 const int32_t* dilation,
+                                                 const int dilation_size);
+
+// /*!
@brief Execute an unfold forward layer +// * +// * @param handle MIOpen handle (input) +// * @param inputDesc Tensor descriptor for data input tensor input (input) +// * @param input Data tensor input (input) +// * @param outputDesc Tensor descriptor for data output tensor output (output) +// * @param output Data tensor output (output) +// * @param kernel_size Size of the sliding box array (input) +// * @param kernel_size_size Size of the kernel_size array (input) +// * @param stride Stride array of the sliding box (input) +// * @param stride_size Size of the stride array (input) +// * @param padding Padding array to be added on input (input) +// * @param padding_size Size of the padding array (input) +// * @param dilation Dilation array control the stride of the elements within the neighborhood (input) +// * @param dilation_size Size of the dilation array (input) +// * @return miopenStatus_t +// */ +// MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, +// const miopenTensorDescriptor_t inputDesc, +// const void* input, +// const miopenTensorDescriptor_t outputDesc, +// void* output, +// const int32_t* kernel_size, +// const int kernel_size_size, +// const int32_t* stride, +// const int stride_size, +// const int32_t* padding, +// const int padding_size, +// const int32_t* dilation, +// const int dilation_size); + + /** @} */ +// CLOSEOUT FOLD DOXYGEN GROUP +#endif + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9671eed03c..34153587d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -117,6 +117,8 @@ set( MIOpen_Source expanduser.cpp find_controls.cpp find_db.cpp + fold_api.cpp + fold/problem_description.cpp fused_api.cpp fusion.cpp fusion/problem_description.cpp @@ -256,6 +258,7 @@ set( MIOpen_Source solver/conv_winoRxS.cpp solver/conv_winoRxS_fused.cpp solver/fft.cpp + solver/fold/unfold_forward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp @@ -421,6 +424,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/neuron.inc kernels/rocm_version.inc kernels/stride_array.hpp + kernels/tensor_view.hpp kernels/utilities.inc kernels/workaround_issue_1431.hpp kernels/xform_bidirect_winograd_code.inc @@ -499,6 +503,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv7x7c3h224w224k64u2v2p3q3f1.s kernels/xform_out.s kernels/gcnAsmBNBwdTrainSpatial.s + kernels/MIOpenUnfold.cpp kernels/MIOpenTensorKernels.cl kernels/MIOpenTensorKernelsHip.cpp kernels/MIOpenSubTensorOpWithScalarKernel.cl @@ -578,6 +583,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp argmax.cpp cat.cpp + fold.cpp groupnorm.cpp kernel_cache.cpp layer_norm.cpp diff --git a/src/fold.cpp b/src/fold.cpp new file mode 100644 index 0000000000..97dee5a3e2 --- /dev/null +++ b/src/fold.cpp @@ -0,0 +1,82 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include "miopen/fold/problem_description.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t UnfoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = + fold::UnfoldFwdProblemDescription{inputDesc, outputDesc, kernel_size, kernel_size_size, stride, stride_size, padding, padding_size, dilation, dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"UnfoldFwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp new file mode 100644 index 0000000000..d65ebd020b --- /dev/null +++ b/src/fold/problem_description.cpp @@ -0,0 +1,79 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace fold { + +// NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const +// { +// auto input_dtype = inputDesc.GetType(); +// auto output_dtype = outputDesc.GetType(); +// auto size = inputDesc.GetElementSize(); + +// std::ostringstream ss; + +// ss << "fold_fwd"; +// ss << "i_dtype" << input_dtype; +// ss << "o_dtype" << output_dtype; +// ss << "size" << size; + +// return NetworkConfig{ss.str()}; +// } + +NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = outputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Unfold_fwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_dims" ; + for (auto val : in_dims) { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +} // namespace fold + +} // namespace miopen diff --git a/src/fold_api.cpp b/src/fold_api.cpp new file mode 100644 index 0000000000..6c02dea728 --- /dev/null +++ b/src/fold_api.cpp @@ -0,0 +1,63 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/miopen.h" +#include +#include +#include +#include +#include + +extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::UnfoldForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp new file mode 100644 index 0000000000..f536f22ce8 --- /dev/null +++ b/src/include/miopen/fold.hpp @@ -0,0 +1,50 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#ifndef MIOPEN_FOLD_HPP_
+#define MIOPEN_FOLD_HPP_
+
+#include <miopen/common.hpp>
+
+namespace miopen {
+
+struct Handle;
+struct TensorDescriptor;
+
+miopenStatus_t UnfoldForward(Handle& handle,
+                             const TensorDescriptor& inputDesc,
+                             ConstData_t input,
+                             const TensorDescriptor& outputDesc,
+                             Data_t output,
+                             const int32_t* kernel_size,
+                             const int kernel_size_size,
+                             const int32_t* stride,
+                             const int stride_size,
+                             const int32_t* padding,
+                             const int padding_size,
+                             const int32_t* dilation,
+                             const int dilation_size);
+} // namespace miopen
+#endif // MIOPEN_FOLD_HPP_
diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp
new file mode 100644
index 0000000000..5bcaf6faf0
--- /dev/null
+++ b/src/include/miopen/fold/invoke_params.hpp
@@ -0,0 +1,63 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "miopen/miopen.h"
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+#include <cstdint>
+
+namespace miopen {
+
+namespace fold {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams() = default;
+
+    const TensorDescriptor* inputDesc  = nullptr;
+    const TensorDescriptor* outputDesc = nullptr;
+
+    ConstData_t input = nullptr;
+    Data_t output     = nullptr;
+
+    const int32_t* kernel_size = nullptr;
+    const int32_t* stride      = nullptr;
+    const int32_t* padding     = nullptr;
+    const int32_t* dilation    = nullptr;
+    int kernel_size_size       = 0;
+    int stride_size            = 0;
+    int padding_size           = 0;
+    int dilation_size          = 0;
+
+    std::size_t GetWorkspaceSize() const { return 0; }
+    Data_t GetWorkspace() const { return nullptr; }
+};
+
+} // namespace fold
+
+} // namespace miopen
diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp
new file mode 100644
index 0000000000..5dccce8782
--- /dev/null
+++ b/src/include/miopen/fold/problem_description.hpp
@@ -0,0 +1,163 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "miopen/errors.hpp" +#include "miopen/miopen.h" +#include +#include +#include + +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace fold { + +bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); + +// struct FoldFwdProblemDescription : ProblemDescriptionBase +// { +// FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, +// const TensorDescriptor& outputDesc_, +// const int32_t* kernel_size_, +// const int kernel_size_size_, +// const int32_t* stride_, +// const int stride_size_, +// const int32_t* padding_, +// const int padding_size_, +// const int32_t* dilation_, +// const int dilation_size_) +// : inputDesc(inputDesc_), +// outputDesc(outputDesc_), +// kernel_size(kernel_size_), +// kernel_size_size(kernel_size_size_), +// stride(stride_), +// stride_size(stride_size_), +// padding(padding_), +// padding_size(padding_size_), +// dilation(dilation_), +// dilation_size(dilation_size_) +// { +// // IsValidSize(); +// } + +// // bool IsValidSize() const +// // { +// // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) +// // { +// // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG +// // MIOPEN_THROW(miopenStatusBadParm, +// // "Instance Norm: The input tensor dimension should be in range [2, 5]."); +// // #else +// // return false; +// // #endif +// // } +// // return true; +// // } + +// const TensorDescriptor& GetInputDesc() const { return inputDesc; } +// const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + +// NetworkConfig MakeNetworkConfig() const override; + +// public: +// TensorDescriptor inputDesc; +// TensorDescriptor outputDesc; +// const int32_t* kernel_size; +// const int kernel_size_size; +// const int32_t* stride; +// const int stride_size; +// const int32_t* padding; +// const int padding_size; +// const int32_t* dilation; +// const int dilation_size; +// }; + +struct UnfoldFwdProblemDescription : ProblemDescriptionBase +{ + UnfoldFwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : inputDesc(inputDesc_), + outputDesc(outputDesc_), + 
kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + // IsValidSize(); + } + +// bool IsValidSize() const +// { +// if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) +// { +// #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG +// MIOPEN_THROW(miopenStatusBadParm, +// "Instance Norm: The input tensor dimension should be in range [2, 5]."); +// #else +// return false; +// #endif +// } +// return true; +// } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + +} // namespace fold + +} // namespace miopen diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp new file mode 100644 index 0000000000..743a3b6194 --- /dev/null +++ b/src/include/miopen/fold/solvers.hpp @@ -0,0 +1,75 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#pragma once + +#include +#include + +#include + +namespace miopen { + +namespace solver { + +namespace fold { + +// using FoldFwdSolverBase = +// NonTunableSolverBase; + +// struct FoldFwd final : FoldFwdSolverBase +// { +// const std::string& SolverDbId() const override { return GetSolverDbId(); } + +// bool IsApplicable( +// const ExecutionContext& context, +// const miopen::fold::FoldFwdProblemDescription& problem) const override; + +// ConvSolution GetSolution( +// const ExecutionContext& context, +// const miopen::fold::FoldFwdProblemDescription& problem) const override; +// }; + +using UnfoldFwdSolverBase = + NonTunableSolverBase; + +struct UnfoldFwd final : UnfoldFwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable( + const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; + + ConvSolution GetSolution( + const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; +}; + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index c52dc020ac..ca3b700772 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -56,7 +56,9 @@ enum class Primitive Reduce, Cat, Mha, - Softmax + Softmax, + Fold, + Unfold, }; struct MIOPEN_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp new file mode 100644 index 0000000000..4a7c0b51ad --- /dev/null +++ b/src/include/miopen/tensor_view_utils.hpp @@ -0,0 +1,80 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_
+#define MIOPEN_TENSOR_VIEW_UTIL_HPP_
+
+#include "../../kernels/tensor_view.hpp"
+#include "miopen/tensor.hpp"
+
+namespace miopen {
+
+template <int N>
+inline tensor_view_t<N> get_inner_expanded_tv(const TensorDescriptor Desc)
+{
+    auto dims    = Desc.GetLengths();
+    auto strides = Desc.GetStrides();
+
+    tensor_view_t<N> tensor_view;
+    for(size_t i = 0; i < N; ++i)
+    {
+        if(i < dims.size())
+        {
+            tensor_view.stride[i] = strides[i];
+            tensor_view.size[i]   = dims[i];
+        }
+        else
+        {
+            tensor_view.stride[i] = (i == 0 ? 1 : strides[i - 1]);
+            tensor_view.size[i]   = 1;
+        }
+    }
+    return tensor_view;
+}
+
+template <int N>
+inline void slice_tv(tensor_view_t<N>& tensor_view, int32_t sliceCount, const int32_t* slices)
+{
+    for(int32_t i = 0; i < sliceCount; i++)
+    {
+        int32_t dim   = slices[4 * i + 0];
+        int32_t start = slices[4 * i + 1];
+        int32_t end   = slices[4 * i + 2];
+        int32_t step  = slices[4 * i + 3];
+
+        if(end > static_cast<int32_t>(tensor_view.size[dim]))
+            end = tensor_view.size[dim];
+
+        auto len = end - start;
+
+        tensor_view.size[dim] = (len + step - 1) / step;
+        tensor_view.stride[dim] *= step;
+    }
+}
+
+} // namespace miopen
+
+#endif // MIOPEN_TENSOR_VIEW_UTIL_HPP_
diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp
new file mode 100644
index 0000000000..1135797401
--- /dev/null
+++ b/src/kernels/MIOpenUnfold.cpp
@@ -0,0 +1,227 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void unfoldForward4D(const TIO* input, + TIO* output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) +{ + /* + * input = {N, C, H, W}, output = {N, C * P, L} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * P * L, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ + + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if (n >= N) return; + + + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; + + TIO x = 0; + if (0 <= h && h < H && 0 <= w && w < W) { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } + + long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + output[output_idx] = x; +} + +extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, + IN_OUT_TYPE* output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) +{ + unfoldForward4D( input, + output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); +} + +template +__device__ void unfoldBackward4D(const TIO* output_grad, + TIO* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + /* + * output_grad = {N, C * P, L}, input_grad = {N, C, H, W} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * H * W, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ + + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if (n >= N) return; + + FLOAT_ACCUM sum = 0.0f; + for (int ph = 0; ph < kernel_size_h; ++ph) + { + for (int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if (lhsh % stride_h != 0) continue; + if (lwsw % stride_w != 0) continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if (lh < 0 || LH <= lh) continue; + if (lw < 0 || LW <= lw) continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + output_grad_tv.stride[0] * n; + sum += 
CVT_FLOAT2ACCUM(output_grad[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); +} + +extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, + IN_OUT_TYPE* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + unfoldBackward4D(output_grad, + input_grad, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp new file mode 100644 index 0000000000..d35bfd93fc --- /dev/null +++ b/src/kernels/tensor_view.hpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef GUARD_TENSOR_VIEW_HPP +#define GUARD_TENSOR_VIEW_HPP + +template +struct tensor_layout_t; + +template +struct tensor_view_t +{ + // Get index in tensor view at tensor layout + constexpr uint64_t get_tensor_view_idx(const tensor_layout_t& tensor_layout) + { + static_assert(N > 0); + uint64_t idx = 0; + for(auto i = 0; i < N; ++i) + { + idx += stride[i] * tensor_layout.layout[i]; + } + return idx; + } + uint64_t stride[N]; + uint64_t size[N]; +}; + +template +struct tensor_layout_t +{ + // Make tensor layout at index using tensor view + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t idx) + { + static_assert(N > 0); + uint64_t temp = idx; + if constexpr(N == 1) + { + layout[0] = idx; + } + else + { + for(auto i = N - 1; i > 1; --i) + { + layout[i] = temp % tensor_view.size[i]; + temp = temp / tensor_view.size[i]; + } + layout[1] = temp % tensor_view.size[1]; + layout[0] = temp / tensor_view.size[1]; + } + } + + uint64_t layout[N]; +}; + +#endif // GUARD_TENSOR_VIEW_HPP diff --git a/src/solver.cpp b/src/solver.cpp index f45f3058a6..97fa4637f3 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -648,6 +649,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Mha, mha::Mha{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); + // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp new file mode 100644 index 0000000000..d3e44c0d33 --- /dev/null +++ b/src/solver/fold/fold_forward.cpp @@ -0,0 +1,168 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+#endif // GUARD_TENSOR_VIEW_HPP
diff --git a/src/solver.cpp b/src/solver.cpp
index f45f3058a6..97fa4637f3 100644
--- a/src/solver.cpp
+++ b/src/solver.cpp
@@ -28,6 +28,7 @@
 #include
 #include
+#include <miopen/fold/solvers.hpp>
 #include
 #include
 #include
@@ -648,6 +649,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Mha, mha::Mha{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId());
+    // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId());
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp
new file mode 100644
index 0000000000..d3e44c0d33
--- /dev/null
+++ b/src/solver/fold/fold_forward.cpp
@@ -0,0 +1,168 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/fold/problem_description.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace fold {
+
+bool FoldFwd::IsApplicable(
+    [[maybe_unused]] const ExecutionContext& /*context*/,
+    const miopen::fold::FoldFwdProblemDescription& problem) const
+{
+    return true;
+}
+
+ConvSolution FoldFwd::GetSolution(
+    [[maybe_unused]] const ExecutionContext& context,
+    const miopen::fold::FoldFwdProblemDescription& problem) const
+{
+    std::ignore = context;
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetOutputDesc().GetType();
+    auto input_dims = problem.GetInputDesc().GetLengths();
+
+    auto output_dims = problem.GetOutputDesc().GetLengths();
+    const int32_t N = static_cast<int32_t>(output_dims[0]);
+    const int32_t C = static_cast<int32_t>(output_dims[1]);
+    int32_t H = static_cast<int32_t>(output_dims[2]);
+    int32_t W = static_cast<int32_t>(output_dims[3]);
+
+    {
+        auto kernel = KernelInfo{};
+        kernel.kernel_file = "MIOpenUnfold.cpp";
+        kernel.kernel_name = "UnfoldBackward4D";
+
+        const auto build_params = KernelBuildParameters{
+            {"MIOPEN_USE_FP16", static_cast<int>(dtype == miopenHalf)},
+            {"MIOPEN_USE_FP32", static_cast<int>(dtype == miopenFloat)},
+            {"MIOPEN_USE_FP64", static_cast<int>(dtype == miopenDouble)},
+            {"MIOPEN_USE_BFP16", static_cast<int>(dtype == miopenBFloat16)},
+            {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype},
+        };
+        kernel.comp_options = build_params.GenerateFor(kbp::HIP{});
+
+        size_t xlocalsize = LOCAL_SIZE;
+        size_t xgridsize = AlignUp(N * C * H * W, LOCAL_SIZE);
+        size_t ylocalsize = 1;
+        size_t ygridsize = 1;
+        size_t zlocalsize = 1;
+        size_t zgridsize = 1;
+        kernel.l_wk.push_back(xlocalsize);
+        kernel.l_wk.push_back(ylocalsize);
+        kernel.l_wk.push_back(zlocalsize);
+
+        kernel.g_wk.push_back(xgridsize);
+        kernel.g_wk.push_back(ygridsize);
+        kernel.g_wk.push_back(zgridsize);
+
+        result.construction_params.push_back(kernel);
+    }
+
+    result.invoker_factory = [](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::fold::InvokeParams>();
+
+            auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc));
+            auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc));
+            auto input_dims = deref(params.inputDesc).GetLengths();
+            auto output_dims = deref(params.outputDesc).GetLengths();
+
+            int spatial_dim_size = output_dims.size() - 2;
+            const int32_t N = static_cast<int32_t>(output_dims[0]);
+            const int32_t C = static_cast<int32_t>(output_dims[1]);
+            int32_t P = 1, L = 1;
+            std::vector<int32_t> ls;
+            for (int i = 0; i < spatial_dim_size; ++i) {
+                P *= params.kernel_size[i];
+                int32_t l = (output_dims[i + 2] + 2 * params.padding[i] -
+                             params.dilation[i] * (params.kernel_size[i] - 1) - 1) /
+                            params.stride[i] +
+                            1;
+                L *= l;
+                ls.push_back(l);
+            }
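+            // Worked example of the window count above (a sketch using the
+            // driver's default 4D shape from this patch): for spatial dims
+            // H = 3, W = 4 with kernel {2, 3}, stride {1, 1}, padding {0, 0},
+            // dilation {1, 1}:
+            //     LH = (3 + 2*0 - 1*(2 - 1) - 1) / 1 + 1 = 2
+            //     LW = (4 + 2*0 - 1*(3 - 1) - 1) / 1 + 1 = 2
+            //     L  = LH * LW = 4, P = 2 * 3 = 6
+            // so the column tensor is {N, C * P, L} = {N, 6 * C, 4}. FoldFwd
+            // runs UnfoldBackward4D because fold (col2im) is the adjoint of
+            // unfold (im2col): each output pixel sums every column entry that
+            // unfold would have copied from it.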
+
+            int32_t kernel_size_h = params.kernel_size[0];
+            int32_t kernel_size_w = params.kernel_size[1];
+            int32_t stride_h = params.stride[0];
+            int32_t stride_w = params.stride[1];
+            int32_t padding_h = params.padding[0];
+            int32_t padding_w = params.padding[1];
+            int32_t dilation_h = params.dilation[0];
+            int32_t dilation_w = params.dilation[1];
+            int32_t LH = ls[0];
+            int32_t LW = ls[1];
+            int32_t H = static_cast<int32_t>(output_dims[2]);
+            int32_t W = static_cast<int32_t>(output_dims[3]);
+
+            kernel(params.input,
+                   params.output,
+                   N,
+                   C,
+                   H,
+                   W,
+                   P,
+                   L,
+                   LH,
+                   LW,
+                   kernel_size_h,
+                   kernel_size_w,
+                   stride_h,
+                   stride_w,
+                   padding_h,
+                   padding_w,
+                   dilation_h,
+                   dilation_w,
+                   input_tv,
+                   output_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace fold
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp
new file mode 100644
index 0000000000..68f8072e74
--- /dev/null
+++ b/src/solver/fold/unfold_forward.cpp
@@ -0,0 +1,178 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "miopen/fold/problem_description.hpp"
+#include "miopen/miopen.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace fold {
+
+bool UnfoldFwd::IsApplicable(
+    [[maybe_unused]] const ExecutionContext& /*context*/,
+    const miopen::fold::UnfoldFwdProblemDescription& problem) const
+{
+    return true;
+}
+
+ConvSolution UnfoldFwd::GetSolution(
+    [[maybe_unused]] const ExecutionContext& context,
+    const miopen::fold::UnfoldFwdProblemDescription& problem) const
+{
+    std::ignore = context;
+    auto result = ConvSolution{miopenStatusSuccess};
+
+    auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType());
+    auto dtype = problem.GetOutputDesc().GetType();
+    auto input_dims = problem.GetInputDesc().GetLengths();
+
+    auto output_dims = problem.GetOutputDesc().GetLengths();
+    const int32_t N = static_cast<int32_t>(input_dims[0]);
+    const int32_t C = static_cast<int32_t>(input_dims[1]);
+    int spatial_dim_size = input_dims.size() - 2;
+    int32_t P = 1, L = 1;
+    std::vector<int32_t> ls;
+    for (int i = 0; i < spatial_dim_size; ++i) {
+        P *= problem.kernel_size[i];
+        int32_t l = (static_cast<int32_t>(input_dims[i + 2]) + 2 * problem.padding[i] -
+                     problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) /
+                    problem.stride[i] +
+                    1;
+        L *= l;
+        ls.push_back(l);
+    }
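+    // Callers must size the output descriptor to match this arithmetic: for a
+    // 4D input {N, C, H, W} the unfold output is {N, C * P, L} with
+    // P = kernel_size_h * kernel_size_w and L the product of the per-dimension
+    // window counts computed above. A hypothetical shape helper (illustrative
+    // sketch, not part of MIOpen's API):
+    //
+    //     std::vector<size_t> UnfoldOutputLengths(const std::vector<size_t>& in,
+    //                                             const std::vector<int32_t>& k,
+    //                                             const std::vector<int32_t>& s,
+    //                                             const std::vector<int32_t>& p,
+    //                                             const std::vector<int32_t>& d)
+    //     {
+    //         int32_t P = 1, L = 1;
+    //         for(size_t i = 0; i < in.size() - 2; ++i)
+    //         {
+    //             P *= k[i];
+    //             L *= (static_cast<int32_t>(in[i + 2]) + 2 * p[i] -
+    //                   d[i] * (k[i] - 1) - 1) / s[i] + 1;
+    //         }
+    //         return {in[0], in[1] * static_cast<size_t>(P), static_cast<size_t>(L)};
+    //     }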
"ushort" : in_dtype}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); + auto input_dims = deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); + + int spatial_dim_size = input_dims.size() - 2; + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= params.kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + + kernel(params.input, + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp new file mode 100644 index 0000000000..0e9588e000 --- /dev/null +++ b/test/cpu_fold.hpp @@ -0,0 +1,104 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_CPU_UNFOLD_HPP
+#define GUARD_CPU_UNFOLD_HPP
+
+#pragma once
+
+#include "miopen/tensor.hpp"
+#include "tensor_holder.hpp"
+#include "tensor_view.hpp"
+#include "miopen/tensor_view_utils.hpp"
+
+template <class T>
+void cpu_unfold_fwd_4d(tensor<T> input_tensor,
+                       tensor<T>& ref_output_tensor,
+                       const std::vector<int32_t> kernel_size,
+                       const std::vector<int32_t> stride,
+                       const std::vector<int32_t> padding,
+                       const std::vector<int32_t> dilation)
+{
+    auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc);
+    auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc);
+    auto input_size = input_tensor.desc.GetSize();
+    auto input_dims = input_tensor.desc.GetLengths();
+
+    auto input = input_tensor.data.data();
+    auto output = ref_output_tensor.data.data();
+
+    const int LOCAL_SIZE = 256;
+    int spatial_dim_size = input_size - 2;
+
+    const int32_t N = static_cast<int32_t>(input_dims[0]);
+    const int32_t C = static_cast<int32_t>(input_dims[1]);
+
+    int32_t P = 1, L = 1;
+    std::vector<int32_t> ls;
+    for (int i = 0; i < spatial_dim_size; ++i) {
+        P *= kernel_size[i];
+        int32_t l = (static_cast<int32_t>(input_dims[i + 2]) + 2 * padding[i] -
+                     dilation[i] * (kernel_size[i] - 1) - 1) /
+                    stride[i] +
+                    1;
+        L *= l;
+        ls.push_back(l);
+    }
+
+    int32_t kernel_size_h = kernel_size[0];
+    int32_t kernel_size_w = kernel_size[1];
+    int32_t stride_h = stride[0];
+    int32_t stride_w = stride[1];
+    int32_t padding_h = padding[0];
+    int32_t padding_w = padding[1];
+    int32_t dilation_h = dilation[0];
+    int32_t dilation_w = dilation[1];
+    int32_t LH = ls[0];
+    int32_t LW = ls[1];
+    int32_t H = static_cast<int32_t>(input_dims[2]);
+    int32_t W = static_cast<int32_t>(input_dims[3]);
+    int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE;
+    par_ford(work_size)([&](int gid) {
+        int ncp = gid / L, l = gid % L;
+        int nc = ncp / P, p = ncp % P;
+        int n = nc / C, c = nc % C;
+        if (n >= N) return;
+
+        int lh = l / LW, lw = l % LW;                       // sliding window position
+        int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel
+        int h = lh * stride_h - padding_h + ph * dilation_h;
+        int w = lw * stride_w - padding_w + pw * dilation_w;
+
+        T x = static_cast<T>(0.0f);
+        if (0 <= h && h < H && 0 <= w && w < W) {
+            long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n;
+            x = input[input_idx];
+        }
+
+        long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n;
+        output[output_idx] = x;
+    });
+}
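+// The reference assigns one flat work item per column element and unpacks it
+// with nested divmods, exactly mirroring the GPU kernel. Traced for a single
+// id (illustrative numbers matching the first test config, where N=2, C=5,
+// P=6, L=4, LW=2, kernel_size_w=3):
+//
+//     gid = 100:  l = 100 % 4 = 0,  ncp = 100 / 4 = 25
+//                 p = 25 % 6  = 1,  nc  = 25 / 6  = 4
+//                 c = 4 % 5   = 4,  n   = 4 / 5   = 0
+//     window (lh, lw) = (0, 0), kernel offset (ph, pw) = (0, 1)
+//     => reads input (n=0, c=4, h = 0*1 - 0 + 0*1 = 0, w = 0*1 - 0 + 1*1 = 1)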
+#endif
diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp
new file mode 100644
index 0000000000..d1843ae3c8
--- /dev/null
+++ b/test/gtest/fold.cpp
@@ -0,0 +1,97 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "fold.hpp"
+#include "miopen/bfloat16.hpp"
+#include "tensor_holder.hpp"
+#include
+
+MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL)
+
+namespace fold {
+struct UnfoldForwardTestFloat32 : UnfoldFwdTest<float>
+{
+};
+
+struct UnfoldForwardTestFloat16 : UnfoldFwdTest<half_float::half>
+{
+};
+
+struct UnfoldForwardTestBFloat16 : UnfoldFwdTest<bfloat16>
+{
+};
+} // namespace fold
+using namespace fold;
+TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest)
+{
+    if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet,
+                         UnfoldForwardTestFloat32,
+                         testing::ValuesIn(UnfoldTestConfigs()));
+
+TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest)
+{
+    if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet,
+                         UnfoldForwardTestFloat16,
+                         testing::ValuesIn(UnfoldTestConfigs()));
+
+TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest)
+{
+    if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)))
+    {
+        RunTest();
+        Verify();
+    }
+    else
+    {
+        GTEST_SKIP();
+    }
+};
+
+INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet,
+                         UnfoldForwardTestBFloat16,
+                         testing::ValuesIn(UnfoldTestConfigs()));
diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp
new file mode 100644
index 0000000000..eee1e79fef
--- /dev/null
+++ b/test/gtest/fold.hpp
@@ -0,0 +1,218 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "../driver/tensor_driver.hpp"
+#include "cpu_fold.hpp"
+#include "get_handle.hpp"
+#include "miopen/allocator.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct UnfoldTestCase
+{
+    size_t N;
+    size_t C;
+    size_t D;
+    size_t H;
+    size_t W;
+    std::vector<int32_t> kernelSize;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+    bool isContiguous = true;
+    friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc)
+    {
+        os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H
+           << " W:" << tc.W << " kernel_size:";
+        for (const auto& ks : tc.kernelSize) os << ks << " ";
+        os << "stride:";
+        for (const auto& s : tc.stride) os << s << " ";
+        os << "padding:";
+        for (const auto& p : tc.padding) os << p << " ";
+        os << "dilation:";
+        for (const auto& d : tc.dilation) os << d << " ";
+        os << "isContiguous:" << std::boolalpha << tc.isContiguous;
+        return os;
+    }
+
+    std::vector<size_t> GetInput()
+    {
+        if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, D, H, W});
+        }
+        else if((N != 0) && (C != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, H, W});
+        }
+        else if((N != 0) && (C != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, W});
+        }
+        else if((N != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, W});
+        }
+        else if(N != 0)
+        {
+            return std::vector<size_t>({N});
+        }
+        else
+        {
+            std::cout << "Error: invalid input tensor lengths" << std::endl;
+            return std::vector<size_t>({0});
+        }
+    }
+
+    std::vector<size_t> ComputeStrides(std::vector<size_t> inputDim) const
+    {
+        if(!isContiguous)
+            std::swap(inputDim.front(), inputDim.back());
+        std::vector<size_t> strides(inputDim.size());
+        strides.back() = 1;
+        for(int i = inputDim.size() - 2; i >= 0; --i)
+            strides[i] = strides[i + 1] * inputDim[i + 1];
+        if(!isContiguous)
+            std::swap(strides.front(), strides.back());
+        return strides;
+    }
+};
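+// ComputeStrides yields a plain row-major layout when isContiguous is true;
+// otherwise it swaps the outermost and innermost lengths before the row-major
+// pass and swaps the resulting strides back, producing a transposed memory
+// layout behind an unchanged set of lengths. Worked example for lengths
+// {2, 5, 3, 4} (illustrative only; the configs below all use the contiguous
+// path):
+//
+//     isContiguous == true:  strides = {60, 12, 4, 1}
+//     isContiguous == false: swap dims -> {4, 5, 3, 2},
+//                            row-major -> {30, 6, 2, 1},
+//                            swap strides back -> {1, 6, 2, 30}
+//
+// so element (n, c, h, w) then lives at offset n + 6*c + 2*h + 30*w: the N and
+// W axes trade places in memory while the lengths stay {2, 5, 3, 4}.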
+
+std::vector<UnfoldTestCase> UnfoldTestConfigs()
+{ // n c d h w kernel_size stride padding dilation isContiguous
+    return {
+        {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true},
+        {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true},
+        {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true},
+    };
+}
+
+template <typename T>
+struct UnfoldFwdTest : public ::testing::TestWithParam<UnfoldTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config = GetParam();
+
+        std::vector<size_t> in_dims = config.GetInput();
+        std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
+
+        auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        auto gen_one = [&](auto...) { return 1; };
+        auto gen_zero = [&](auto...) { return 0; };
+        input = tensor<T>{in_dims, in_strides}.generate(gen_value);
+
+        int spatial_dim_size = in_dims.size() - 2;
+        const int32_t N = static_cast<int32_t>(in_dims[0]);
+        const int32_t C = static_cast<int32_t>(in_dims[1]);
+        int32_t P = 1, L = 1;
+        std::vector<int32_t> ls;
+        for (int i = 0; i < spatial_dim_size; ++i) {
+            P *= config.kernelSize[i];
+            int32_t l = (static_cast<int32_t>(in_dims[i + 2]) + 2 * config.padding[i] -
+                         config.dilation[i] * (config.kernelSize[i] - 1) - 1) /
+                        config.stride[i] +
+                        1;
+            L *= l;
+            ls.push_back(l);
+        }
+
+        std::vector<size_t> out_dims{static_cast<size_t>(N),
+                                     static_cast<size_t>(C * P),
+                                     static_cast<size_t>(L)};
+
+        output = tensor<T>{out_dims}.generate(gen_zero);
+        outputHost = tensor<T>{out_dims}.generate(gen_zero);
+
+        input_dev = handle.Write(input.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::UnfoldForward(handle,
+                                       input.desc,
+                                       input_dev.get(),
+                                       output.desc,
+                                       output_dev.get(),
+                                       config.kernelSize.data(),
+                                       static_cast<int>(config.kernelSize.size()),
+                                       config.stride.data(),
+                                       static_cast<int>(config.stride.size()),
+                                       config.padding.data(),
+                                       static_cast<int>(config.padding.size()),
+                                       config.dilation.data(),
+                                       static_cast<int>(config.dilation.size()));
+
+        cpu_unfold_fwd_4d(input,
+                          outputHost,
+                          config.kernelSize,
+                          config.stride,
+                          config.padding,
+                          config.dilation);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+        auto error_output = miopen::rms_range(outputHost, output);
+        EXPECT_TRUE(error_output < tolerance)
+            << "Error forward output beyond tolerance. Error: {" << error_output
+            << "}, Tolerance: " << tolerance;
+    }
+    UnfoldTestCase config;
+
+    tensor<T> input;
+    tensor<T> output;
+
+    tensor<T> outputHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+};

From 6902fdfda3e8c31c1ea7524ff6dc744a400ab359 Mon Sep 17 00:00:00 2001
From: Duong Le
Date: Thu, 4 Jul 2024 03:23:41 +0000
Subject: [PATCH 02/46] githook format

---
 driver/mloUnfoldHost.hpp                    | 103 +++----
 driver/unfold_driver.hpp                    | 101 +++----
 include/miopen/miopen.h                     |  32 ++-
 src/fold.cpp                                |  58 ++--
 src/fold/problem_description.cpp            |  21 +-
 src/fold_api.cpp                            |  48 ++--
 src/include/miopen/fold.hpp                 |  24 +-
 src/include/miopen/fold/invoke_params.hpp   |  22 +-
 .../miopen/fold/problem_description.hpp     |  46 +--
 src/include/miopen/fold/solvers.hpp         |  11 +-
 src/kernels/MIOpenUnfold.cpp                | 261 +++++++++---------
 src/solver/fold/fold_forward.cpp            |  99 ++++---
 src/solver/fold/unfold_forward.cpp          | 102 +++----
 test/cpu_fold.hpp                           |  83 +++---
 test/gtest/fold.hpp                         |  79 +++---
 15 files changed, 554 insertions(+), 536 deletions(-)

diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp
index 465bfa7b4f..7941eb63c4 100644
--- a/driver/mloUnfoldHost.hpp
+++ b/driver/mloUnfoldHost.hpp
@@ -35,80 +35,85 @@
 template <typename Tgpu, typename Tcheck>
 int32_t mloUnFoldFwd4DRunHost(Tgpu* input,
-                             const miopenTensorDescriptor_t inputDesc,
-                             Tcheck* ref_output,
-                             const miopenTensorDescriptor_t
ref_outputDesc, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) { - auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); - auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(ref_outputDesc)); - auto input_dims = miopen::deref(inputDesc).GetLengths(); - auto input_size = miopen::deref(inputDesc).GetSize(); + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(ref_outputDesc)); + auto input_dims = miopen::deref(inputDesc).GetLengths(); + auto input_size = miopen::deref(inputDesc).GetSize(); const int LOCAL_SIZE = 256; int spatial_dim_size = input_size - 2; - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; - [[maybe_unused]] int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + [[maybe_unused]] int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { - int ncp = gid / L, l = gid % L; - int nc = ncp / P, p = ncp % P; - int n = nc / C, c = nc % C; - if (n >= N) return; + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if(n >= N) + return; - int lh = l / LW, lw = l % LW; // sliding window position - int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel - int h = lh * stride_h - padding_h + ph * dilation_h; - int w = lw * stride_w - padding_w + pw * dilation_w; + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; - Tgpu x = static_cast(0.0f); - if (0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; - x = input[input_idx]; - } + Tgpu x = static_cast(0.0f); + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } - long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - 
ref_output[output_idx] = static_cast(x); + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + ref_output[output_idx] = static_cast(x); }); - + return miopenStatusSuccess; } template int32_t mloUnFoldBwd4DRunHost(Tgpu* input, - const miopenTensorDescriptor_t inputDesc, - Tcheck* ref_output, - const miopenTensorDescriptor_t ref_outputDesc, - const std::vector kernel_size, - const std::vector stride, - const std::vector padding, - const std::vector dilation) + const miopenTensorDescriptor_t inputDesc, + Tcheck* ref_output, + const miopenTensorDescriptor_t ref_outputDesc, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) { return miopenStatusSuccess; } diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index da835d4f3a..cfa25d3a85 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -135,37 +135,27 @@ int UnfoldDriver::GetandSetData() std::vector input_length = GetTensorLengthsFromCmdLine(); kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); - stride = GetVectorInt32tFromCmdLine("stride"); - padding = GetVectorInt32tFromCmdLine("padding"); - dilation = GetVectorInt32tFromCmdLine("dilation"); - std::cout << "asdasdkernel_size " << kernel_size.size() << std::endl; - std::cout << "stride " << stride.size() << std::endl; - std::cout << "padding " << padding.size() << std::endl; - std::cout << "dilation " << dilation.size() << std::endl; - + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); int spatial_dim_size = input_length.size() - 2; - std::cout << "spatial_dim_size " << spatial_dim_size << std::endl; - const int N = input_length[0]; const int C = input_length[1]; int P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; - int l = (input_length[i + 2] + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / - stride[i] + 1; + int l = (input_length[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; L *= l; ls.push_back(l); } - std::cout << "input-asdasd " << input_length.size() << std::endl; - - std::vector output_length = {N, (C*P), L}; - std::cout << "output_length " << output_length.size() << std::endl; + std::vector output_length = {N, (C * P), L}; SetTensorNd(inputDesc, input_length, data_type); SetTensorNd(outputDesc, output_length, data_type); - SetTensorNd(doutputDesc, output_length, data_type); SetTensorNd(dinputDesc, input_length, data_type); @@ -175,12 +165,10 @@ int UnfoldDriver::GetandSetData() template int UnfoldDriver::AddCmdLineArgs() { - inflags.AddInputFlag("forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int"); - inflags.AddInputFlag("DimLengths", - 'D', - "2,5,3,4", - "The dimensional lengths of the input tensor", - "string"); + inflags.AddInputFlag( + "forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int"); + inflags.AddInputFlag( + "DimLengths", 'D', "2,5,3,4", "The dimensional lengths of the input tensor", "string"); inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); inflags.AddInputFlag("padding", 'p', "0,0", "Stride (Default=0,0)", "str"); @@ -257,27 +245,27 @@ std::vector 
UnfoldDriver::GetVectorInt32tFromCmdLine(std::s template int UnfoldDriver::AllocateBuffersAndCopy() { - size_t input_sz = GetTensorSize(inputDesc); - size_t output_sz = GetTensorSize(outputDesc); + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); size_t doutput_sz = GetTensorSize(doutputDesc); size_t dinput_sz = GetTensorSize(dinputDesc); uint32_t ctx = 0; - input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); - output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); doutput_dev = std::unique_ptr(new GPUMem(ctx, doutput_sz, sizeof(Tgpu))); dinput_dev = std::unique_ptr(new GPUMem(ctx, dinput_sz, sizeof(Tgpu))); - input = std::vector(input_sz, static_cast(0.0f)); - output = std::vector(output_sz, static_cast(0.0f)); + input = std::vector(input_sz, static_cast(0.0f)); + output = std::vector(output_sz, static_cast(0.0f)); doutput = std::vector(doutput_sz, static_cast(1.0f)); dinput = std::vector(dinput_sz, static_cast(0.0f)); - output_host = std::vector(output_sz, static_cast(0.0f)); + output_host = std::vector(output_sz, static_cast(0.0f)); doutput_host = std::vector(doutput_sz, static_cast(0.0f)); @@ -312,18 +300,18 @@ int UnfoldDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenUnfoldForward(GetHandle(), - inputDesc, - input_dev->GetMem(), - outputDesc, - output_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -342,8 +330,8 @@ int UnfoldDriver::RunForwardGPU() float kernel_average_time = iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - std::cout << "GPU Kernel Time Unfold Forward Elapsed: " << kernel_average_time - << " ms" << std::endl; + std::cout << "GPU Kernel Time Unfold Forward Elapsed: " << kernel_average_time << " ms" + << std::endl; } if(output_dev->FromGPU(GetStream(), output.data()) != 0) @@ -357,14 +345,13 @@ template int UnfoldDriver::RunForwardCPU() { mloUnFoldFwd4DRunHost(input.data(), - inputDesc, - output_host.data(), - outputDesc, - kernel_size, - stride, - padding, - dilation); - + inputDesc, + output_host.data(), + outputDesc, + kernel_size, + stride, + padding, + dilation); return miopenStatusSuccess; } @@ -402,13 +389,13 @@ int UnfoldDriver::VerifyForward() if(!std::isfinite(error_output) || error_output > tolerance) { - std::cout << "Forward Unfold FAILED: {" << error_output << "} > " << tolerance - << std::endl; + std::cout << "Forward Unfold FAILED: {" << error_output << "} > " << tolerance << std::endl; return EC_VerifyFwd; } else { - std::cout << "Forward Unfold Verifies OK on CPU reference ({" << error_output << "} < " << tolerance << ')' << std::endl; + std::cout << "Forward Unfold Verifies OK on CPU reference ({" << error_output << "} < " + << tolerance << ')' << std::endl; } return miopenStatusSuccess; } diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index ac4e08b63e..45e8df42db 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6601,23 +6601,24 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * @param stride_size Size of the stride array (input) * @param padding Padding array to be added on input (input) * @param padding_size Size of the padding array (input) - * @param dilation Dilation array control the stride of the elements within the neighborhood (input) + * @param dilation Dilation array control the stride of the elements within the + * neighborhood (input) * @param dilation_size Size of the dilation array (input) * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); // /*! 
@brief Execute an unfold forward layer // * @@ -6632,7 +6633,8 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, // * @param stride_size Size of the stride array (input) // * @param padding Padding array to be added on input (input) // * @param padding_size Size of the padding array (input) -// * @param dilation Dilation array control the stride of the elements within the neighborhood (input) +// * @param dilation Dilation array control the stride of the elements within the +// neighborhood (input) // * @param dilation_size Size of the dilation array (input) // * @return miopenStatus_t // */ @@ -6650,7 +6652,7 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, // const int32_t* dilation, // const int dilation_size); - /** @} */ +/** @} */ // CLOSEOUT FOLD DOXYGEN GROUP #endif diff --git a/src/fold.cpp b/src/fold.cpp index 97dee5a3e2..d545c01964 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -38,37 +38,45 @@ namespace miopen { miopenStatus_t UnfoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { - const auto problem = - fold::UnfoldFwdProblemDescription{inputDesc, outputDesc, kernel_size, kernel_size_size, stride, stride_size, padding, padding_size, dilation, dilation_size}; + const auto problem = fold::UnfoldFwdProblemDescription{inputDesc, + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; const auto invoke_params = [&]() { - auto tmp = fold::InvokeParams{}; - tmp.type = InvokeType::Run; - tmp.inputDesc = &inputDesc; + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; tmp.outputDesc = &outputDesc; - tmp.input = input; - tmp.output = output; - tmp.kernel_size = kernel_size; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; tmp.stride = stride; - tmp.padding = padding; - tmp.dilation = dilation; + tmp.padding = padding; + tmp.dilation = dilation; tmp.kernel_size_size = kernel_size_size; - tmp.stride_size = stride_size; - tmp.padding_size = padding_size; - tmp.dilation_size = dilation_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; return tmp; }(); diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index d65ebd020b..9a0c6ec921 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -51,10 +51,10 @@ namespace fold { NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = inputDesc.GetType(); - auto output_dtype = outputDesc.GetType(); - auto size = inputDesc.GetElementSize(); - auto in_dims = inputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + auto output_dtype = outputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); std::ostringstream ss; @@ -62,14 
+62,15 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const ss << "i_dtype" << input_dtype; ss << "o_dtype" << output_dtype; ss << "size" << size; - ss << "in_dims" ; - for (auto val : in_dims) { + ss << "in_dims"; + for(auto val : in_dims) + { ss << "_" << val; } - ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; - ss << "stride_" << stride[0] << "_" << stride[1]; - ss << "padding_" << padding[0] << "_" << padding[1]; - ss << "dilation_" << dilation[0] << "_" << dilation[1]; + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; return NetworkConfig{ss.str()}; } diff --git a/src/fold_api.cpp b/src/fold_api.cpp index 6c02dea728..1e6c97ef83 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -32,32 +32,32 @@ #include extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::UnfoldForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index f536f22ce8..33e879eb0a 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -34,17 +34,17 @@ struct Handle; struct TensorDescriptor; miopenStatus_t UnfoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index 5bcaf6faf0..318e312206 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -39,20 +39,20 @@ struct InvokeParams : public miopen::InvokeParams { InvokeParams() = default; - const TensorDescriptor* 
inputDesc = nullptr; - const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; - ConstData_t input = nullptr; - Data_t output = nullptr; + ConstData_t input = nullptr; + Data_t output = nullptr; const int32_t* kernel_size = nullptr; - const int32_t* stride = nullptr; - const int32_t* padding = nullptr; - const int32_t* dilation = nullptr; - int kernel_size_size = 0; - int stride_size = 0; - int padding_size = 0; - int dilation_size = 0; + const int32_t* stride = nullptr; + const int32_t* padding = nullptr; + const int32_t* dilation = nullptr; + int kernel_size_size = 0; + int stride_size = 0; + int padding_size = 0; + int dilation_size = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 5dccce8782..938abe6dae 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -74,7 +74,8 @@ bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y); // // { // // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG // // MIOPEN_THROW(miopenStatusBadParm, -// // "Instance Norm: The input tensor dimension should be in range [2, 5]."); +// // "Instance Norm: The input tensor dimension should be in range [2, +// 5]."); // // #else // // return false; // // #endif @@ -114,31 +115,32 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase const int dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), - kernel_size(kernel_size_), - kernel_size_size(kernel_size_size_), - stride(stride_), - stride_size(stride_size_), - padding(padding_), - padding_size(padding_size_), - dilation(dilation_), - dilation_size(dilation_size_) + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) { // IsValidSize(); } -// bool IsValidSize() const -// { -// if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) -// { -// #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG -// MIOPEN_THROW(miopenStatusBadParm, -// "Instance Norm: The input tensor dimension should be in range [2, 5]."); -// #else -// return false; -// #endif -// } -// return true; -// } + // bool IsValidSize() const + // { + // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) + // { + // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + // MIOPEN_THROW(miopenStatusBadParm, + // "Instance Norm: The input tensor dimension should be in range [2, + // 5]."); + // #else + // return false; + // #endif + // } + // return true; + // } const TensorDescriptor& GetInputDesc() const { return inputDesc; } const TensorDescriptor& GetOutputDesc() const { return outputDesc; } diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp index 743a3b6194..0d2cbe282f 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -59,13 +59,12 @@ struct UnfoldFwd final : UnfoldFwdSolverBase { const std::string& SolverDbId() const override { return GetSolverDbId(); } - bool IsApplicable( - const ExecutionContext& context, - const miopen::fold::UnfoldFwdProblemDescription& problem) const override; + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; - ConvSolution GetSolution( 
- const ExecutionContext& context, - const miopen::fold::UnfoldFwdProblemDescription& problem) const override; + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const override; }; } // namespace fold diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 1135797401..5c39a82e2c 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -54,174 +54,185 @@ __device__ void unfoldForward4D(const TIO* input, tensor_view_t<4> input_tv, tensor_view_t<3> output_tv) { - /* - * input = {N, C, H, W}, output = {N, C * P, L} - * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for - * formula) - * => gws = {ceil(N * C * P * L, LOCAL_SIZE)}, lws = {LOCAL_SIZE} - */ + /* + * input = {N, C, H, W}, output = {N, C * P, L} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * P * L, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ const int gid = threadIdx.x + blockIdx.x * blockDim.x; int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; int n = nc / C, c = nc % C; - if (n >= N) return; + if(n >= N) + return; - - int lh = l / LW, lw = l % LW; // sliding window position - int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel int h = lh * stride_h - padding_h + ph * dilation_h; int w = lw * stride_w - padding_w + pw * dilation_w; TIO x = 0; - if (0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + + input_tv.stride[0] * n; x = input[input_idx]; } - long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - output[output_idx] = x; + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + output[output_idx] = x; } extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, - IN_OUT_TYPE* output, - int N, - int C, - int H, - int W, - int P, - int L, - int LH, - int LW, - int kernel_size_h, - int kernel_size_w, - int stride_h, - int stride_w, - int padding_h, - int padding_w, - int dilation_h, - int dilation_w, - tensor_view_t<4> input_tv, - tensor_view_t<3> output_tv) + IN_OUT_TYPE* output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) { - unfoldForward4D( input, - output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + unfoldForward4D(input, + output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); } template __device__ void unfoldBackward4D(const TIO* output_grad, - TIO* input_grad, - int N, - int C, - int H, - int W, - int P, - int L, - int LH, - int LW, - int kernel_size_h, - int kernel_size_w, - int stride_h, - int 
stride_w, - int padding_h, - int padding_w, - int dilation_h, - int dilation_w, - tensor_view_t<3> output_grad_tv, - tensor_view_t<4> input_grad_tv) + TIO* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) { - /* - * output_grad = {N, C * P, L}, input_grad = {N, C, H, W} - * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for - * formula) - * => gws = {ceil(N * C * H * W, LOCAL_SIZE)}, lws = {LOCAL_SIZE} - */ + /* + * output_grad = {N, C * P, L}, input_grad = {N, C, H, W} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * H * W, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ const int gid = threadIdx.x + blockIdx.x * blockDim.x; int nch = gid / W, w = gid % W; int nc = nch / H, h = nch % H; int n = nc / C, c = nc % C; - if (n >= N) return; + if(n >= N) + return; FLOAT_ACCUM sum = 0.0f; - for (int ph = 0; ph < kernel_size_h; ++ph) + for(int ph = 0; ph < kernel_size_h; ++ph) { - for (int pw = 0; pw < kernel_size_w; ++pw) + for(int pw = 0; pw < kernel_size_w; ++pw) { int lhsh = h - ph * dilation_h + padding_h; int lwsw = w - pw * dilation_w + padding_w; - if (lhsh % stride_h != 0) continue; - if (lwsw % stride_w != 0) continue; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; int lh = lhsh / stride_h; int lw = lwsw / stride_w; - if (lh < 0 || LH <= lh) continue; - if (lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + output_grad_tv.stride[0] * n; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; sum += CVT_FLOAT2ACCUM(output_grad[output_grad_idx]); } } - long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); } extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, - IN_OUT_TYPE* input_grad, - int N, - int C, - int H, - int W, - int P, - int L, - int LH, - int LW, - int kernel_size_h, - int kernel_size_w, - int stride_h, - int stride_w, - int padding_h, - int padding_w, - int dilation_h, - int dilation_w, - tensor_view_t<3> output_grad_tv, - tensor_view_t<4> input_grad_tv) + IN_OUT_TYPE* input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) { unfoldBackward4D(output_grad, - input_grad, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - output_grad_tv, - input_grad_tv); + input_grad, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + 
padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); } diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index d3e44c0d33..67528b00b7 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -42,29 +42,27 @@ namespace solver { namespace fold { -bool FoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, - const miopen::fold::FoldFwdProblemDescription& problem) const +bool FoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::FoldFwdProblemDescription& problem) const { return true; } -ConvSolution FoldFwd::GetSolution( - [[maybe_unused]] const ExecutionContext& context, - const miopen::fold::FoldFwdProblemDescription& problem) const +ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const { std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; - auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); - auto dtype = problem.GetOutputDesc().GetType(); - auto input_dims = problem.GetInputDesc().GetLengths(); + auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + auto input_dims = problem.GetInputDesc().GetLengths(); auto output_dims = problem.GetOutputDesc().GetLengths(); - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); - int32_t H = static_cast(output_dims[2]); - int32_t W = static_cast(output_dims[3]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t H = static_cast(output_dims[2]); + int32_t W = static_cast(output_dims[3]); { auto kernel = KernelInfo{}; @@ -102,20 +100,21 @@ ConvSolution FoldFwd::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); - auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - auto input_dims = deref(params.inputDesc).GetLengths(); - auto output_dims = deref(params.outputDesc).GetLengths(); + auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + auto input_dims = deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); int spatial_dim_size = output_dims.size() - 2; - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= params.kernel_size[i]; int32_t l = (output_dims[i + 2] + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / params.stride[i] + 0; L *= l; @@ -124,37 +123,37 @@ ConvSolution FoldFwd::GetSolution( int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; - int32_t stride_h = params.stride[0]; - int32_t stride_w = params.stride[1]; - int32_t padding_h = params.padding[0]; - int32_t padding_w = params.padding[1]; - int32_t dilation_h = params.dilation[0]; - int32_t dilation_w = params.dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - 
int32_t H = static_cast(output_dims[2]); - int32_t W = static_cast(output_dims[3]); + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(output_dims[2]); + int32_t W = static_cast(output_dims[3]); kernel(params.input, - params.output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); }; }; diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 68f8072e74..b866b5d167 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -42,34 +42,33 @@ namespace solver { namespace fold { -bool UnfoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, - const miopen::fold::UnfoldFwdProblemDescription& problem) const +bool UnfoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::UnfoldFwdProblemDescription& problem) const { return true; } -ConvSolution UnfoldFwd::GetSolution( - [[maybe_unused]] const ExecutionContext& context, - const miopen::fold::UnfoldFwdProblemDescription& problem) const +ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const { std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; - auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); - auto dtype = problem.GetOutputDesc().GetType(); - auto input_dims = problem.GetInputDesc().GetLengths(); + auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + auto input_dims = problem.GetInputDesc().GetLengths(); - auto output_dims = problem.GetOutputDesc().GetLengths(); - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); + auto output_dims = problem.GetOutputDesc().GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); int spatial_dim_size = input_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= problem.kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * problem.padding[i] - - problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / problem.stride[i] + 1; L *= l; @@ -112,20 +111,21 @@ ConvSolution UnfoldFwd::GetSolution( decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); - auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); - auto input_dims = deref(params.inputDesc).GetLengths(); - auto output_dims = deref(params.outputDesc).GetLengths(); + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); + auto input_dims = 
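// Shape sanity check for the forward direction: input {N, C, H, W} unfolds to
// {N, C * P, L}. E.g. N = 2, C = 3, H = W = 4 with a 2x2 kernel, stride 1, padding 0,
// dilation 1 gives P = 4 and LH = LW = 3, so the output is {2, 12, 9}, matching what
// torch.nn.Unfold produces for the same arguments.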
deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); int spatial_dim_size = input_dims.size() - 2; - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= params.kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / params.stride[i] + 1; L *= l; @@ -134,37 +134,37 @@ ConvSolution UnfoldFwd::GetSolution( int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; - int32_t stride_h = params.stride[0]; - int32_t stride_w = params.stride[1]; - int32_t padding_h = params.padding[0]; - int32_t padding_w = params.padding[1]; - int32_t dilation_h = params.dilation[0]; - int32_t dilation_w = params.dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); kernel(params.input, - params.output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); }; }; diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 0e9588e000..030d5722d2 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -35,19 +35,19 @@ template void cpu_unfold_fwd_4d(tensor input_tensor, - tensor& ref_output_tensor, - const std::vector kernel_size, - const std::vector stride, - const std::vector padding, - const std::vector dilation) + tensor& ref_output_tensor, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) { - auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc); - auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc); - auto input_size = input_tensor.desc.GetSize(); - auto input_dims = input_tensor.desc.GetLengths(); + auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc); + auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc); + auto input_size = input_tensor.desc.GetSize(); + auto input_dims = input_tensor.desc.GetLengths(); - auto input = input_tensor.data.data(); - auto output = ref_output_tensor.data.data(); + auto input = input_tensor.data.data(); + auto output = ref_output_tensor.data.data(); const int LOCAL_SIZE = 256; int spatial_dim_size = input_size - 2; @@ -57,10 +57,11 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * 
padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; @@ -69,36 +70,40 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { - int ncp = gid / L, l = gid % L; - int nc = ncp / P, p = ncp % P; - int n = nc / C, c = nc % C; - if (n >= N) return; + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if(n >= N) + return; - int lh = l / LW, lw = l % LW; // sliding window position - int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel - int h = lh * stride_h - padding_h + ph * dilation_h; - int w = lw * stride_w - padding_w + pw * dilation_w; + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; - T x = static_cast(0.0f); - if (0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + input_tv.stride[0] * n; - x = input[input_idx]; - } + T x = static_cast(0.0f); + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } - long output_idx = output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - output[output_idx] = x; + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + output[output_idx] = x; }); } #endif diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index eee1e79fef..8900ea4827 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -51,15 +51,19 @@ struct UnfoldTestCase bool isContiguous = true; friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc) { - os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H - << " W:" << tc.W << " kernel_size:"; - for (const auto& ks : tc.kernelSize) os << ks << " "; + os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W + << " kernel_size:"; + for(const auto& ks : tc.kernelSize) + os << ks << " "; os << "stride:"; - for (const auto& s : tc.stride) os << s << " "; + for(const auto& s : tc.stride) + os << s << " "; os << "padding:"; - for (const auto& p : tc.padding) os << p << " "; + for(const auto& p : tc.padding) + os << p << " "; os << "dilation:"; - for (const auto& d : tc.dilation) os << d << " "; + for(const auto& d : tc.dilation) 
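// Worked example of the forward gather in cpu_unfold_fwd_4d above: output column
// l = lh * LW + lw and row c * P + (ph * kernel_size_w + pw) receive input pixel
// (h, w) = (lh * stride_h - padding_h + ph * dilation_h,
//           lw * stride_w - padding_w + pw * dilation_w); pixels that fall outside
// [0, H) x [0, W) (possible once padding > 0) are written as zero, which is why x is
// initialised to 0 before the bounds check.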
+ os << d << " "; os << "isContiguous:" << std::boolalpha << tc.isContiguous; return os; } @@ -128,8 +132,8 @@ struct UnfoldFwdTest : public ::testing::TestWithParam auto&& handle = get_handle(); config = GetParam(); - std::vector in_dims = config.GetInput(); - std::vector in_strides = config.ComputeStrides(in_dims); + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; auto gen_one = [&](auto...) { return 1; }; @@ -137,29 +141,29 @@ struct UnfoldFwdTest : public ::testing::TestWithParam input = tensor{in_dims, in_strides}.generate(gen_value); int spatial_dim_size = in_dims.size() - 2; - const int32_t N = static_cast(in_dims[0]); - const int32_t C = static_cast(in_dims[1]); + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= config.kernelSize[i]; int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - - config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / config.stride[i] + 1; L *= l; ls.push_back(l); } - std::vector out_dims{static_cast(N), - static_cast(C * P), - static_cast(L)}; + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; output = tensor{out_dims}.generate(gen_zero); outputHost = tensor{out_dims}.generate(gen_zero); - input_dev = handle.Write(input.data); - output_dev = handle.Write(output.data); + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); } void RunTest() @@ -168,25 +172,21 @@ struct UnfoldFwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::UnfoldForward(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); - - cpu_unfold_fwd_4d(input, - outputHost, - config.kernelSize, - config.stride, - config.padding, - config.dilation); + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_fwd_4d( + input, outputHost, config.kernelSize, config.stride, config.padding, config.dilation); EXPECT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -201,10 +201,9 @@ struct UnfoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
if(std::is_same::value) tolerance *= 8.0; - auto error_output = miopen::rms_range(outputHost, output); - EXPECT_TRUE(error_output < tolerance) - << "Error forward output beyond tolerance Error: {" << error_output - << "}, Tolerance: " << tolerance; + auto error_output = miopen::rms_range(outputHost, output); + EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" + << error_output << "}, Tolerance: " << tolerance; } UnfoldTestCase config; From 0f15ed504f7c49570ec0b408909088389a3e380b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 4 Jul 2024 03:24:18 +0000 Subject: [PATCH 03/46] githook format --- driver/unfold_driver.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index cfa25d3a85..1c7fb75bbf 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -134,13 +134,13 @@ int UnfoldDriver::GetandSetData() { std::vector input_length = GetTensorLengthsFromCmdLine(); - kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); - stride = GetVectorInt32tFromCmdLine("stride"); - padding = GetVectorInt32tFromCmdLine("padding"); - dilation = GetVectorInt32tFromCmdLine("dilation"); + kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); int spatial_dim_size = input_length.size() - 2; - const int N = input_length[0]; - const int C = input_length[1]; + const int N = input_length[0]; + const int C = input_length[1]; int P = 1, L = 1; std::vector ls; From 9b49b9dc66d2f32380af7db770ebe8dd7030ba73 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 4 Jul 2024 08:36:03 +0000 Subject: [PATCH 04/46] unfold backward driver and gtest --- driver/mloUnfoldHost.hpp | 79 ++++++++- driver/unfold_driver.hpp | 75 +++++++- include/miopen/miopen.h | 62 +++---- src/CMakeLists.txt | 1 + src/fold.cpp | 50 ++++++ src/fold/problem_description.cpp | 26 +++ src/fold_api.cpp | 31 ++++ src/include/miopen/fold.hpp | 14 ++ src/include/miopen/fold/invoke_params.hpp | 6 +- .../miopen/fold/problem_description.hpp | 118 ++++++------- src/include/miopen/fold/solvers.hpp | 15 ++ src/solver.cpp | 1 + src/solver/fold/unfold_backward.cpp | 167 ++++++++++++++++++ test/cpu_fold.hpp | 86 +++++++++ test/gtest/fold.cpp | 63 +++++++ test/gtest/fold.hpp | 94 ++++++++++ 16 files changed, 790 insertions(+), 98 deletions(-) create mode 100644 src/solver/fold/unfold_backward.cpp diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 7941eb63c4..466217feba 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -106,14 +106,85 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, } template -int32_t mloUnFoldBwd4DRunHost(Tgpu* input, - const miopenTensorDescriptor_t inputDesc, - Tcheck* ref_output, - const miopenTensorDescriptor_t ref_outputDesc, +int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, + const miopenTensorDescriptor_t dinputDesc, + Tgpu* doutput, + const miopenTensorDescriptor_t doutputDesc, const std::vector kernel_size, const std::vector stride, const std::vector padding, const std::vector dilation) { + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(dinputDesc)); + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(doutputDesc)); + auto input_grad_dims = miopen::deref(dinputDesc).GetLengths(); + auto input_size = miopen::deref(dinputDesc).GetSize(); + + const int LOCAL_SIZE = 256; + int 
spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + [[maybe_unused]] int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + float sum = 0.0f; + + for(int ph = 0; ph < kernel_size_h; ++ph) + { + for(int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; + sum += static_cast(doutput[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + ref_dinput[input_grad_idx] = static_cast(sum); + }); + return miopenStatusSuccess; } diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index 1c7fb75bbf..57f92b3423 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -109,7 +109,7 @@ class UnfoldDriver : public Driver std::vector output_host; - std::vector doutput_host; + std::vector dinput_host; std::vector kernel_size; std::vector stride; @@ -156,8 +156,8 @@ int UnfoldDriver::GetandSetData() std::vector output_length = {N, (C * P), L}; SetTensorNd(inputDesc, input_length, data_type); SetTensorNd(outputDesc, output_length, data_type); - SetTensorNd(doutputDesc, output_length, data_type); SetTensorNd(dinputDesc, input_length, data_type); + SetTensorNd(doutputDesc, output_length, data_type); return miopenStatusSuccess; } @@ -267,7 +267,7 @@ int UnfoldDriver::AllocateBuffersAndCopy() output_host = std::vector(output_sz, static_cast(0.0f)); - doutput_host = std::vector(doutput_sz, static_cast(0.0f)); + dinput_host = std::vector(dinput_sz, static_cast(0.0f)); int status; @@ -358,12 +358,67 @@ int UnfoldDriver::RunForwardCPU() template int UnfoldDriver::RunBackwardGPU() { + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenUnfoldBackward(GetHandle(), + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + 
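// Timing note: the loop accumulates per-iteration kernel time and keeps the first
// iteration separately so that the reported average,
//     kernel_average_time = (kernel_total_time - kernel_first_time) / (iter - 1),
// excludes the warm-up run (kernel compilation and first-touch costs) whenever
// iter > 1, matching the pattern used elsewhere in MIOpenDriver.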
kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Unfold Backward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Unfold Backward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + return miopenStatusSuccess; } template int UnfoldDriver::RunBackwardCPU() { + mloUnFoldBwd4DRunHost(dinput_host.data(), + inputDesc, + doutput.data(), + doutputDesc, + kernel_size, + stride, + padding, + dilation); return miopenStatusSuccess; } @@ -403,6 +458,20 @@ int UnfoldDriver::VerifyForward() template int UnfoldDriver::VerifyBackward() { + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward Unfold FAILED: {" << error_dinput << "} > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Backward Unfold Verifies OK on CPU reference ({" << error_dinput << "} < " + << tolerance << ')' << std::endl; + } return miopenStatusSuccess; } diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 45e8df42db..9fae26ed6e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6620,37 +6620,37 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const int32_t* dilation, const int dilation_size); -// /*! @brief Execute an unfold forward layer -// * -// * @param handle MIOpen handle (input) -// * @param inputDesc Tensor descriptor for data input tensor input (input) -// * @param input Data tensor input (input) -// * @param outputDesc Tensor descriptor for data output tensor output (output) -// * @param output Data tensor output (output) -// * @param kernel_size Size of the sliding box array (input) -// * @param kernel_size_size Size of the kernel_size array (input) -// * @param stride Stride array of the sliding box (input) -// * @param stride_size Size of the stride array (input) -// * @param padding Padding array to be added on input (input) -// * @param padding_size Size of the padding array (input) -// * @param dilation Dilation array control the stride of the elements within the -// neighborhood (input) -// * @param dilation_size Size of the dilation array (input) -// * @return miopenStatus_t -// */ -// MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, -// const miopenTensorDescriptor_t inputDesc, -// const void* input, -// const miopenTensorDescriptor_t outputDesc, -// void* output, -// const int32_t* kernel_size, -// const int kernel_size_size, -// const int32_t* stride, -// const int stride_size, -// const int32_t* padding, -// const int padding_size, -// const int32_t* dilation, -// const int dilation_size); +/*! 
@brief Execute an unfold backward layer + * + * @param handle MIOpen handle (input) + * @param dinputDesc Tensor descriptor for data input grad tensor (output) + * @param dinput Data tensor input grad (output) + * @param doutputDesc Tensor descriptor for data output grad tensor (input) + * @param doutput Data tensor output grad (input) + * @param kernel_size Size of the sliding box array (input) + * @param kernel_size_size Size of the kernel_size array (input) + * @param stride Stride array of the sliding box (input) + * @param stride_size Size of the stride array (input) + * @param padding Padding array to be added on input (input) + * @param padding_size Size of the padding array (input) + * @param dilation Dilation array control the stride of the elements within the + neighborhood (input) + * @param dilation_size Size of the dilation array (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); /** @} */ // CLOSEOUT FOLD DOXYGEN GROUP diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 34153587d3..ae2965b07c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -259,6 +259,7 @@ set( MIOpen_Source solver/conv_winoRxS_fused.cpp solver/fft.cpp solver/fold/unfold_forward.cpp + solver/fold/unfold_backward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp diff --git a/src/fold.cpp b/src/fold.cpp index d545c01964..1117cdc642 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -87,4 +87,54 @@ miopenStatus_t UnfoldForward(Handle& handle, return miopenStatusSuccess; } +miopenStatus_t UnfoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = fold::UnfoldBwdProblemDescription{dinputDesc, + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"UnfoldBwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + } // namespace miopen diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index 9a0c6ec921..d0ecf629e9 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -75,6 +75,32 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const return NetworkConfig{ss.str()}; } +NetworkConfig 
UnfoldBwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = dinputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Unfold_bwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_grad_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + } // namespace fold } // namespace miopen diff --git a/src/fold_api.cpp b/src/fold_api.cpp index 1e6c97ef83..cb50b194ea 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -61,3 +61,34 @@ extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, dilation_size); }); } + +extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::UnfoldBackward(miopen::deref(handle), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 33e879eb0a..7bb0cc946a 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -46,5 +46,19 @@ miopenStatus_t UnfoldForward(Handle& handle, const int padding_size, const int32_t* dilation, const int dilation_size); + +miopenStatus_t UnfoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index 318e312206..b256680d8e 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -41,10 +41,14 @@ struct InvokeParams : public miopen::InvokeParams const TensorDescriptor* inputDesc = nullptr; const TensorDescriptor* outputDesc = nullptr; - ConstData_t input = nullptr; Data_t output = nullptr; + const TensorDescriptor* dinputDesc = nullptr; + const TensorDescriptor* doutputDesc = nullptr; + Data_t dinput = nullptr; + ConstData_t doutput = nullptr; + const int32_t* kernel_size = nullptr; const int32_t* stride = nullptr; const int32_t* padding = nullptr; diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 938abe6dae..9e4e5b427f 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -42,65 +42,6 @@ namespace fold { bool checkSameLength(const TensorDescriptor& x, const 
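// For illustration, MakeNetworkConfig serialises every field that selects or shapes
// the generated kernel into the find-db cache key. Assuming a float backward problem
// with dinput {2, 3, 4, 4}, a 2x2 kernel, stride 1, padding 0 and dilation 1, the key
// would look roughly like
//     Unfold_bwdi_dtype1o_dtype1size96in_grad_dims_2_3_4_4kernel_size_2_2stride_1_1padding_0_0dilation_1_1
// (exact dtype rendering aside); two problems that differ in any of these fields can
// then never share a cached solution.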
TensorDescriptor& y); -// struct FoldFwdProblemDescription : ProblemDescriptionBase -// { -// FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, -// const TensorDescriptor& outputDesc_, -// const int32_t* kernel_size_, -// const int kernel_size_size_, -// const int32_t* stride_, -// const int stride_size_, -// const int32_t* padding_, -// const int padding_size_, -// const int32_t* dilation_, -// const int dilation_size_) -// : inputDesc(inputDesc_), -// outputDesc(outputDesc_), -// kernel_size(kernel_size_), -// kernel_size_size(kernel_size_size_), -// stride(stride_), -// stride_size(stride_size_), -// padding(padding_), -// padding_size(padding_size_), -// dilation(dilation_), -// dilation_size(dilation_size_) -// { -// // IsValidSize(); -// } - -// // bool IsValidSize() const -// // { -// // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) -// // { -// // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG -// // MIOPEN_THROW(miopenStatusBadParm, -// // "Instance Norm: The input tensor dimension should be in range [2, -// 5]."); -// // #else -// // return false; -// // #endif -// // } -// // return true; -// // } - -// const TensorDescriptor& GetInputDesc() const { return inputDesc; } -// const TensorDescriptor& GetOutputDesc() const { return outputDesc; } - -// NetworkConfig MakeNetworkConfig() const override; - -// public: -// TensorDescriptor inputDesc; -// TensorDescriptor outputDesc; -// const int32_t* kernel_size; -// const int kernel_size_size; -// const int32_t* stride; -// const int stride_size; -// const int32_t* padding; -// const int padding_size; -// const int32_t* dilation; -// const int dilation_size; -// }; - struct UnfoldFwdProblemDescription : ProblemDescriptionBase { UnfoldFwdProblemDescription(const TensorDescriptor& inputDesc_, @@ -160,6 +101,65 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase const int dilation_size; }; +struct UnfoldBwdProblemDescription : ProblemDescriptionBase +{ + UnfoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, + const TensorDescriptor& doutputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : dinputDesc(dinputDesc_), + doutputDesc(doutputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + // IsValidSize(); + } + + // bool IsValidSize() const + // { + // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) + // { + // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + // MIOPEN_THROW(miopenStatusBadParm, + // "Instance Norm: The input tensor dimension should be in range [2, + // 5]."); + // #else + // return false; + // #endif + // } + // return true; + // } + + const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } + const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor dinputDesc; + TensorDescriptor doutputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + } // namespace fold } // namespace miopen diff --git a/src/include/miopen/fold/solvers.hpp 
b/src/include/miopen/fold/solvers.hpp index 0d2cbe282f..d463bb0251 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -67,6 +67,21 @@ struct UnfoldFwd final : UnfoldFwdSolverBase const miopen::fold::UnfoldFwdProblemDescription& problem) const override; }; +using UnfoldBwdSolverBase = + NonTunableSolverBase; + +struct UnfoldBwd final : UnfoldBwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::UnfoldBwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::UnfoldBwdProblemDescription& problem) const override; +}; + } // namespace fold } // namespace solver diff --git a/src/solver.cpp b/src/solver.cpp index 97fa4637f3..8e3d5afcb3 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -651,6 +651,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp new file mode 100644 index 0000000000..c8613d9cca --- /dev/null +++ b/src/solver/fold/unfold_backward.cpp @@ -0,0 +1,167 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
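// Structurally, UnfoldBwd follows the same non-tunable solver pattern as UnfoldFwd:
// IsApplicable gates the problem, GetSolution emits a single KernelInfo plus an
// invoker factory, and the SolverDbId registered in solver.cpp identifies it in the
// find database. Ids are assigned by ++id at registration time, which is why the
// solver.cpp hunk appends the new entry rather than inserting it mid-list.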
+ * + *******************************************************************************/ + +#include "miopen/fold/problem_description.hpp" +#include "miopen/miopen.h" +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool UnfoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::UnfoldBwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::UnfoldBwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); + + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + + { + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldBackward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * H * W, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.dinputDesc)); + auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.doutputDesc)); + auto input_grad_dims = deref(params.dinputDesc).GetLengths(); + auto output_grad_dims = deref(params.doutputDesc).GetLengths(); + + int spatial_dim_size = input_grad_dims.size() - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= params.kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH 
= ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + + kernel(params.doutput, + params.dinput, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 030d5722d2..4c9427f2d4 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -106,4 +106,90 @@ void cpu_unfold_fwd_4d(tensor input_tensor, output[output_idx] = x; }); } + +template +void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, + tensor doutput_tensor, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) +{ + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(ref_dinput_tensor.desc); + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(doutput_tensor.desc); + auto input_size = ref_dinput_tensor.desc.GetSize(); + auto input_grad_dims = ref_dinput_tensor.desc.GetLengths(); + + auto input_grad = ref_dinput_tensor.data.data(); + auto output_grad = doutput_tensor.data.data(); + + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + float sum = 0.0f; + + for(int ph = 0; ph < kernel_size_h; ++ph) + { + for(int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; + sum += static_cast(output_grad[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad[input_grad_idx] = static_cast(sum); + }); +} #endif diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index d1843ae3c8..0e9fe9ddd8 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -43,6 +43,18 @@ struct UnfoldForwardTestFloat16 : UnfoldFwdTest struct UnfoldForwardTestBFloat16 : UnfoldFwdTest { }; + +struct 
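// Design note shared by the UnfoldBackward4D kernel and cpu_unfold_bwd_4d above:
// parallelising over input-gradient elements (one thread per (n, c, h, w)) turns what
// would otherwise be an atomic scatter, with each output column updating P input
// pixels, into a race-free gather in which every input_grad element is written by
// exactly one thread.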
UnfoldBackwardTestFloat32 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +{ +}; }; // namespace fold using namespace fold; TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) @@ -95,3 +107,54 @@ TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, UnfoldForwardTestBFloat16, testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat32, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestBFloat16, + testing::ValuesIn(UnfoldTestConfigs())); diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 8900ea4827..150edf0a47 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -215,3 +215,97 @@ struct UnfoldFwdTest : public ::testing::TestWithParam miopen::Allocator::ManageDataPtr input_dev; miopen::Allocator::ManageDataPtr output_dev; }; + + +template +struct UnfoldBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); + + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) 
{ return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + int spatial_dim_size = in_dims.size() - 2; + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= config.kernelSize[i]; + int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; + + doutput = tensor{out_dims}.generate(gen_value); + + dinput_dev = handle.Write(dinput.data); + doutput_dev = handle.Write(doutput.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::UnfoldBackward(handle, + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_bwd_4d( + dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + auto error_dinput = miopen::rms_range(dinputHost, dinput); + EXPECT_TRUE(error_dinput < tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; + } + UnfoldTestCase config; + + tensor dinput; + tensor doutput; + + tensor dinputHost; + + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr doutput_dev; +}; From 83fba5690843a85a8533d1647736bf32e10b17fa Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 4 Jul 2024 08:36:22 +0000 Subject: [PATCH 05/46] githook format --- driver/mloUnfoldHost.hpp | 45 ++++++++++----------- driver/unfold_driver.hpp | 41 +++++++++---------- include/miopen/miopen.h | 24 ++++++------ src/fold.cpp | 32 +++++++-------- src/fold_api.cpp | 48 +++++++++++------------ src/include/miopen/fold.hpp | 24 ++++++------ src/include/miopen/fold/invoke_params.hpp | 8 ++-- src/solver/fold/unfold_backward.cpp | 14 +++---- test/cpu_fold.hpp | 11 +++--- test/gtest/fold.hpp | 36 ++++++++--------- 10 files changed, 143 insertions(+), 140 deletions(-) diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 466217feba..fcfd5f4a6b 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -118,12 +118,12 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(dinputDesc)); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(doutputDesc)); auto input_grad_dims = miopen::deref(dinputDesc).GetLengths(); - auto input_size = miopen::deref(dinputDesc).GetSize(); + auto input_size = miopen::deref(dinputDesc).GetSize(); - const int LOCAL_SIZE = 256; - int spatial_dim_size = input_size - 2; - const int32_t N = 
static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); [[maybe_unused]] int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) @@ -136,19 +136,19 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, L *= l; ls.push_back(l); } - int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; - int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_grad_dims[2]); - int32_t W = static_cast(input_grad_dims[3]); - int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H * W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int nch = gid / W, w = gid % W; int nc = nch / H, h = nch % H; @@ -174,15 +174,16 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, continue; if(lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + - output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + - output_grad_tv.stride[0] * n; + long output_grad_idx = + output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; sum += static_cast(doutput[output_grad_idx]); } } long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + - input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; ref_dinput[input_grad_idx] = static_cast(sum); }); diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index 57f92b3423..d565d192f5 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -367,18 +367,18 @@ int UnfoldDriver::RunBackwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenUnfoldBackward(GetHandle(), - dinputDesc, - dinput_dev->GetMem(), - doutputDesc, - doutput_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -412,13 +412,13 @@ template int UnfoldDriver::RunBackwardCPU() { mloUnFoldBwd4DRunHost(dinput_host.data(), - inputDesc, - doutput.data(), - doutputDesc, - kernel_size, - stride, - padding, - dilation); + inputDesc, + doutput.data(), + doutputDesc, + kernel_size, + stride, + padding, + dilation); return miopenStatusSuccess; } @@ -464,7 +464,8 @@ int UnfoldDriver::VerifyBackward() 
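// The backward check mirrors VerifyForward: the RMS distance between host and GPU
// gradients must be finite and below GetTolerance(). The std::isfinite guard matters
// because a NaN error would compare false against the tolerance and otherwise report
// success.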
if(!std::isfinite(error_dinput) || error_dinput > tolerance) { - std::cout << "Backward Unfold FAILED: {" << error_dinput << "} > " << tolerance << std::endl; + std::cout << "Backward Unfold FAILED: {" << error_dinput << "} > " << tolerance + << std::endl; return EC_VerifyFwd; } else diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 9fae26ed6e..56633f57c2 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6639,18 +6639,18 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); /** @} */ // CLOSEOUT FOLD DOXYGEN GROUP diff --git a/src/fold.cpp b/src/fold.cpp index 1117cdc642..8a028d379e 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -88,18 +88,18 @@ miopenStatus_t UnfoldForward(Handle& handle, } miopenStatus_t UnfoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { const auto problem = fold::UnfoldBwdProblemDescription{dinputDesc, doutputDesc, @@ -115,10 +115,10 @@ miopenStatus_t UnfoldBackward(Handle& handle, const auto invoke_params = [&]() { auto tmp = fold::InvokeParams{}; tmp.type = InvokeType::Run; - tmp.dinputDesc = &dinputDesc; - tmp.doutputDesc = &doutputDesc; - tmp.dinput = dinput; - tmp.doutput = doutput; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; tmp.kernel_size = kernel_size; tmp.stride = stride; tmp.padding = padding; diff --git a/src/fold_api.cpp b/src/fold_api.cpp index cb50b194ea..ba9f2fd805 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -63,32 +63,32 @@ extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, } extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t 
doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::UnfoldBackward(miopen::deref(handle), - miopen::deref(dinputDesc), - DataCast(dinput), - miopen::deref(doutputDesc), - DataCast(doutput), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 7bb0cc946a..040bb681ea 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -48,17 +48,17 @@ miopenStatus_t UnfoldForward(Handle& handle, const int dilation_size); miopenStatus_t UnfoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index b256680d8e..da89023f17 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -41,13 +41,13 @@ struct InvokeParams : public miopen::InvokeParams const TensorDescriptor* inputDesc = nullptr; const TensorDescriptor* outputDesc = nullptr; - ConstData_t input = nullptr; - Data_t output = nullptr; + ConstData_t input = nullptr; + Data_t output = nullptr; const TensorDescriptor* dinputDesc = nullptr; const TensorDescriptor* doutputDesc = nullptr; - Data_t dinput = nullptr; - ConstData_t doutput = nullptr; + Data_t dinput = nullptr; + ConstData_t doutput = nullptr; const int32_t* kernel_size = nullptr; const int32_t* stride = nullptr; diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index c8613d9cca..249f08592c 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -54,15 +54,15 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; - auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); - auto dtype = problem.GetDoutputDesc().GetType(); - auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); - const int32_t N = static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); - int32_t H = 
static_cast(input_grad_dims[2]); - int32_t W = static_cast(input_grad_dims[3]); + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); { auto kernel = KernelInfo{}; diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 4c9427f2d4..46f7552083 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -117,7 +117,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, { auto input_grad_tv = miopen::get_inner_expanded_tv<4>(ref_dinput_tensor.desc); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(doutput_tensor.desc); - auto input_size = ref_dinput_tensor.desc.GetSize(); + auto input_size = ref_dinput_tensor.desc.GetSize(); auto input_grad_dims = ref_dinput_tensor.desc.GetLengths(); auto input_grad = ref_dinput_tensor.data.data(); @@ -180,15 +180,16 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, continue; if(lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + - output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + - output_grad_tv.stride[0] * n; + long output_grad_idx = + output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; sum += static_cast(output_grad[output_grad_idx]); } } long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + - input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; input_grad[input_grad_idx] = static_cast(sum); }); } diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 150edf0a47..f15c5b6a5f 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -216,7 +216,6 @@ struct UnfoldFwdTest : public ::testing::TestWithParam miopen::Allocator::ManageDataPtr output_dev; }; - template struct UnfoldBwdTest : public ::testing::TestWithParam { @@ -232,8 +231,8 @@ struct UnfoldBwdTest : public ::testing::TestWithParam auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) 
{ return 0; }; - dinput = tensor{in_dims, in_strides}.generate(gen_zero); - dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); @@ -254,7 +253,7 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector out_dims{ static_cast(N), static_cast(C * P), static_cast(L)}; - doutput = tensor{out_dims}.generate(gen_value); + doutput = tensor{out_dims}.generate(gen_value); dinput_dev = handle.Write(dinput.data); doutput_dev = handle.Write(doutput.data); @@ -266,18 +265,18 @@ struct UnfoldBwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::UnfoldBackward(handle, - dinput.desc, - dinput_dev.get(), - doutput.desc, - doutput_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); cpu_unfold_bwd_4d( dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); @@ -296,8 +295,9 @@ struct UnfoldBwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_dinput = miopen::rms_range(dinputHost, dinput); - EXPECT_TRUE(error_dinput < tolerance) << "Error backward input_grad beyond tolerance Error: {" - << error_dinput << "}, Tolerance: " << tolerance; + EXPECT_TRUE(error_dinput < tolerance) + << "Error backward input_grad beyond tolerance Error: {" << error_dinput + << "}, Tolerance: " << tolerance; } UnfoldTestCase config; From 101794f07ed0d67386e391cc5b5a073dd2ddc330 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 10 Jul 2024 07:43:30 +0000 Subject: [PATCH 06/46] Add foldfwd, foldbwd, problem_description verification, gtest and driver --- driver/CMakeLists.txt | 1 + driver/dm_fold.cpp | 39 ++ driver/driver.hpp | 4 +- driver/fold_driver.hpp | 474 ++++++++++++++++++ include/miopen/miopen.h | 64 +++ src/CMakeLists.txt | 2 + src/fold.cpp | 100 ++++ src/fold/problem_description.cpp | 80 ++- src/fold_api.cpp | 62 +++ src/include/miopen/fold.hpp | 28 ++ .../miopen/fold/problem_description.hpp | 395 +++++++++++++-- src/include/miopen/fold/solvers.hpp | 46 +- src/solver.cpp | 3 +- src/solver/fold/fold_backward.cpp | 178 +++++++ src/solver/fold/fold_forward.cpp | 4 +- test/cpu_fold.hpp | 6 +- test/gtest/fold.cpp | 60 +-- test/gtest/fold.hpp | 90 ++-- test/gtest/unfold.cpp | 160 ++++++ test/gtest/unfold.hpp | 311 ++++++++++++ 20 files changed, 1957 insertions(+), 150 deletions(-) create mode 100644 src/solver/fold/fold_backward.cpp create mode 100644 test/gtest/unfold.cpp create mode 100644 test/gtest/unfold.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 8ca4ccd5c1..c115cf435f 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -52,6 +52,7 @@ add_executable(MIOpenDriver dm_sum.cpp dm_tensorop.cpp dm_unfold.cpp + dm_fold.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/dm_fold.cpp b/driver/dm_fold.cpp 
b/driver/dm_fold.cpp
index e69de29bb2..d7a8e2cb9a 100644
--- a/driver/dm_fold.cpp
+++ b/driver/dm_fold.cpp
@@ -0,0 +1,39 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "registry_driver_maker.hpp"
+#include "fold_driver.hpp"
+static Driver* makeDriver(const std::string& base_arg)
+{
+    if(base_arg == "fold")
+        return new FoldDriver<float, float>();
+    if(base_arg == "foldfp16")
+        return new FoldDriver<float16, float>();
+    if(base_arg == "foldbfp16")
+        return new FoldDriver<bfloat16, float>();
+    return nullptr;
+}
+
+REGISTER_DRIVER_MAKER(makeDriver);
diff --git a/driver/driver.hpp b/driver/driver.hpp
index a7396d272f..f26d7053f3 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -151,7 +151,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz)
         "pool[fp16], lrn[fp16], "
         "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], "
         "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], "
-        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16]\n");
+        "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n");
     exit(0); // NOLINT (concurrency-mt-unsafe)
 }
 
@@ -177,7 +177,7 @@ inline std::string ParseBaseArg(int argc, char* argv[])
        arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" &&
        arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
        arg != "catfp16" && arg != "catbfp16" && arg != "unfold" && arg != "unfoldfp16" &&
-       arg != "unfoldbfp16" && arg != "--version")
+       arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && arg != "--version")
     {
         printf("FAILED: Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp
index e69de29bb2..117538452e 100644
--- a/driver/fold_driver.hpp
+++ b/driver/fold_driver.hpp
@@ -0,0 +1,474 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#ifndef GUARD_MIOPEN_FOLD_DRIVER_HPP
+#define GUARD_MIOPEN_FOLD_DRIVER_HPP
+
+#include "InputFlags.hpp"
+#include "driver.hpp"
+#include "mloUnfoldHost.hpp"
+#include "random.hpp"
+#include "tensor_driver.hpp"
+#include "timer.hpp"
+#include "util_driver.hpp"
+
+#include <../test/tensor_holder.hpp>
+#include <../test/verify.hpp>
+
+#include
+#include
+#include
+#include
+#include
+
+template <typename Tgpu, typename Tref>
+class FoldDriver : public Driver
+{
+public:
+    FoldDriver() : Driver()
+    {
+        miopenCreateTensorDescriptor(&inputDesc);
+        miopenCreateTensorDescriptor(&outputDesc);
+        miopenCreateTensorDescriptor(&dinputDesc);
+        miopenCreateTensorDescriptor(&doutputDesc);
+
+        data_type = miopen_type<Tgpu>{};
+    }
+
+    int AddCmdLineArgs() override;
+    int ParseCmdLineArgs(int argc, char* argv[]) override;
+    InputFlags& GetInputFlags() override { return inflags; }
+
+    int GetandSetData() override;
+    std::vector<int> GetTensorLengthsFromCmdLine();
+    std::vector<int32_t> GetVectorInt32tFromCmdLine(std::string long_name);
+
+    int AllocateBuffersAndCopy() override;
+
+    int RunForwardGPU() override;
+    int RunForwardCPU();
+
+    int RunBackwardGPU() override;
+    int RunBackwardCPU();
+
+    Tref GetTolerance();
+    int VerifyBackward() override;
+    int VerifyForward() override;
+    ~FoldDriver() override
+    {
+        miopenDestroyTensorDescriptor(inputDesc);
+        miopenDestroyTensorDescriptor(outputDesc);
+        miopenDestroyTensorDescriptor(dinputDesc);
+        miopenDestroyTensorDescriptor(doutputDesc);
+    }
+
+private:
+    InputFlags inflags;
+
+    int forw;
+
+    miopenTensorDescriptor_t inputDesc;
+    miopenTensorDescriptor_t outputDesc;
+
+    miopenTensorDescriptor_t doutputDesc;
+    miopenTensorDescriptor_t dinputDesc;
+
+    std::unique_ptr<GPUMem> input_dev;
+    std::unique_ptr<GPUMem> output_dev;
+
+    std::unique_ptr<GPUMem> doutput_dev;
+    std::unique_ptr<GPUMem> dinput_dev;
+
+    std::vector<Tgpu> input;
+    std::vector<Tgpu> output;
+
+    std::vector<Tgpu> doutput;
+    std::vector<Tgpu> dinput;
+
+    std::vector<Tref> output_host;
+
+    std::vector<Tref> dinput_host;
+
+    std::vector<int32_t> output_size;
+    std::vector<int32_t> kernel_size;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+};
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::ParseCmdLineArgs(int argc, char* argv[])
+{
+    inflags.Parse(argc, argv);
+
+    if(inflags.GetValueInt("time") == 1)
+    {
+        miopenEnableProfiling(GetHandle(), true);
+    }
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::GetandSetData()
+{
+    std::vector<int> input_length = GetTensorLengthsFromCmdLine();
+
+    output_size = GetVectorInt32tFromCmdLine("outputSize");
+    kernel_size = GetVectorInt32tFromCmdLine("kernelSize");
+    stride      = GetVectorInt32tFromCmdLine("stride");
+    padding     = GetVectorInt32tFromCmdLine("padding");
+    dilation    = GetVectorInt32tFromCmdLine("dilation");
+    const int N = input_length[0];
+    int C       = input_length[1];
+    for(int32_t i : kernel_size)
+    {
+        C = C / i;
+    }
+
+    std::vector<int> output_length = {N, C, output_size[0], output_size[1]};
+    SetTensorNd(inputDesc, input_length, data_type);
+    SetTensorNd(outputDesc, output_length, data_type);
+    SetTensorNd(dinputDesc, input_length, data_type);
+    SetTensorNd(doutputDesc, output_length, data_type);
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::AddCmdLineArgs()
+{
+    inflags.AddInputFlag(
+        "forw", 'F', "1", "Run Fold Forward (Default=1) or both Forward and Backward (0)", "int");
+    inflags.AddInputFlag(
+        "DimLengths", 'D', "3,12,12", "The dimensional lengths of the input tensor", "string");
+    inflags.AddInputFlag("outputSize", 'o', "4,5", "Output Size (Default=4,5)", "str");
+    inflags.AddInputFlag("kernelSize", 'k', "2,2", "Kernel Size (Default=2,2)", "str");
+    inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str");
+    inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str");
+    inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str");
+    inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
+    inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int");
+    inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
+    inflags.AddInputFlag(
+        "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int");
+
+    return miopenStatusSuccess;
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int> FoldDriver<Tgpu, Tref>::GetTensorLengthsFromCmdLine()
+{
+    std::string lengthsStr = inflags.GetValueStr("DimLengths");
+
+    std::vector<int> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(len);
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(len);
+
+    return (lengths);
+}
+
+template <typename Tgpu, typename Tref>
+std::vector<int32_t> FoldDriver<Tgpu, Tref>::GetVectorInt32tFromCmdLine(std::string long_name)
+{
+    std::string lengthsStr = inflags.GetValueStr(long_name);
+
+    std::vector<int32_t> lengths;
+    std::size_t pos = 0;
+    std::size_t new_pos;
+
+    new_pos = lengthsStr.find(',', pos);
+    while(new_pos != std::string::npos)
+    {
+        std::string sliceStr = lengthsStr.substr(pos, new_pos - pos);
+
+        int len = std::stoi(sliceStr);
+
+        lengths.push_back(static_cast<int32_t>(len));
+
+        pos     = new_pos + 1;
+        new_pos = lengthsStr.find(',', pos);
+    };
+
+    std::string sliceStr = lengthsStr.substr(pos);
+    int len              = std::stoi(sliceStr);
+
+    lengths.push_back(static_cast<int32_t>(len));
+
+    return (lengths);
+}
+
+template <typename Tgpu, typename Tref>
+int FoldDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
+{
+    size_t input_sz  = GetTensorSize(inputDesc);
+    size_t output_sz = GetTensorSize(outputDesc);
+
+    size_t doutput_sz = GetTensorSize(doutputDesc);
+    size_t dinput_sz  = GetTensorSize(dinputDesc);
+
+    uint32_t ctx = 0;
+
+    input_dev  = std::unique_ptr<GPUMem>(new GPUMem(ctx, input_sz, sizeof(Tgpu)));
+    output_dev = std::unique_ptr<GPUMem>(new GPUMem(ctx, output_sz, sizeof(Tgpu)));
+
+    doutput_dev = std::unique_ptr<GPUMem>(new
GPUMem(ctx, doutput_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, dinput_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0.0f)); + output = std::vector(output_sz, static_cast(0.0f)); + + doutput = std::vector(doutput_sz, static_cast(1.0f)); + dinput = std::vector(dinput_sz, static_cast(0.0f)); + + output_host = std::vector(output_sz, static_cast(0.0f)); + + dinput_host = std::vector(dinput_sz, static_cast(0.0f)); + + int status; + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + status = input_dev->ToGPU(GetStream(), input.data()); + + for(int i = 0; i < doutput_sz; i++) + { + doutput[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + } + status |= doutput_dev->ToGPU(GetStream(), doutput.data()); + status |= dinput_dev->ToGPU(GetStream(), dinput.data()); + + if(status != 0) + std::cout << "Fold Driver Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenFoldForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Fold Forward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Fold Forward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunForwardCPU() +{ + mloUnFoldBwd4DRunHost(output_host.data(), + outputDesc, + input.data(), + inputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +int FoldDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenFoldBackward(GetHandle(), + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Fold Backward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Fold Backward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunBackwardCPU() +{ + mloUnFoldFwd4DRunHost(doutput.data(), + doutputDesc, + dinput_host.data(), + dinputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +Tref FoldDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int FoldDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_output = miopen::rms_range(output_host, output); + + if(!std::isfinite(error_output) || error_output > tolerance) + { + std::cout << "Forward Fold FAILED: {" << error_output << "} > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward Fold Verifies OK on CPU reference ({" << error_output << "} < " + << tolerance << ')' << std::endl; + } + return miopenStatusSuccess; +} + +template +int FoldDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward Fold FAILED: {" << error_dinput << "} > " << tolerance + << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Backward Fold Verifies OK on CPU reference ({" << error_dinput << "} < " + << tolerance << ')' << std::endl; + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_FOLD_DRIVER_HPP diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 56633f57c2..a45ece12fe 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6588,6 +6588,70 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * * @{ */ +/*! 
@brief Execute a fold forward layer
+ *
+ * @param handle           MIOpen handle (input)
+ * @param inputDesc        Tensor descriptor for data input tensor input (input)
+ * @param input            Data tensor input (input)
+ * @param outputDesc       Tensor descriptor for data output tensor output (output)
+ * @param output           Data tensor output (output)
+ * @param kernel_size      Size of the sliding box array (input)
+ * @param kernel_size_size Size of the kernel_size array (input)
+ * @param stride           Stride array of the sliding box (input)
+ * @param stride_size      Size of the stride array (input)
+ * @param padding          Padding array to be added on input (input)
+ * @param padding_size     Size of the padding array (input)
+ * @param dilation         Dilation array controlling the stride of the elements within the
+ *                         neighborhood (input)
+ * @param dilation_size    Size of the dilation array (input)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle,
+                                               const miopenTensorDescriptor_t inputDesc,
+                                               const void* input,
+                                               const miopenTensorDescriptor_t outputDesc,
+                                               void* output,
+                                               const int32_t* kernel_size,
+                                               const int kernel_size_size,
+                                               const int32_t* stride,
+                                               const int stride_size,
+                                               const int32_t* padding,
+                                               const int padding_size,
+                                               const int32_t* dilation,
+                                               const int dilation_size);
+
+/*! @brief Execute a fold backward layer
+ *
+ * @param handle           MIOpen handle (input)
+ * @param dinputDesc       Tensor descriptor for data input grad tensor (output)
+ * @param dinput           Data tensor input grad (output)
+ * @param doutputDesc      Tensor descriptor for data output grad tensor (input)
+ * @param doutput          Data tensor output grad (input)
+ * @param kernel_size      Size of the sliding box array (input)
+ * @param kernel_size_size Size of the kernel_size array (input)
+ * @param stride           Stride array of the sliding box (input)
+ * @param stride_size      Size of the stride array (input)
+ * @param padding          Padding array to be added on input (input)
+ * @param padding_size     Size of the padding array (input)
+ * @param dilation         Dilation array controlling the stride of the elements within the
+ *                         neighborhood (input)
+ * @param dilation_size    Size of the dilation array (input)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle,
+                                                const miopenTensorDescriptor_t dinputDesc,
+                                                void* dinput,
+                                                const miopenTensorDescriptor_t doutputDesc,
+                                                const void* doutput,
+                                                const int32_t* kernel_size,
+                                                const int kernel_size_size,
+                                                const int32_t* stride,
+                                                const int stride_size,
+                                                const int32_t* padding,
+                                                const int padding_size,
+                                                const int32_t* dilation,
+                                                const int dilation_size);
+
 /*!
@brief Execute an unfold forward layer * * @param handle MIOpen handle (input) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ae2965b07c..bd057795a3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -260,6 +260,8 @@ set( MIOpen_Source solver/fft.cpp solver/fold/unfold_forward.cpp solver/fold/unfold_backward.cpp + solver/fold/fold_forward.cpp + solver/fold/fold_backward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp diff --git a/src/fold.cpp b/src/fold.cpp index 8a028d379e..d2ff285af1 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -137,4 +137,104 @@ miopenStatus_t UnfoldBackward(Handle& handle, return miopenStatusSuccess; } +miopenStatus_t FoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = fold::FoldFwdProblemDescription{inputDesc, + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"FoldFwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t FoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + const auto problem = fold::FoldBwdProblemDescription{dinputDesc, + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"FoldBwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + } // namespace miopen diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index d0ecf629e9..39202fd372 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -33,22 +33,6 @@ namespace miopen { namespace fold { -// NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const -// { 
-// auto input_dtype = inputDesc.GetType(); -// auto output_dtype = outputDesc.GetType(); -// auto size = inputDesc.GetElementSize(); - -// std::ostringstream ss; - -// ss << "fold_fwd"; -// ss << "i_dtype" << input_dtype; -// ss << "o_dtype" << output_dtype; -// ss << "size" << size; - -// return NetworkConfig{ss.str()}; -// } - NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = inputDesc.GetType(); @@ -101,6 +85,70 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const return NetworkConfig{ss.str()}; } +NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto output_dtype = outputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + auto out_dims = outputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Fold_fwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "out_dims"; + for (auto val: out_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = dinputDesc.GetType(); + auto output_dtype = doutputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + auto out_dims = doutputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Fold_bwd"; + ss << "i_dtype" << input_dtype; + ss << "o_dtype" << output_dtype; + ss << "size" << size; + ss << "in_grad_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "out_grad_dims"; + for (auto val: out_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + } // namespace fold } // namespace miopen diff --git a/src/fold_api.cpp b/src/fold_api.cpp index ba9f2fd805..fb22fa90b4 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -92,3 +92,65 @@ extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, dilation_size); }); } + +extern "C" miopenStatus_t miopenFoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::FoldForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} + +extern "C" miopenStatus_t miopenFoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + 
const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) +{ + return miopen::try_([&] { + miopen::FoldBackward(miopen::deref(handle), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 040bb681ea..d94a42ee5a 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -60,5 +60,33 @@ miopenStatus_t UnfoldBackward(Handle& handle, const int padding_size, const int32_t* dilation, const int dilation_size); + +miopenStatus_t FoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); + +miopenStatus_t FoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 9e4e5b427f..f89a90eac2 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -65,23 +65,77 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase dilation(dilation_), dilation_size(dilation_size_) { - // IsValidSize(); + IsValidSize(); + IsValidType(); } - // bool IsValidSize() const - // { - // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) - // { - // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - // MIOPEN_THROW(miopenStatusBadParm, - // "Instance Norm: The input tensor dimension should be in range [2, - // 5]."); - // #else - // return false; - // #endif - // } - // return true; - // } + bool IsValidSize() const + { + if(inputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = inputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Argument length should be 2D"); +#else + return false; +#endif + } + auto input_dims = inputDesc.GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector output_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + auto output_dims = outputDesc.GetLengths(); + if (output_dims != output_dims_desired) + { +#if 
MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Invalid output dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (inputDesc.GetType() != outputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input tensor and output tensor has mismatch type."); +#else + return false; +#endif + } + return true; + } const TensorDescriptor& GetInputDesc() const { return inputDesc; } const TensorDescriptor& GetOutputDesc() const { return outputDesc; } @@ -124,23 +178,304 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase dilation(dilation_), dilation_size(dilation_size_) { - // IsValidSize(); + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(dinputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input gradient tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = dinputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Argument length should be 2D"); +#else + return false; +#endif + } + auto input_dims = dinputDesc.GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector output_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + auto output_dims = doutputDesc.GetLengths(); + if (output_dims != output_dims_desired) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: Invalid output gradient dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (dinputDesc.GetType() != doutputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Unfold: The input gradient tensor and output gradient tensor has mismatch type."); +#else + return false; +#endif + } + return true; + } + + const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } + const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor dinputDesc; + TensorDescriptor doutputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + +struct FoldFwdProblemDescription : ProblemDescriptionBase +{ + FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : inputDesc(inputDesc_), + outputDesc(outputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + 
padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(outputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The output tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = outputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Argument length should be 2D"); +#else + return false; +#endif + } + auto input_dims = inputDesc.GetLengths(); + auto output_dims = outputDesc.GetLengths(); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector input_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + if (input_dims != input_dims_desired) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Invalid input dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (inputDesc.GetType() != outputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The input tensor and output tensor has mismatch type."); +#else + return false; +#endif + } + return true; + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; + const int32_t* kernel_size; + const int kernel_size_size; + const int32_t* stride; + const int stride_size; + const int32_t* padding; + const int padding_size; + const int32_t* dilation; + const int dilation_size; +}; + +struct FoldBwdProblemDescription : ProblemDescriptionBase +{ + FoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, + const TensorDescriptor& doutputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) + : dinputDesc(dinputDesc_), + doutputDesc(doutputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(doutputDesc.GetSize() != 4) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The output gradient tensor should be 4D."); +#else + return false; +#endif + } + int spatial_dim_size = doutputDesc.GetSize() - 2; + if (kernel_size_size != spatial_dim_size || + stride_size != spatial_dim_size || + padding_size != spatial_dim_size || + dilation_size != spatial_dim_size) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Argument length should be 2D"); +#else + return false; 
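// A worked instance of the placement count used throughout these validity checks
// (editorial sketch; the helper name num_blocks is hypothetical and only restates
// the formula already used above):
// l = (size + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1, integer division.
// For size = 4, padding = 0, dilation = 1, kernel = 2, stride = 1 this gives
// l = (4 + 0 - 1 - 1) / 1 + 1 = 3 window positions along that axis.
//
//     constexpr int32_t num_blocks(int32_t size, int32_t pad, int32_t dil, int32_t k, int32_t s)
//     {
//         return (size + 2 * pad - dil * (k - 1) - 1) / s + 1;
//     }
//     static_assert(num_blocks(4, 0, 1, 2, 1) == 3, "three placements along a size-4 axis");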
+#endif + } + auto input_dims = dinputDesc.GetLengths(); + auto output_dims = doutputDesc.GetLengths(); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for (int i = 0; i < spatial_dim_size; ++i) { + P *= kernel_size[i]; + int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector input_dims_desired{static_cast(N), + static_cast(C * P), + static_cast(L)}; + if (input_dims != input_dims_desired) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: Invalid input gradient dimension"); +#else + return false; +#endif + } + return true; + } + + bool IsValidType() const + { + if (dinputDesc.GetType() != doutputDesc.GetType()) + { +#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The input gradient tensor and output gradient tensor has mismatch type."); +#else + return false; +#endif + } + return true; } - // bool IsValidSize() const - // { - // if(inputDesc.GetSize() < 2 || inputDesc.GetSize() > 5) - // { - // #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - // MIOPEN_THROW(miopenStatusBadParm, - // "Instance Norm: The input tensor dimension should be in range [2, - // 5]."); - // #else - // return false; - // #endif - // } - // return true; - // } const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp index d463bb0251..e92213f434 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -36,22 +36,6 @@ namespace solver { namespace fold { -// using FoldFwdSolverBase = -// NonTunableSolverBase; - -// struct FoldFwd final : FoldFwdSolverBase -// { -// const std::string& SolverDbId() const override { return GetSolverDbId(); } - -// bool IsApplicable( -// const ExecutionContext& context, -// const miopen::fold::FoldFwdProblemDescription& problem) const override; - -// ConvSolution GetSolution( -// const ExecutionContext& context, -// const miopen::fold::FoldFwdProblemDescription& problem) const override; -// }; - using UnfoldFwdSolverBase = NonTunableSolverBase; @@ -82,6 +66,36 @@ struct UnfoldBwd final : UnfoldBwdSolverBase const miopen::fold::UnfoldBwdProblemDescription& problem) const override; }; +using FoldFwdSolverBase = + NonTunableSolverBase; + +struct FoldFwd final : FoldFwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const override; +}; + +using FoldBwdSolverBase = + NonTunableSolverBase; + +struct FoldBwd final : FoldBwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; + + ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; +}; + } // namespace fold } // namespace solver diff --git a/src/solver.cpp b/src/solver.cpp index 8e3d5afcb3..f47b766272 100644 --- a/src/solver.cpp +++ 
b/src/solver.cpp
@@ -649,9 +649,10 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry)
     Register(registry, ++id, Primitive::Mha, mha::Mha{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId());
     Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId());
-    // Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId());
     Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId());
     Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId());
+    Register(registry, ++id, Primitive::Fold, fold::FoldBwd{}.SolverDbId());
 
     // IMPORTANT: New solvers should be added to the end of the function!
 }
diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp
new file mode 100644
index 0000000000..e7696b10a3
--- /dev/null
+++ b/src/solver/fold/fold_backward.cpp
@@ -0,0 +1,178 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
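// Editorial note on the solver that follows (an observation, not patch content):
// FoldBwd::GetSolution deliberately launches the "UnfoldForward4D" kernel. Fold
// (col2im) and unfold (im2col) are adjoint linear maps, so the gradient of fold
// with respect to its input is exactly an unfold of the output gradient, and the
// existing kernel can be reused with doutput as the source and dinput as the
// destination, as the invoker below does.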
+ * + *******************************************************************************/ + +#include "miopen/fold/problem_description.hpp" +#include "miopen/miopen.h" +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool FoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, + const miopen::fold::FoldBwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); + + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); + int spatial_dim_size = output_grad_dims.size() - 2; + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= problem.kernel_size[i]; + int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * problem.padding[i] - + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + { + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"IN_OUT_TYPE", in_dtype == "bfloat16" ? 
"ushort" : in_dtype}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<3>(deref(params.dinputDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.doutputDesc)); + auto input_grad_dims = deref(params.dinputDesc).GetLengths(); + auto output_grad_dims = deref(params.doutputDesc).GetLengths(); + + int spatial_dim_size = output_grad_dims.size() - 2; + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= params.kernel_size[i]; + int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(output_grad_dims[2]); + int32_t W = static_cast(output_grad_dims[3]); + + kernel(params.doutput, + params.dinput, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 67528b00b7..5b8f638cb1 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -113,10 +113,10 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte for(int i = 0; i < spatial_dim_size; ++i) { P *= params.kernel_size[i]; - int32_t l = (output_dims[i + 2] + 2 * params.padding[i] - + int32_t l = (static_cast(output_dims[i + 2]) + 2 * params.padding[i] - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / params.stride[i] + - 0; + 1; L *= l; ls.push_back(l); } diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index 46f7552083..de34115177 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -68,7 +68,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, ls.push_back(l); } - int32_t kernel_size_h = kernel_size[0]; + [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -76,7 +76,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t padding_w = padding[1]; int32_t dilation_h = dilation[0]; int32_t 
dilation_w = dilation[1]; - int32_t LH = ls[0]; + [[maybe_unused]] int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -129,7 +129,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - int32_t P = 1, L = 1; + [[maybe_unused]] int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 0e9fe9ddd8..6bd24d931f 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -32,32 +32,32 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace fold { -struct UnfoldForwardTestFloat32 : UnfoldFwdTest +struct FoldForwardTestFloat32 : FoldFwdTest { }; -struct UnfoldForwardTestFloat16 : UnfoldFwdTest +struct FoldForwardTestFloat16 : FoldFwdTest { }; -struct UnfoldForwardTestBFloat16 : UnfoldFwdTest +struct FoldForwardTestBFloat16 : FoldFwdTest { }; -struct UnfoldBackwardTestFloat32 : UnfoldBwdTest +struct FoldBackwardTestFloat32 : FoldBwdTest { }; -struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +struct FoldBackwardTestFloat16 : FoldBwdTest { }; -struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +struct FoldBackwardTestBFloat16 : FoldBwdTest { }; }; // namespace fold using namespace fold; -TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) +TEST_P(FoldForwardTestFloat32, FoldForwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -70,11 +70,11 @@ TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, + FoldForwardTestFloat32, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) +TEST_P(FoldForwardTestFloat16, FoldForwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -87,11 +87,11 @@ TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, + FoldForwardTestFloat16, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) +TEST_P(FoldForwardTestBFloat16, FoldForwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -104,11 +104,11 @@ TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestBFloat16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, + FoldForwardTestBFloat16, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -121,11 +121,11 @@ TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, + FoldBackwardTestFloat32, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -138,11 +138,11 @@ TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat16, - 
testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, + FoldBackwardTestFloat16, + testing::ValuesIn(FoldTestConfigs())); -TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) { if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) { @@ -155,6 +155,6 @@ TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestBFloat16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, + FoldBackwardTestBFloat16, + testing::ValuesIn(FoldTestConfigs())); diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index f15c5b6a5f..9b7f883528 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -37,22 +37,26 @@ #include #include -struct UnfoldTestCase +struct FoldTestCase { size_t N; size_t C; size_t D; size_t H; size_t W; + std::vector outputSize; std::vector kernelSize; std::vector stride; std::vector padding; std::vector dilation; bool isContiguous = true; - friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc) + friend std::ostream& operator<<(std::ostream& os, const FoldTestCase& tc) { - os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W - << " kernel_size:"; + os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W; + os << " output_size:"; + for(const auto& outs : tc.outputSize) + os << outs << " "; + os << " kernel_size:"; for(const auto& ks : tc.kernelSize) os << ks << " "; os << "stride:"; @@ -111,20 +115,21 @@ struct UnfoldTestCase } }; -std::vector UnfoldTestConfigs() +std::vector FoldTestConfigs() { // n c d h w padding return { - {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true}, - {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true}, - {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, true}, - {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true}, - {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true}, - {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 12}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 13}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, true}, }; } template -struct UnfoldFwdTest : public ::testing::TestWithParam +struct FoldFwdTest : public ::testing::TestWithParam { protected: void SetUp() override @@ -136,28 +141,18 @@ struct UnfoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - auto gen_one = [&](auto...) { return 1; }; + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) 
{ return 0; }; input = tensor{in_dims, in_strides}.generate(gen_value); - - int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); - const int32_t C = static_cast(in_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) + int32_t C = static_cast(in_dims[1]); + for (int32_t i : config.kernelSize) { - P *= config.kernelSize[i]; - int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - - config.dilation[i] * (config.kernelSize[i] - 1) - 1) / - config.stride[i] + - 1; - L *= l; - ls.push_back(l); + C = C / i; } std::vector out_dims{ - static_cast(N), static_cast(C * P), static_cast(L)}; + static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; output = tensor{out_dims}.generate(gen_zero); outputHost = tensor{out_dims}.generate(gen_zero); @@ -171,7 +166,7 @@ struct UnfoldFwdTest : public ::testing::TestWithParam auto&& handle = get_handle(); miopenStatus_t status; - status = miopen::UnfoldForward(handle, + status = miopen::FoldForward(handle, input.desc, input_dev.get(), output.desc, @@ -185,8 +180,8 @@ struct UnfoldFwdTest : public ::testing::TestWithParam config.dilation.data(), static_cast(config.dilation.size())); - cpu_unfold_fwd_4d( - input, outputHost, config.kernelSize, config.stride, config.padding, config.dilation); + cpu_unfold_bwd_4d( + outputHost, input, config.kernelSize, config.stride, config.padding, config.dilation); EXPECT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -201,11 +196,15 @@ struct UnfoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. if(std::is_same::value) tolerance *= 8.0; + for (int i = 0; i < 10; ++i) + { + std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] << std::endl; + } auto error_output = miopen::rms_range(outputHost, output); EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } - UnfoldTestCase config; + FoldTestCase config; tensor input; tensor output; @@ -217,7 +216,7 @@ struct UnfoldFwdTest : public ::testing::TestWithParam }; template -struct UnfoldBwdTest : public ::testing::TestWithParam +struct FoldBwdTest : public ::testing::TestWithParam { protected: void SetUp() override @@ -229,29 +228,20 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - auto gen_one = [&](auto...) { return 1; }; + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) 
{ return 0; }; dinput = tensor{in_dims, in_strides}.generate(gen_zero); dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); - int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); - const int32_t C = static_cast(in_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) + int32_t C = static_cast(in_dims[1]); + for (int32_t i : config.kernelSize) { - P *= config.kernelSize[i]; - int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - - config.dilation[i] * (config.kernelSize[i] - 1) - 1) / - config.stride[i] + - 1; - L *= l; - ls.push_back(l); + C = C / i; } std::vector out_dims{ - static_cast(N), static_cast(C * P), static_cast(L)}; + static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; doutput = tensor{out_dims}.generate(gen_value); @@ -264,7 +254,7 @@ struct UnfoldBwdTest : public ::testing::TestWithParam auto&& handle = get_handle(); miopenStatus_t status; - status = miopen::UnfoldBackward(handle, + status = miopen::FoldBackward(handle, dinput.desc, dinput_dev.get(), doutput.desc, @@ -278,8 +268,8 @@ struct UnfoldBwdTest : public ::testing::TestWithParam config.dilation.data(), static_cast(config.dilation.size())); - cpu_unfold_bwd_4d( - dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); + cpu_unfold_fwd_4d( + doutput, dinputHost, config.kernelSize, config.stride, config.padding, config.dilation); EXPECT_EQ(status, miopenStatusSuccess); dinput.data = handle.Read(dinput_dev, dinput.data.size()); @@ -299,7 +289,7 @@ struct UnfoldBwdTest : public ::testing::TestWithParam << "Error backward input_grad beyond tolerance Error: {" << error_dinput << "}, Tolerance: " << tolerance; } - UnfoldTestCase config; + FoldTestCase config; tensor dinput; tensor doutput; diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp new file mode 100644 index 0000000000..b97c96d567 --- /dev/null +++ b/test/gtest/unfold.cpp @@ -0,0 +1,160 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "unfold.hpp" +#include "miopen/bfloat16.hpp" +#include "tensor_holder.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace unfold { +struct UnfoldForwardTestFloat32 : UnfoldFwdTest +{ +}; + +struct UnfoldForwardTestFloat16 : UnfoldFwdTest +{ +}; + +struct UnfoldForwardTestBFloat16 : UnfoldFwdTest +{ +}; + +struct UnfoldBackwardTestFloat32 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +{ +}; + +struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +{ +}; +}; // namespace unfold +using namespace unfold; +TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, + UnfoldForwardTestFloat32, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, + UnfoldForwardTestFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, + UnfoldForwardTestBFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat32, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestFloat16, + testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +{ + if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, + UnfoldBackwardTestBFloat16, + testing::ValuesIn(UnfoldTestConfigs())); diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp new file mode 100644 index 0000000000..686a1e8f02 --- /dev/null +++ b/test/gtest/unfold.hpp @@ -0,0 +1,311 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#include "../driver/tensor_driver.hpp"
+#include "cpu_fold.hpp"
+#include "get_handle.hpp"
+#include "miopen/allocator.hpp"
+#include "random.hpp"
+#include "tensor_holder.hpp"
+#include "verify.hpp"
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct UnfoldTestCase
+{
+    size_t N;
+    size_t C;
+    size_t D;
+    size_t H;
+    size_t W;
+    std::vector<int32_t> kernelSize;
+    std::vector<int32_t> stride;
+    std::vector<int32_t> padding;
+    std::vector<int32_t> dilation;
+    bool isContiguous = true;
+    friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc)
+    {
+        os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W
+           << " kernel_size:";
+        for(const auto& ks : tc.kernelSize)
+            os << ks << " ";
+        os << "stride:";
+        for(const auto& s : tc.stride)
+            os << s << " ";
+        os << "padding:";
+        for(const auto& p : tc.padding)
+            os << p << " ";
+        os << "dilation:";
+        for(const auto& d : tc.dilation)
+            os << d << " ";
+        os << "isContiguous:" << std::boolalpha << tc.isContiguous;
+        return os;
+    }
+
+    std::vector<size_t> GetInput()
+    {
+        if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, D, H, W});
+        }
+        else if((N != 0) && (C != 0) && (H != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, H, W});
+        }
+        else if((N != 0) && (C != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, C, W});
+        }
+        else if((N != 0) && (W != 0))
+        {
+            return std::vector<size_t>({N, W});
+        }
+        else if((N != 0))
+        {
+            return std::vector<size_t>({N});
+        }
+        else
+        {
+            std::cout << "Error Input Tensor Lengths\n" << std::endl;
+            return std::vector<size_t>({0});
+        }
+    }
+
+    std::vector<size_t> ComputeStrides(std::vector<size_t> inputDim) const
+    {
+        if(!isContiguous)
+            std::swap(inputDim.front(), inputDim.back());
+        std::vector<size_t> strides(inputDim.size());
+        strides.back() = 1;
+        for(int i = inputDim.size() - 2; i >= 0; --i)
+            strides[i] = strides[i + 1] * inputDim[i + 1];
+        if(!isContiguous)
+            std::swap(strides.front(), strides.back());
+        return strides;
+    }
+};
+
+std::vector<UnfoldTestCase> UnfoldTestConfigs()
+{ // n c d h w padding
+    return {
+        {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true},
+        {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true},
+        {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true},
+        {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true},
+    };
+}
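Aside: the isContiguous=false path in ComputeStrides above builds a transposed memory layout by swapping the outermost and innermost dimensions before laying the tensor out densely, then swapping the corresponding strides back. A standalone copy of that rule for checking layouts by hand (a sketch, not fixture code):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Mirrors the fixture's stride rule: swap outer/inner dims, compute
// dense row-major strides, then swap the matching strides back.
std::vector<size_t> compute_strides(std::vector<size_t> dims, bool contiguous)
{
    if(!contiguous)
        std::swap(dims.front(), dims.back());
    std::vector<size_t> strides(dims.size());
    strides.back() = 1;
    for(int i = static_cast<int>(dims.size()) - 2; i >= 0; --i)
        strides[i] = strides[i + 1] * dims[i + 1];
    if(!contiguous)
        std::swap(strides.front(), strides.back());
    return strides;
}

int main()
{
    // Shape {2, 5, 3, 4}: contiguous gives {60, 12, 4, 1};
    // non-contiguous gives {1, 6, 2, 30}, i.e. N and W trade places in memory.
    for(size_t s : compute_strides({2, 5, 3, 4}, false))
        std::cout << s << " ";
    std::cout << "\n";
    return 0;
}

All configs in UnfoldTestConfigs() currently pass true, so the transposed path is exercised only when a config opts out of contiguity.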
+
+template <class T>
+struct UnfoldFwdTest : public ::testing::TestWithParam<UnfoldTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+
+        std::vector<size_t> in_dims    = config.GetInput();
+        std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
+
+        auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        [[maybe_unused]] auto gen_one = [&](auto...) { return 1; };
+        auto gen_zero = [&](auto...) { return 0; };
+        input = tensor<T>{in_dims, in_strides}.generate(gen_value);
+
+        int spatial_dim_size = in_dims.size() - 2;
+        const int32_t N      = static_cast<int32_t>(in_dims[0]);
+        const int32_t C      = static_cast<int32_t>(in_dims[1]);
+        int32_t P = 1, L = 1;
+        std::vector<int32_t> ls;
+        for(int i = 0; i < spatial_dim_size; ++i)
+        {
+            P *= config.kernelSize[i];
+            int32_t l = (static_cast<int32_t>(in_dims[i + 2]) + 2 * config.padding[i] -
+                         config.dilation[i] * (config.kernelSize[i] - 1) - 1) /
+                            config.stride[i] +
+                        1;
+            L *= l;
+            ls.push_back(l);
+        }
+
+        std::vector<size_t> out_dims{
+            static_cast<size_t>(N), static_cast<size_t>(C * P), static_cast<size_t>(L)};
+
+        output     = tensor<T>{out_dims}.generate(gen_zero);
+        outputHost = tensor<T>{out_dims}.generate(gen_zero);
+
+        input_dev  = handle.Write(input.data);
+        output_dev = handle.Write(output.data);
+    }
+
+    void RunTest()
+    {
+        auto&& handle = get_handle();
+        miopenStatus_t status;
+
+        status = miopen::UnfoldForward(handle,
+                                       input.desc,
+                                       input_dev.get(),
+                                       output.desc,
+                                       output_dev.get(),
+                                       config.kernelSize.data(),
+                                       static_cast<int>(config.kernelSize.size()),
+                                       config.stride.data(),
+                                       static_cast<int>(config.stride.size()),
+                                       config.padding.data(),
+                                       static_cast<int>(config.padding.size()),
+                                       config.dilation.data(),
+                                       static_cast<int>(config.dilation.size()));
+
+        cpu_unfold_fwd_4d(
+            input, outputHost, config.kernelSize, config.stride, config.padding, config.dilation);
+
+        EXPECT_EQ(status, miopenStatusSuccess);
+        output.data = handle.Read<T>(output_dev, output.data.size());
+    }
+
+    void Verify()
+    {
+        // Computation error of fp16 is ~2^13 (=8192) bigger than
+        // the one of fp32 because mantissa is shorter by 13 bits.
+        double tolerance = std::is_same<T, float>::value ? 1.5e-6 : 8.2e-3;
+
+        // bf16 mantissa has 7 bits, by 3 bits shorter than fp16.
+        if(std::is_same<T, bfloat16>::value)
+            tolerance *= 8.0;
+        auto error_output = miopen::rms_range(outputHost, output);
+        EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {"
+                                              << error_output << "}, Tolerance: " << tolerance;
+    }
+    UnfoldTestCase config;
+
+    tensor<T> input;
+    tensor<T> output;
+
+    tensor<T> outputHost;
+
+    miopen::Allocator::ManageDataPtr input_dev;
+    miopen::Allocator::ManageDataPtr output_dev;
+};
+
+template <class T>
+struct UnfoldBwdTest : public ::testing::TestWithParam<UnfoldTestCase>
+{
+protected:
+    void SetUp() override
+    {
+        auto&& handle = get_handle();
+        config        = GetParam();
+
+        std::vector<size_t> in_dims    = config.GetInput();
+        std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
+
+        auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+        [[maybe_unused]] auto gen_one = [&](auto...) { return 1; };
+        auto gen_zero = [&](auto...)
{ return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + int spatial_dim_size = in_dims.size() - 2; + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= config.kernelSize[i]; + int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; + + doutput = tensor{out_dims}.generate(gen_value); + + dinput_dev = handle.Write(dinput.data); + doutput_dev = handle.Write(doutput.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::UnfoldBackward(handle, + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_bwd_4d( + dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + auto error_dinput = miopen::rms_range(dinputHost, dinput); + EXPECT_TRUE(error_dinput < tolerance) + << "Error backward input_grad beyond tolerance Error: {" << error_dinput + << "}, Tolerance: " << tolerance; + } + UnfoldTestCase config; + + tensor dinput; + tensor doutput; + + tensor dinputHost; + + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr doutput_dev; +}; From 9286ce7336264a1be722bf5bad4a2515868132dc Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 10 Jul 2024 07:44:38 +0000 Subject: [PATCH 07/46] githook format --- driver/driver.hpp | 6 +- driver/fold_driver.hpp | 75 ++++---- include/miopen/miopen.h | 84 ++++----- src/fold.cpp | 84 ++++----- src/fold/problem_description.cpp | 8 +- src/fold_api.cpp | 96 +++++----- src/include/miopen/fold.hpp | 48 ++--- .../miopen/fold/problem_description.hpp | 175 ++++++++---------- src/include/miopen/fold/solvers.hpp | 10 +- src/solver/fold/fold_backward.cpp | 10 +- test/cpu_fold.hpp | 22 +-- test/gtest/fold.hpp | 93 +++++----- test/gtest/unfold.hpp | 14 +- 13 files changed, 355 insertions(+), 370 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index f26d7053f3..68a0421e41 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -151,7 +151,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n"); + "argmax[bfp16|fp16], groupnorm[bfp16|fp16], cat[bfp16|fp16], unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT 
(concurrency-mt-unsafe) } @@ -177,7 +178,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "sumbfp16" && arg != "argmax" && arg != "argmaxfp16" && arg != "argmaxbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && arg != "unfold" && arg != "unfoldfp16" && - arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && arg != "--version") + arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp index 117538452e..4278624928 100644 --- a/driver/fold_driver.hpp +++ b/driver/fold_driver.hpp @@ -135,14 +135,14 @@ int FoldDriver::GetandSetData() { std::vector input_length = GetTensorLengthsFromCmdLine(); - output_size = GetVectorInt32tFromCmdLine("outputSize"); - kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); - stride = GetVectorInt32tFromCmdLine("stride"); - padding = GetVectorInt32tFromCmdLine("padding"); - dilation = GetVectorInt32tFromCmdLine("dilation"); - const int N = input_length[0]; - int C = input_length[1]; - for (int32_t i : kernel_size) + output_size = GetVectorInt32tFromCmdLine("outputSize"); + kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); + const int N = input_length[0]; + int C = input_length[1]; + for(int32_t i : kernel_size) { C = C / i; } @@ -295,18 +295,18 @@ int FoldDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenFoldForward(GetHandle(), - inputDesc, - input_dev->GetMem(), - outputDesc, - output_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -320,8 +320,8 @@ int FoldDriver::RunForwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - std::cout << "Wall-clock Time Fold Forward Elapsed: " << t.gettime_ms() / iter - << " ms" << std::endl; + std::cout << "Wall-clock Time Fold Forward Elapsed: " << t.gettime_ms() / iter << " ms" + << std::endl; float kernel_average_time = iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; @@ -362,18 +362,18 @@ int FoldDriver::RunBackwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { miopenFoldBackward(GetHandle(), - dinputDesc, - dinput_dev->GetMem(), - doutputDesc, - doutput_dev->GetMem(), - kernel_size.data(), - kernel_size.size(), - stride.data(), - stride.size(), - padding.data(), - padding.size(), - dilation.data(), - dilation.size()); + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -387,8 +387,8 @@ int FoldDriver::RunBackwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - std::cout << "Wall-clock Time Fold Backward Elapsed: " << t.gettime_ms() / iter - << " ms" << std::endl; + std::cout << "Wall-clock Time Fold Backward Elapsed: " << t.gettime_ms() / iter << " ms" + << std::endl; float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; @@ -459,8 +459,7 @@ int FoldDriver::VerifyBackward() if(!std::isfinite(error_dinput) || error_dinput > tolerance) { - std::cout << "Backward Fold FAILED: {" << error_dinput << "} > " << tolerance - << std::endl; + std::cout << "Backward Fold FAILED: {" << error_dinput << "} > " << tolerance << std::endl; return EC_VerifyFwd; } else diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a45ece12fe..51485db6e7 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6607,50 +6607,50 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); - /*! @brief Execute an unfold backward layer - * - * @param handle MIOpen handle (input) - * @param dinputDesc Tensor descriptor for data input grad tensor (output) - * @param dinput Data tensor input grad (output) - * @param doutputDesc Tensor descriptor for data output grad tensor (input) - * @param doutput Data tensor output grad (input) - * @param kernel_size Size of the sliding box array (input) - * @param kernel_size_size Size of the kernel_size array (input) - * @param stride Stride array of the sliding box (input) - * @param stride_size Size of the stride array (input) - * @param padding Padding array to be added on input (input) - * @param padding_size Size of the padding array (input) - * @param dilation Dilation array control the stride of the elements within the - neighborhood (input) - * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t - */ +/*! 
@brief Execute an unfold backward layer +* +* @param handle MIOpen handle (input) +* @param dinputDesc Tensor descriptor for data input grad tensor (output) +* @param dinput Data tensor input grad (output) +* @param doutputDesc Tensor descriptor for data output grad tensor (input) +* @param doutput Data tensor output grad (input) +* @param kernel_size Size of the sliding box array (input) +* @param kernel_size_size Size of the kernel_size array (input) +* @param stride Stride array of the sliding box (input) +* @param stride_size Size of the stride array (input) +* @param padding Padding array to be added on input (input) +* @param padding_size Size of the padding array (input) +* @param dilation Dilation array control the stride of the elements within the +neighborhood (input) +* @param dilation_size Size of the dilation array (input) +* @return miopenStatus_t +*/ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); /*! @brief Execute an unfold forward layer * diff --git a/src/fold.cpp b/src/fold.cpp index d2ff285af1..470d8eb6de 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -138,29 +138,29 @@ miopenStatus_t UnfoldBackward(Handle& handle, } miopenStatus_t FoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { const auto problem = fold::FoldFwdProblemDescription{inputDesc, - outputDesc, - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size}; + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; const auto invoke_params = [&]() { auto tmp = fold::InvokeParams{}; @@ -188,29 +188,29 @@ miopenStatus_t FoldForward(Handle& handle, } miopenStatus_t FoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* 
kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { const auto problem = fold::FoldBwdProblemDescription{dinputDesc, - doutputDesc, - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size}; + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; const auto invoke_params = [&]() { auto tmp = fold::InvokeParams{}; diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index 39202fd372..ce34de1a16 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -91,7 +91,7 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const auto output_dtype = outputDesc.GetType(); auto size = inputDesc.GetElementSize(); auto in_dims = inputDesc.GetLengths(); - auto out_dims = outputDesc.GetLengths(); + auto out_dims = outputDesc.GetLengths(); std::ostringstream ss; @@ -105,7 +105,7 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const ss << "_" << val; } ss << "out_dims"; - for (auto val: out_dims) + for(auto val : out_dims) { ss << "_" << val; } @@ -123,7 +123,7 @@ NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const auto output_dtype = doutputDesc.GetType(); auto size = dinputDesc.GetElementSize(); auto in_dims = dinputDesc.GetLengths(); - auto out_dims = doutputDesc.GetLengths(); + auto out_dims = doutputDesc.GetLengths(); std::ostringstream ss; @@ -137,7 +137,7 @@ NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const ss << "_" << val; } ss << "out_grad_dims"; - for (auto val: out_dims) + for(auto val : out_dims) { ss << "_" << val; } diff --git a/src/fold_api.cpp b/src/fold_api.cpp index fb22fa90b4..f59c209785 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -94,63 +94,63 @@ extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, } extern "C" miopenStatus_t miopenFoldForward(miopenHandle_t handle, - const miopenTensorDescriptor_t inputDesc, - const void* input, - const miopenTensorDescriptor_t outputDesc, - void* output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::FoldForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } extern "C" miopenStatus_t miopenFoldBackward(miopenHandle_t handle, - const miopenTensorDescriptor_t dinputDesc, - void* dinput, - const miopenTensorDescriptor_t doutputDesc, - const void* doutput, - const int32_t* kernel_size, - const int 
kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size) + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size) { return miopen::try_([&] { miopen::FoldBackward(miopen::deref(handle), - miopen::deref(dinputDesc), - DataCast(dinput), - miopen::deref(doutputDesc), - DataCast(doutput), - kernel_size, - kernel_size_size, - stride, - stride_size, - padding, - padding_size, - dilation, - dilation_size); + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); }); } diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index d94a42ee5a..3ac7e878f7 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -62,31 +62,31 @@ miopenStatus_t UnfoldBackward(Handle& handle, const int dilation_size); miopenStatus_t FoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); miopenStatus_t FoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - const int kernel_size_size, - const int32_t* stride, - const int stride_size, - const int32_t* padding, - const int padding_size, - const int32_t* dilation, - const int dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int kernel_size_size, + const int32_t* stride, + const int stride_size, + const int32_t* padding, + const int padding_size, + const int32_t* dilation, + const int dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index f89a90eac2..ebaadb5386 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -74,21 +74,17 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase if(inputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: The input tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input tensor should be 4D."); #else return false; #endif } int spatial_dim_size = inputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != 
spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); #else return false; #endif @@ -98,24 +94,23 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector output_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; + std::vector output_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; auto output_dims = outputDesc.GetLengths(); - if (output_dims != output_dims_desired) + if(output_dims != output_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Invalid output dimension"); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output dimension"); #else return false; #endif @@ -125,7 +120,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (inputDesc.GetType() != outputDesc.GetType()) + if(inputDesc.GetType() != outputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, @@ -187,21 +182,17 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase if(dinputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: The input gradient tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input gradient tensor should be 4D."); #else return false; #endif } int spatial_dim_size = dinputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); #else return false; #endif @@ -211,24 +202,23 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase const int32_t C = static_cast(input_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector output_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; + std::vector output_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; auto output_dims = doutputDesc.GetLengths(); - if (output_dims != output_dims_desired) + if(output_dims != output_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: Invalid output gradient dimension"); + 
MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output gradient dimension"); #else return false; #endif @@ -238,11 +228,12 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (dinputDesc.GetType() != doutputDesc.GetType()) + if(dinputDesc.GetType() != doutputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Unfold: The input gradient tensor and output gradient tensor has mismatch type."); + MIOPEN_THROW( + miopenStatusBadParm, + "Unfold: The input gradient tensor and output gradient tensor has mismatch type."); #else return false; #endif @@ -271,15 +262,15 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase struct FoldFwdProblemDescription : ProblemDescriptionBase { FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& outputDesc_, - const int32_t* kernel_size_, - const int kernel_size_size_, - const int32_t* stride_, - const int stride_size_, - const int32_t* padding_, - const int padding_size_, - const int32_t* dilation_, - const int dilation_size_) + const TensorDescriptor& outputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), kernel_size(kernel_size_), @@ -300,48 +291,43 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase if(outputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: The output tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Fold: The output tensor should be 4D."); #else return false; #endif } int spatial_dim_size = outputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); #else return false; #endif } - auto input_dims = inputDesc.GetLengths(); + auto input_dims = inputDesc.GetLengths(); auto output_dims = outputDesc.GetLengths(); - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector input_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; - if (input_dims != input_dims_desired) + std::vector input_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; + if(input_dims != input_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Invalid input dimension"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input dimension"); #else return false; #endif @@ -351,7 +337,7 @@ struct 
FoldFwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (inputDesc.GetType() != outputDesc.GetType()) + if(inputDesc.GetType() != outputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, @@ -384,15 +370,15 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase struct FoldBwdProblemDescription : ProblemDescriptionBase { FoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, - const TensorDescriptor& doutputDesc_, - const int32_t* kernel_size_, - const int kernel_size_size_, - const int32_t* stride_, - const int stride_size_, - const int32_t* padding_, - const int padding_size_, - const int32_t* dilation_, - const int dilation_size_) + const TensorDescriptor& doutputDesc_, + const int32_t* kernel_size_, + const int kernel_size_size_, + const int32_t* stride_, + const int stride_size_, + const int32_t* padding_, + const int padding_size_, + const int32_t* dilation_, + const int dilation_size_) : dinputDesc(dinputDesc_), doutputDesc(doutputDesc_), kernel_size(kernel_size_), @@ -413,48 +399,43 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase if(doutputDesc.GetSize() != 4) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: The output gradient tensor should be 4D."); + MIOPEN_THROW(miopenStatusBadParm, "Fold: The output gradient tensor should be 4D."); #else return false; #endif } int spatial_dim_size = doutputDesc.GetSize() - 2; - if (kernel_size_size != spatial_dim_size || - stride_size != spatial_dim_size || - padding_size != spatial_dim_size || - dilation_size != spatial_dim_size) + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Argument length should be 2D"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); #else return false; #endif } - auto input_dims = dinputDesc.GetLengths(); + auto input_dims = dinputDesc.GetLengths(); auto output_dims = doutputDesc.GetLengths(); - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); int32_t P = 1, L = 1; std::vector ls; - for (int i = 0; i < spatial_dim_size; ++i) { + for(int i = 0; i < spatial_dim_size; ++i) + { P *= kernel_size[i]; int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - - dilation[i] * (kernel_size[i] - 1) - 1) / + dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; L *= l; ls.push_back(l); } - std::vector input_dims_desired{static_cast(N), - static_cast(C * P), - static_cast(L)}; - if (input_dims != input_dims_desired) + std::vector input_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; + if(input_dims != input_dims_desired) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: Invalid input gradient dimension"); + MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input gradient dimension"); #else return false; #endif @@ -464,11 +445,12 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase bool IsValidType() const { - if (dinputDesc.GetType() != doutputDesc.GetType()) + if(dinputDesc.GetType() != doutputDesc.GetType()) { #if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG - MIOPEN_THROW(miopenStatusBadParm, - "Fold: The input gradient tensor and output gradient tensor has mismatch type."); + 
MIOPEN_THROW( + miopenStatusBadParm, + "Fold: The input gradient tensor and output gradient tensor has mismatch type."); #else return false; #endif @@ -476,7 +458,6 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase return true; } - const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp index e92213f434..1ff3ef7566 100644 --- a/src/include/miopen/fold/solvers.hpp +++ b/src/include/miopen/fold/solvers.hpp @@ -76,9 +76,8 @@ struct FoldFwd final : FoldFwdSolverBase bool IsApplicable(const ExecutionContext& context, const miopen::fold::FoldFwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::fold::FoldFwdProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const override; }; using FoldBwdSolverBase = @@ -91,9 +90,8 @@ struct FoldBwd final : FoldBwdSolverBase bool IsApplicable(const ExecutionContext& context, const miopen::fold::FoldBwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::fold::FoldBwdProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; }; } // namespace fold diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index e7696b10a3..a1327be94e 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -43,13 +43,13 @@ namespace solver { namespace fold { bool FoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - const miopen::fold::FoldBwdProblemDescription& problem) const + const miopen::fold::FoldBwdProblemDescription& problem) const { return true; } ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, - const miopen::fold::FoldBwdProblemDescription& problem) const + const miopen::fold::FoldBwdProblemDescription& problem) const { std::ignore = context; auto result = ConvSolution{miopenStatusSuccess}; @@ -59,8 +59,8 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_grad_dims = problem.GetDinputDesc().GetLengths(); auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); - const int32_t N = static_cast(output_grad_dims[0]); - const int32_t C = static_cast(output_grad_dims[1]); + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); int spatial_dim_size = output_grad_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; @@ -68,7 +68,7 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte { P *= problem.kernel_size[i]; int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * problem.padding[i] - - problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / problem.stride[i] + 1; L *= l; diff --git a/test/cpu_fold.hpp b/test/cpu_fold.hpp index de34115177..373cc30917 100644 --- a/test/cpu_fold.hpp +++ b/test/cpu_fold.hpp @@ -69,18 +69,18 @@ void cpu_unfold_fwd_4d(tensor input_tensor, } [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = 
stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; [[maybe_unused]] int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 9b7f883528..7e71c5ce2f 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -141,18 +141,20 @@ struct FoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - input = tensor{in_dims, in_strides}.generate(gen_value); - const int32_t N = static_cast(in_dims[0]); - int32_t C = static_cast(in_dims[1]); - for (int32_t i : config.kernelSize) + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + input = tensor{in_dims, in_strides}.generate(gen_value); + const int32_t N = static_cast(in_dims[0]); + int32_t C = static_cast(in_dims[1]); + for(int32_t i : config.kernelSize) { C = C / i; } - std::vector out_dims{ - static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; + std::vector out_dims{static_cast(N), + static_cast(C), + static_cast(config.outputSize[0]), + static_cast(config.outputSize[1])}; output = tensor{out_dims}.generate(gen_zero); outputHost = tensor{out_dims}.generate(gen_zero); @@ -167,18 +169,18 @@ struct FoldFwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::FoldForward(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); cpu_unfold_bwd_4d( outputHost, input, config.kernelSize, config.stride, config.padding, config.dilation); @@ -196,9 +198,10 @@ struct FoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
if(std::is_same::value) tolerance *= 8.0; - for (int i = 0; i < 10; ++i) + for(int i = 0; i < 10; ++i) { - std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] << std::endl; + std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] + << std::endl; } auto error_output = miopen::rms_range(outputHost, output); EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" @@ -228,20 +231,22 @@ struct FoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - dinput = tensor{in_dims, in_strides}.generate(gen_zero); - dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); - - const int32_t N = static_cast(in_dims[0]); - int32_t C = static_cast(in_dims[1]); - for (int32_t i : config.kernelSize) + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + const int32_t N = static_cast(in_dims[0]); + int32_t C = static_cast(in_dims[1]); + for(int32_t i : config.kernelSize) { C = C / i; } - std::vector out_dims{ - static_cast(N), static_cast(C), static_cast(config.outputSize[0]), static_cast(config.outputSize[1])}; + std::vector out_dims{static_cast(N), + static_cast(C), + static_cast(config.outputSize[0]), + static_cast(config.outputSize[1])}; doutput = tensor{out_dims}.generate(gen_value); @@ -255,18 +260,18 @@ struct FoldBwdTest : public ::testing::TestWithParam miopenStatus_t status; status = miopen::FoldBackward(handle, - dinput.desc, - dinput_dev.get(), - doutput.desc, - doutput_dev.get(), - config.kernelSize.data(), - static_cast(config.kernelSize.size()), - config.stride.data(), - static_cast(config.stride.size()), - config.padding.data(), - static_cast(config.padding.size()), - config.dilation.data(), - static_cast(config.dilation.size())); + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); cpu_unfold_fwd_4d( doutput, dinputHost, config.kernelSize, config.stride, config.padding, config.dilation); diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 686a1e8f02..3a839024b2 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -136,9 +136,9 @@ struct UnfoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - input = tensor{in_dims, in_strides}.generate(gen_value); + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + input = tensor{in_dims, in_strides}.generate(gen_value); int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); @@ -229,10 +229,10 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; - auto gen_zero = [&](auto...) { return 0; }; - dinput = tensor{in_dims, in_strides}.generate(gen_zero); - dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; + auto gen_zero = [&](auto...) { return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); int spatial_dim_size = in_dims.size() - 2; const int32_t N = static_cast(in_dims[0]); From e59ce36d740b91608b03d46a5b6b4da8ea05815b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 12 Jul 2024 08:44:18 +0000 Subject: [PATCH 08/46] update doc and miopen.h description --- docs/reference/index.rst | 1 + include/miopen/miopen.h | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 02bcb88622..c6ab9521b7 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -32,3 +32,4 @@ The MIOpen API library is structured as follows: * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`Argmax<./argmax>` (experimental) + * :doc:`Fold <./fold>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 51485db6e7..a019b4f7c6 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6588,12 +6588,12 @@ MIOPEN_EXPORT miopenStatus_t miopenBackendInitialize(miopenBackendDescriptor_t d * * @{ */ -/*! @brief Execute an unfold forward layer +/*! @brief Execute a fold forward layer * * @param handle MIOpen handle (input) * @param inputDesc Tensor descriptor for data input tensor input (input) * @param input Data tensor input (input) - * @param outputDesc Tensor descriptor for data output tensor output (output) + * @param outputDesc Tensor descriptor for data output tensor output (input) * @param output Data tensor output (output) * @param kernel_size Size of the sliding box array (input) * @param kernel_size_size Size of the kernel_size array (input) @@ -6620,10 +6620,10 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, const int kernel_size_size, const int32_t* stride, const int stride_size, const int32_t* padding, const int padding_size, const int32_t* dilation, const int dilation_size); -/*! @brief Execute an unfold backward layer +/*! @brief Execute a fold backward layer * * @param handle MIOpen handle (input) -* @param dinputDesc Tensor descriptor for data input grad tensor (output) +* @param dinputDesc Tensor descriptor for data input grad tensor (input) * @param dinput Data tensor input grad (output) * @param doutputDesc Tensor descriptor for data output grad tensor (input) * @param doutput Data tensor output grad (input) * @param kernel_size Size of the sliding box array (input) * @param kernel_size_size Size of the kernel_size array (input) @@ -6657,7 +6657,7 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, * @param handle MIOpen handle (input) * @param inputDesc Tensor descriptor for data input tensor input (input) * @param input Data tensor input (input) - * @param outputDesc Tensor descriptor for data output tensor output (output) + * @param outputDesc Tensor descriptor for data output tensor output (input) * @param output Data tensor output (output) * @param kernel_size Size of the sliding box array (input) * @param kernel_size_size Size of the kernel_size array (input)
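Note: for readers new to this API, a minimal host-side sketch of calling miopenUnfoldForward as documented above. The shapes mirror the unfold driver defaults, the helper name and buffer arguments are illustrative, and error checking plus device allocation are elided.

#include <miopen/miopen.h>

// Hedged sketch: unfold a float tensor of shape (2, 5, 3, 4) with a 2x3 window.
// Expected output shape: (N, C * 2 * 3, L) = (2, 30, 4), since each spatial dim
// yields ((size - dilation * (k - 1) - 1) / stride) + 1 = 2 blocks.
void unfold_example(miopenHandle_t handle, const float* in_dev, float* out_dev)
{
    miopenTensorDescriptor_t inDesc, outDesc;
    miopenCreateTensorDescriptor(&inDesc);
    miopenCreateTensorDescriptor(&outDesc);
    miopenSet4dTensorDescriptor(inDesc, miopenFloat, 2, 5, 3, 4);
    int outLens[3] = {2, 30, 4};
    miopenSetTensorDescriptor(outDesc, miopenFloat, 3, outLens, nullptr); // packed strides

    int32_t kernel[2]   = {2, 3};
    int32_t stride[2]   = {1, 1};
    int32_t padding[2]  = {0, 0};
    int32_t dilation[2] = {1, 1};
    miopenUnfoldForward(handle, inDesc, in_dev, outDesc, out_dev,
                        kernel, 2, stride, 2, padding, 2, dilation, 2);

    miopenDestroyTensorDescriptor(inDesc);
    miopenDestroyTensorDescriptor(outDesc);
}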
@@ -6687,7 +6687,7 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, /*! @brief Execute an unfold backward layer * * @param handle MIOpen handle (input) * - * @param dinputDesc Tensor descriptor for data input grad tensor (output) + * @param dinputDesc Tensor descriptor for data input grad tensor (input) * @param dinput Data tensor input grad (output) * @param doutputDesc Tensor descriptor for data output grad tensor (input) * @param doutput Data tensor output grad (input) From c11493403883df17a9cd5f5dfd109a024ccdab54 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Jul 2024 05:43:46 +0000 Subject: [PATCH 09/46] Update driver help text --- driver/fold_driver.hpp | 4 ++-- driver/unfold_driver.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp index 4278624928..1468a77e0d 100644 --- a/driver/fold_driver.hpp +++ b/driver/fold_driver.hpp @@ -166,8 +166,8 @@ int FoldDriver::AddCmdLineArgs() inflags.AddInputFlag("outputSize", 'o', "4,5", "Output Size (Default=4,5)", "str"); inflags.AddInputFlag("kernelSize", 'k', "2,2", "Kernel Size (Default=2,2)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); - inflags.AddInputFlag("padding", 'p', "0,0", "Stride (Default=0,0)", "str"); - inflags.AddInputFlag("dilation", 'd', "1,1", "Stride (Default=1,1)", "str"); + inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); + inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index d565d192f5..e9a3665eaa 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -171,8 +171,8 @@ int UnfoldDriver::AddCmdLineArgs() "DimLengths", 'D', "2,5,3,4", "The dimensional lengths of the input tensor", "string"); inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); - inflags.AddInputFlag("padding", 'p', "0,0", "Stride (Default=0,0)", "str"); - inflags.AddInputFlag("dilation", 'd', "1,1", "Stride (Default=1,1)", "str"); + inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); + inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); From bd5db593af74bd68f03e44465a4a1beec55b8056 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Jul 2024 06:14:14 +0000 Subject: [PATCH 10/46] Change IN_OUT_TYPE to FLOAT --- src/kernels/MIOpenUnfold.cpp | 12 ++++++------ src/solver/fold/fold_backward.cpp | 1 - src/solver/fold/fold_forward.cpp | 1 - src/solver/fold/unfold_backward.cpp | 1 - src/solver/fold/unfold_forward.cpp | 1 - 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 5c39a82e2c..9b36edd28d 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -86,8 +86,8 @@ __device__ void unfoldForward4D(const TIO* input, output[output_idx] = x; } -extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, - IN_OUT_TYPE* output, +extern "C" 
__global__ void UnfoldForward4D(const FLOAT* input, + FLOAT* output, int N, int C, int H, @@ -107,7 +107,7 @@ extern "C" __global__ void UnfoldForward4D(const IN_OUT_TYPE* input, tensor_view_t<4> input_tv, tensor_view_t<3> output_tv) { - unfoldForward4D(input, + unfoldForward4D(input, output, N, C, @@ -194,8 +194,8 @@ __device__ void unfoldBackward4D(const TIO* output_grad, input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); } -extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, - IN_OUT_TYPE* input_grad, +extern "C" __global__ void UnfoldBackward4D(const FLOAT* output_grad, + FLOAT* input_grad, int N, int C, int H, @@ -215,7 +215,7 @@ extern "C" __global__ void UnfoldBackward4D(const IN_OUT_TYPE* output_grad, tensor_view_t<3> output_grad_tv, tensor_view_t<4> input_grad_tv) { - unfoldBackward4D(output_grad, + unfoldBackward4D(output_grad, input_grad, N, C, diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index a1327be94e..2b09ac3529 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -85,7 +85,6 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 5b8f638cb1..0150c8b9fb 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -74,7 +74,6 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 249f08592c..38d0812307 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -74,7 +74,6 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? "ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index b866b5d167..1aa10d9eea 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -85,7 +85,6 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - {"IN_OUT_TYPE", in_dtype == "bfloat16" ? 
"ushort" : in_dtype}, }; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); From 4bb5855eb4623006da4f4f62f29343cbfdfba5b9 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 15 Jul 2024 06:16:32 +0000 Subject: [PATCH 11/46] add __restrict__ to tensor pointer --- src/kernels/MIOpenUnfold.cpp | 98 ++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 9b36edd28d..84a272ef80 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -32,9 +32,9 @@ #include "float_types.h" #include "tensor_view.hpp" -template -__device__ void unfoldForward4D(const TIO* input, - TIO* output, +template +__device__ void unfoldForward4D(const DTYPE* __restrict__ input, + DTYPE* __restrict__ output, int N, int C, int H, @@ -73,7 +73,7 @@ __device__ void unfoldForward4D(const TIO* input, int h = lh * stride_h - padding_h + ph * dilation_h; int w = lw * stride_w - padding_w + pw * dilation_w; - TIO x = 0; + DTYPE x = 0; if(0 <= h && h < H && 0 <= w && w < W) { long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + @@ -86,8 +86,8 @@ __device__ void unfoldForward4D(const TIO* input, output[output_idx] = x; } -extern "C" __global__ void UnfoldForward4D(const FLOAT* input, - FLOAT* output, +extern "C" __global__ void UnfoldForward4D(const FLOAT* __restrict__ input, + FLOAT* __restrict__ output, int N, int C, int H, @@ -108,30 +108,30 @@ extern "C" __global__ void UnfoldForward4D(const FLOAT* input, tensor_view_t<3> output_tv) { unfoldForward4D(input, - output, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - input_tv, - output_tv); + output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); } -template -__device__ void unfoldBackward4D(const TIO* output_grad, - TIO* input_grad, +template +__device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, + DTYPE* __restrict__ input_grad, int N, int C, int H, @@ -194,8 +194,8 @@ __device__ void unfoldBackward4D(const TIO* output_grad, input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); } -extern "C" __global__ void UnfoldBackward4D(const FLOAT* output_grad, - FLOAT* input_grad, +extern "C" __global__ void UnfoldBackward4D(const FLOAT* __restrict__ output_grad, + FLOAT* __restrict__ input_grad, int N, int C, int H, @@ -216,23 +216,23 @@ extern "C" __global__ void UnfoldBackward4D(const FLOAT* output_grad, tensor_view_t<4> input_grad_tv) { unfoldBackward4D(output_grad, - input_grad, - N, - C, - H, - W, - P, - L, - LH, - LW, - kernel_size_h, - kernel_size_w, - stride_h, - stride_w, - padding_h, - padding_w, - dilation_h, - dilation_w, - output_grad_tv, - input_grad_tv); + input_grad, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); } From 918a26751c88bdfaa06732ba8fa76a551e5de4f8 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Jul 2024 06:09:46 +0000 Subject: [PATCH 12/46] replace include "" with <> --- src/fold.cpp | 4 ++-- src/fold_api.cpp | 2 +- src/include/miopen/fold/invoke_params.hpp | 2 +- src/include/miopen/fold/problem_description.hpp | 2 +- src/solver/fold/fold_backward.cpp | 4 ++-- 
src/solver/fold/fold_forward.cpp | 4 ++-- src/solver/fold/unfold_backward.cpp | 4 ++-- src/solver/fold/unfold_forward.cpp | 4 ++-- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/fold.cpp b/src/fold.cpp index 470d8eb6de..a105be60b3 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/miopen.h" -#include "miopen/fold/problem_description.hpp" +#include +#include #include #include #include diff --git a/src/fold_api.cpp b/src/fold_api.cpp index f59c209785..d1bdefdfb0 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -24,7 +24,7 @@ * *******************************************************************************/ -#include "miopen/miopen.h" +#include #include #include #include diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index da89023f17..c19e83eeee 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -25,7 +25,7 @@ *******************************************************************************/ #pragma once -#include "miopen/miopen.h" +#include #include #include diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index ebaadb5386..30689e8bd9 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -26,7 +26,7 @@ #pragma once #include "miopen/errors.hpp" -#include "miopen/miopen.h" +#include #include #include #include diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 2b09ac3529..ee9316a3bf 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 0150c8b9fb..9ecf89c6e6 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 38d0812307..7b75679263 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 1aa10d9eea..35e29df8bb 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -24,8 +24,8 @@ * *******************************************************************************/ -#include "miopen/fold/problem_description.hpp" -#include "miopen/miopen.h" +#include +#include #include #include #include From 4f51b6e8b3f9adf009bf9d6f5a7999ee506ec908 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Jul 2024 07:57:56 +0000 Subject: [PATCH 13/46] change all int -> int32_t, remove duplicate lines in solver --- include/miopen/miopen.h | 32 ++++----- 
src/fold.cpp | 32 ++++----- src/fold_api.cpp | 32 ++++----- src/include/miopen/fold.hpp | 32 ++++----- src/include/miopen/fold/invoke_params.hpp | 8 +-- .../miopen/fold/problem_description.hpp | 72 +++++++++---------- src/solver/fold/fold_backward.cpp | 20 +----- src/solver/fold/fold_forward.cpp | 7 +- src/solver/fold/unfold_backward.cpp | 6 +- src/solver/fold/unfold_forward.cpp | 18 +---- 10 files changed, 111 insertions(+), 148 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index a019b4f7c6..3a0d7fd5dd 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -6612,13 +6612,13 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /*! @brief Execute an fold backward layer * @@ -6644,13 +6644,13 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /*! @brief Execute an unfold forward layer * @@ -6676,13 +6676,13 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /*! 
@brief Execute an unfold backward layer * @@ -6708,13 +6708,13 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); /** @} */ // CLOSEOUT FOLD DOXYGEN GROUP diff --git a/src/fold.cpp b/src/fold.cpp index a105be60b3..0c30529c99 100644 --- a/src/fold.cpp +++ b/src/fold.cpp @@ -43,13 +43,13 @@ miopenStatus_t UnfoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::UnfoldFwdProblemDescription{inputDesc, outputDesc, @@ -93,13 +93,13 @@ miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::UnfoldBwdProblemDescription{dinputDesc, doutputDesc, @@ -143,13 +143,13 @@ miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::FoldFwdProblemDescription{inputDesc, outputDesc, @@ -193,13 +193,13 @@ miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { const auto problem = fold::FoldBwdProblemDescription{dinputDesc, doutputDesc, diff --git a/src/fold_api.cpp b/src/fold_api.cpp index d1bdefdfb0..6c15380db6 100644 --- a/src/fold_api.cpp +++ b/src/fold_api.cpp @@ -37,13 +37,13 @@ extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::UnfoldForward(miopen::deref(handle), @@ -68,13 +68,13 @@ extern "C" miopenStatus_t 
miopenUnfoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::UnfoldBackward(miopen::deref(handle), @@ -99,13 +99,13 @@ extern "C" miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t outputDesc, void* output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::FoldForward(miopen::deref(handle), @@ -130,13 +130,13 @@ extern "C" miopenStatus_t miopenFoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t doutputDesc, const void* doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size) + const int32_t dilation_size) { return miopen::try_([&] { miopen::FoldBackward(miopen::deref(handle), diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 3ac7e878f7..a88e0b9b9e 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -39,13 +39,13 @@ miopenStatus_t UnfoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, @@ -53,13 +53,13 @@ miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& inputDesc, @@ -67,13 +67,13 @@ miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& outputDesc, Data_t output, const int32_t* kernel_size, - const int kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, @@ -81,12 +81,12 @@ miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& doutputDesc, ConstData_t doutput, const int32_t* kernel_size, - const int 
kernel_size_size, + const int32_t kernel_size_size, const int32_t* stride, - const int stride_size, + const int32_t stride_size, const int32_t* padding, - const int padding_size, + const int32_t padding_size, const int32_t* dilation, - const int dilation_size); + const int32_t dilation_size); } // namespace miopen #endif // MIOPEN_INSTANCE_NORM_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index c19e83eeee..20e7859b35 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -53,10 +53,10 @@ struct InvokeParams : public miopen::InvokeParams const int32_t* stride = nullptr; const int32_t* padding = nullptr; const int32_t* dilation = nullptr; - int kernel_size_size = 0; - int stride_size = 0; - int padding_size = 0; - int dilation_size = 0; + int32_t kernel_size_size = 0; + int32_t stride_size = 0; + int32_t padding_size = 0; + int32_t dilation_size = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 30689e8bd9..3bc7ae91ca 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -47,13 +47,13 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase UnfoldFwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), kernel_size(kernel_size_), @@ -79,7 +79,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = inputDesc.GetSize() - 2; + int32_t spatial_dim_size = inputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -141,13 +141,13 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase TensorDescriptor inputDesc; TensorDescriptor outputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; struct UnfoldBwdProblemDescription : ProblemDescriptionBase @@ -155,13 +155,13 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase UnfoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, const TensorDescriptor& doutputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : dinputDesc(dinputDesc_), doutputDesc(doutputDesc_), kernel_size(kernel_size_), @@ -187,7 +187,7 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = 
dinputDesc.GetSize() - 2; + int32_t spatial_dim_size = dinputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -250,13 +250,13 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase TensorDescriptor dinputDesc; TensorDescriptor doutputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; struct FoldFwdProblemDescription : ProblemDescriptionBase @@ -264,13 +264,13 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : inputDesc(inputDesc_), outputDesc(outputDesc_), kernel_size(kernel_size_), @@ -296,7 +296,7 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = outputDesc.GetSize() - 2; + int32_t spatial_dim_size = outputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -358,13 +358,13 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase TensorDescriptor inputDesc; TensorDescriptor outputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; struct FoldBwdProblemDescription : ProblemDescriptionBase @@ -372,13 +372,13 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase FoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, const TensorDescriptor& doutputDesc_, const int32_t* kernel_size_, - const int kernel_size_size_, + const int32_t kernel_size_size_, const int32_t* stride_, - const int stride_size_, + const int32_t stride_size_, const int32_t* padding_, - const int padding_size_, + const int32_t padding_size_, const int32_t* dilation_, - const int dilation_size_) + const int32_t dilation_size_) : dinputDesc(dinputDesc_), doutputDesc(doutputDesc_), kernel_size(kernel_size_), @@ -404,7 +404,7 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase return false; #endif } - int spatial_dim_size = doutputDesc.GetSize() - 2; + int32_t spatial_dim_size = doutputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { @@ -467,13 +467,13 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase TensorDescriptor dinputDesc; TensorDescriptor doutputDesc; const int32_t* kernel_size; - const int kernel_size_size; + const int32_t kernel_size_size; const int32_t* stride; - const int stride_size; + const int32_t stride_size; const int32_t* padding; - 
const int padding_size; + const int32_t padding_size; const int32_t* dilation; - const int dilation_size; + const int32_t dilation_size; }; } // namespace fold diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index ee9316a3bf..0e5c4b0824 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -61,7 +61,7 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte const int32_t N = static_cast(output_grad_dims[0]); const int32_t C = static_cast(output_grad_dims[1]); - int spatial_dim_size = output_grad_dims.size() - 2; + int32_t spatial_dim_size = output_grad_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) @@ -105,7 +105,7 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -115,22 +115,6 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_grad_dims = deref(params.dinputDesc).GetLengths(); auto output_grad_dims = deref(params.doutputDesc).GetLengths(); - int spatial_dim_size = output_grad_dims.size() - 2; - const int32_t N = static_cast(output_grad_dims[0]); - const int32_t C = static_cast(output_grad_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) - { - P *= params.kernel_size[i]; - int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / - params.stride[i] + - 1; - L *= l; - ls.push_back(l); - } - int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; int32_t stride_h = params.stride[0]; diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 9ecf89c6e6..1ec6e9f4ab 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include #include #include #include @@ -94,7 +95,7 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -104,9 +105,7 @@ ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_dims = deref(params.inputDesc).GetLengths(); auto output_dims = deref(params.outputDesc).GetLengths(); - int spatial_dim_size = output_dims.size() - 2; - const int32_t N = static_cast(output_dims[0]); - const int32_t C = static_cast(output_dims[1]); + int32_t spatial_dim_size = output_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 7b75679263..c673c4497a 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -94,7 +94,7 @@ ConvSolution 
UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C, H, W](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -105,8 +105,6 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con auto output_grad_dims = deref(params.doutputDesc).GetLengths(); int spatial_dim_size = input_grad_dims.size() - 2; - const int32_t N = static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) @@ -130,8 +128,6 @@ ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& con int32_t dilation_w = params.dilation[1]; int32_t LH = ls[0]; int32_t LW = ls[1]; - int32_t H = static_cast(input_grad_dims[2]); - int32_t W = static_cast(input_grad_dims[3]); kernel(params.doutput, params.dinput, diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 35e29df8bb..68402e6a2c 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -105,7 +105,7 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con result.construction_params.push_back(kernel); } - result.invoker_factory = [](const std::vector& kernels) { + result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); @@ -115,22 +115,6 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con auto input_dims = deref(params.inputDesc).GetLengths(); auto output_dims = deref(params.outputDesc).GetLengths(); - int spatial_dim_size = input_dims.size() - 2; - const int32_t N = static_cast(input_dims[0]); - const int32_t C = static_cast(input_dims[1]); - int32_t P = 1, L = 1; - std::vector ls; - for(int i = 0; i < spatial_dim_size; ++i) - { - P *= params.kernel_size[i]; - int32_t l = (static_cast(input_dims[i + 2]) + 2 * params.padding[i] - - params.dilation[i] * (params.kernel_size[i] - 1) - 1) / - params.stride[i] + - 1; - L *= l; - ls.push_back(l); - } - int32_t kernel_size_h = params.kernel_size[0]; int32_t kernel_size_w = params.kernel_size[1]; int32_t stride_h = params.stride[0]; From e726fc1d1f9ddbabb2a4e0f3d9504277ccc9c34b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 17 Jul 2024 07:59:56 +0000 Subject: [PATCH 14/46] githook format --- src/include/miopen/fold/invoke_params.hpp | 8 ++++---- src/solver/fold/fold_backward.cpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp index 20e7859b35..246ccfb401 100644 --- a/src/include/miopen/fold/invoke_params.hpp +++ b/src/include/miopen/fold/invoke_params.hpp @@ -53,10 +53,10 @@ struct InvokeParams : public miopen::InvokeParams const int32_t* stride = nullptr; const int32_t* padding = nullptr; const int32_t* dilation = nullptr; - int32_t kernel_size_size = 0; - int32_t stride_size = 0; - int32_t padding_size = 0; - int32_t dilation_size = 0; + int32_t kernel_size_size = 0; + int32_t stride_size = 0; + int32_t padding_size = 0; + int32_t dilation_size = 0; std::size_t 
GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 0e5c4b0824..1fe957d408 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -59,8 +59,8 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte auto input_grad_dims = problem.GetDinputDesc().GetLengths(); auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); - const int32_t N = static_cast(output_grad_dims[0]); - const int32_t C = static_cast(output_grad_dims[1]); + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); int32_t spatial_dim_size = output_grad_dims.size() - 2; int32_t P = 1, L = 1; std::vector ls;
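Note: for orientation, a rough sketch of how the host side packages arguments into this InvokeParams before handing them to the solver invoker. The field names are the ones shown in the hunk above; the InvokeType member and the overall pattern are assumptions modeled on other MIOpen operators, not a quote of src/fold.cpp.

#include <miopen/fold/invoke_params.hpp>

// Hedged sketch: building the fold-backward invoke params (assumed pattern).
miopen::fold::InvokeParams MakeFoldBwdParams(const miopen::TensorDescriptor& dinputDesc,
                                             miopen::Data_t dinput,
                                             const miopen::TensorDescriptor& doutputDesc,
                                             miopen::ConstData_t doutput,
                                             const int32_t* kernel_size,
                                             int32_t kernel_size_size,
                                             const int32_t* stride,
                                             int32_t stride_size,
                                             const int32_t* padding,
                                             int32_t padding_size,
                                             const int32_t* dilation,
                                             int32_t dilation_size)
{
    auto params        = miopen::fold::InvokeParams{};
    params.type        = miopen::InvokeType::Run; // assumed base-class member
    params.dinputDesc  = &dinputDesc;             // solvers read these back via deref()
    params.doutputDesc = &doutputDesc;
    params.dinput      = dinput;                  // device buffers
    params.doutput     = doutput;
    params.kernel_size = kernel_size;             // raw int32_t* plus explicit lengths
    params.stride      = stride;
    params.padding     = padding;
    params.dilation    = dilation;
    params.kernel_size_size = kernel_size_size;
    params.stride_size      = stride_size;
    params.padding_size     = padding_size;
    params.dilation_size    = dilation_size;
    return params;
}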
From 8ee286160f08a51be318531311032182c8cb3a12 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 04:54:47 +0000 Subject: [PATCH 15/46] remove useless if else in problem description --- .../miopen/fold/problem_description.hpp | 60 ------------------- 1 file changed, 60 deletions(-) diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp index 3bc7ae91ca..0e3ef29d4b 100644 --- a/src/include/miopen/fold/problem_description.hpp +++ b/src/include/miopen/fold/problem_description.hpp @@ -83,11 +83,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = inputDesc.GetLengths(); const int32_t N = static_cast(input_dims[0]); @@ -109,11 +105,7 @@ struct UnfoldFwdProblemDescription : ProblemDescriptionBase auto output_dims = outputDesc.GetLengths(); if(output_dims != output_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output dimension"); -#else - return false; -#endif } return true; } @@ -122,12 +114,8 @@ { if(inputDesc.GetType() != outputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input tensor and output tensor have mismatched types."); -#else - return false; -#endif } return true; } @@ -181,21 +169,13 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase { if(dinputDesc.GetSize() != 4) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input gradient tensor should be 4D."); -#else - return false; -#endif } int32_t spatial_dim_size = dinputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = dinputDesc.GetLengths(); const int32_t N = static_cast(input_dims[0]); @@ -217,11 +197,7 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase auto output_dims = doutputDesc.GetLengths(); if(output_dims != output_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output gradient dimension"); -#else - return false; -#endif } return true; } @@ -230,13 +206,9 @@ struct UnfoldBwdProblemDescription : ProblemDescriptionBase { if(dinputDesc.GetType() != doutputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW( miopenStatusBadParm, "Unfold: The input gradient tensor and output gradient tensor have mismatched types."); -#else - return false; -#endif } return true; } @@ -290,21 +262,13 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase { if(outputDesc.GetSize() != 4) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: The output tensor should be 4D."); -#else - return false; -#endif } int32_t spatial_dim_size = outputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = inputDesc.GetLengths(); auto output_dims = outputDesc.GetLengths(); @@ -326,11 +290,7 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase static_cast(N), static_cast(C * P), static_cast(L)}; if(input_dims != input_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input dimension"); -#else - return false; -#endif } return true; } @@ -339,12 +299,8 @@ struct FoldFwdProblemDescription : ProblemDescriptionBase { if(inputDesc.GetType() != outputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: The input tensor and output tensor have mismatched types."); -#else - return false; -#endif } return true; } @@ -398,21 +354,13 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase { if(doutputDesc.GetSize() != 4) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: The output gradient tensor should be 4D."); -#else - return false; -#endif } int32_t spatial_dim_size = doutputDesc.GetSize() - 2; if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || padding_size != spatial_dim_size || dilation_size != spatial_dim_size) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); -#else - return false; -#endif } auto input_dims = dinputDesc.GetLengths(); auto output_dims = doutputDesc.GetLengths(); @@ -434,11 +382,7 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase static_cast(N), static_cast(C * P), static_cast(L)}; if(input_dims != input_dims_desired) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input gradient dimension"); -#else - return false; -#endif } return true; } @@ -447,13 +391,9 @@ struct FoldBwdProblemDescription : ProblemDescriptionBase { if(dinputDesc.GetType() != doutputDesc.GetType()) { -#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG MIOPEN_THROW( miopenStatusBadParm, "Fold: The input gradient tensor and output gradient tensor have mismatched types."); -#else - return false; -#endif } return true; }
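Note: to make these checks concrete, take the unfold driver defaults: input (N, C, H, W) = (2, 5, 3, 4) with kernel_size (2, 3), stride (1, 1), padding (0, 0), dilation (1, 1). Then P = 2 * 3 = 6, l_h = (3 - 1 - 1) / 1 + 1 = 2 and l_w = (4 - 2 - 1) / 1 + 1 = 2, hence L = 4, so the only output descriptor UnfoldFwdProblemDescription accepts is (N, C * P, L) = (2, 30, 4). After this change an ill-shaped descriptor fails with miopenStatusBadParm in every build configuration, where release builds previously just returned false silently.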
From f3dea16ca398ffa36db95f75d5ef000ad2b74721 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 08:36:42 +0000 Subject: [PATCH 16/46] add more tensor_layout_t constructors and update kernel to use get_tensor_view_idx --- src/kernels/MIOpenUnfold.cpp | 23 ++++++++--------------- src/kernels/tensor_view.hpp | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 84a272ef80..100d8d7c42 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -76,14 +76,11 @@ __device__ void unfoldForward4D(const DTYPE* __restrict__ input, DTYPE x = 0; if(0 <= h && h < H && 0 <= w && w < W) { - long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + input_tv.stride[1] * c + - input_tv.stride[0] * n; - x = input[input_idx]; + tensor_layout_t<4> input_layout(input_tv, n, c, h, w); + x = input[input_tv.get_tensor_view_idx(input_layout)]; } - - long output_idx = - output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; - output[output_idx] = x; + tensor_layout_t<3> output_layout(output_tv, n, c * P + p, l); + output[output_tv.get_tensor_view_idx(output_layout)] = x; } extern "C" __global__ void UnfoldForward4D(const FLOAT* __restrict__ input, @@ -182,16 +179,12 @@ __device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, continue; if(lw < 0 || LW <= lw) continue; - long output_grad_idx = output_grad_tv.stride[2] * (lh * LW + lw) + - output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + - output_grad_tv.stride[0] * n; - sum += CVT_FLOAT2ACCUM(output_grad[output_grad_idx]); + tensor_layout_t<3> output_grad_layout(output_grad_tv, n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); + sum += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); } } - - long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + - input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; - input_grad[input_grad_idx] = CVT_ACCUM2FLOAT(sum); + tensor_layout_t<4> input_grad_layout(input_grad_tv, n, c, h , w); + input_grad[input_grad_tv.get_tensor_view_idx(input_grad_layout)] = CVT_ACCUM2FLOAT(sum); } extern "C" __global__ void UnfoldBackward4D(const FLOAT* __restrict__ output_grad, diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..b62bb5ef33 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,6 +72,40 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + { + static_assert(N == 5); + layout[0] = n; + layout[1] = c; + layout[2] = d; + layout[3] = h; + layout[4] = w; + } + + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t h, uint64_t w) + { + static_assert(N == 4); + layout[0] = n; + layout[1] = c; + layout[2] = h; + layout[3] = w; + } + + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t h, uint64_t w) + { + static_assert(N == 3); + layout[0] = n; + layout[1] = h; + layout[2] = w; + } + + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t w) + { + static_assert(N == 2); + layout[0] = n; + layout[1] = w; + } + uint64_t layout[N]; };
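Note: a small usage sketch of the constructors added above. offset_of is a hypothetical helper, not part of the patch, and get_tensor_view_idx is assumed to return the linear element offset exactly as the rewritten kernel code uses it.

// Addressing element (n, c, h, w) of a strided 4D view; equivalent to the
// hand-written arithmetic the kernel hunks above delete:
//   tv.stride[0] * n + tv.stride[1] * c + tv.stride[2] * h + tv.stride[3] * w
__device__ uint64_t offset_of(const tensor_view_t<4>& tv,
                              uint64_t n, uint64_t c, uint64_t h, uint64_t w)
{
    tensor_layout_t<4> where(tv, n, c, h, w); // resolves to the N == 4 constructor
    return tv.get_tensor_view_idx(where);
}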
From 4a832966d63c9699f2751354aae0e1e16b4c5147 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 08:37:34 +0000 Subject: [PATCH 17/46] githook format --- src/kernels/MIOpenUnfold.cpp | 8 +++++--- src/kernels/tensor_view.hpp | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index 100d8d7c42..24eebcc80f 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -179,11 +179,13 @@ __device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, continue; if(lw < 0 || LW <= lw) continue; - tensor_layout_t<3> output_grad_layout(output_grad_tv, n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); - sum += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); + tensor_layout_t<3> output_grad_layout( + output_grad_tv, n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); + sum += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); } } - tensor_layout_t<4> input_grad_layout(input_grad_tv, n, c, h , w); + tensor_layout_t<4> input_grad_layout(input_grad_tv, n, c, h, w); input_grad[input_grad_tv.get_tensor_view_idx(input_grad_layout)] = CVT_ACCUM2FLOAT(sum); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index b62bb5ef33..abaa052142 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,7 +72,12 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + constexpr tensor_layout_t(const tensor_view_t& tensor_view, + uint64_t n, + uint64_t c, + uint64_t d, + uint64_t h, + uint64_t w) { static_assert(N == 5); layout[0] = n; @@ -82,7 +87,8 @@ struct tensor_layout_t layout[4] = w; } - constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t h, uint64_t w) + constexpr tensor_layout_t( + const tensor_view_t& tensor_view, uint64_t n, uint64_t c, uint64_t h, uint64_t w) { static_assert(N == 4); layout[0] = n; @@ -91,14 +97,17 @@ struct tensor_layout_t layout[3] = w; } - constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t h, uint64_t w) + constexpr tensor_layout_t(const tensor_view_t& tensor_view, + uint64_t n, + uint64_t h, + uint64_t w) { static_assert(N == 3); layout[0] = n; layout[1] = h; layout[2] = w; } - + constexpr tensor_layout_t(const tensor_view_t& tensor_view, uint64_t n, uint64_t w) { static_assert(N == 2); From 299117b4e20374a40ffc03bd0aba9a0c4e764903 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Thu, 18 Jul 2024 08:39:49 +0000 Subject: [PATCH 18/46] remove {} --- src/solver/fold/unfold_forward.cpp | 56 ++++++++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 68402e6a2c..be8692f111 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -75,35 +75,33 @@ ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& con ls.push_back(l); } - { - auto kernel = KernelInfo{}; - kernel.kernel_file = "MIOpenUnfold.cpp"; - kernel.kernel_name = 
"UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { From e76095e8fc1771a82978af13018b538b396877d1 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 08:43:14 +0000 Subject: [PATCH 19/46] update code as comments --- docs/reference/index.rst | 2 +- driver/CMakeLists.txt | 2 +- driver/fold_driver.hpp | 6 ++-- driver/unfold_driver.hpp | 6 ++-- src/CMakeLists.txt | 4 +-- src/fold/problem_description.cpp | 8 ----- src/solver/fold/fold_backward.cpp | 56 +++++++++++++++---------------- test/gtest/fold.cpp | 32 ++++++++++++++---- test/gtest/fold.hpp | 3 -- test/gtest/unfold.cpp | 30 +++++++++++++---- 10 files changed, 87 insertions(+), 62 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index c6ab9521b7..cd1cfee6d2 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -32,4 +32,4 @@ The MIOpen API library is structured as follows: * :doc:`GroupNorm <../doxygen/html/group__groupnorm>` (experimental) * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`Argmax<./argmax>` (experimental) - * :doc:`Fold <./fold>` (experimental) + * :doc:`Fold <./group__fold>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index c115cf435f..a51a8ec859 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -51,8 +51,8 @@ add_executable(MIOpenDriver dm_softmax.cpp dm_sum.cpp dm_tensorop.cpp - dm_unfold.cpp dm_fold.cpp + dm_unfold.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp index 1468a77e0d..c034beeaee 100644 --- a/driver/fold_driver.hpp +++ b/driver/fold_driver.hpp @@ -133,7 +133,7 @@ int FoldDriver::ParseCmdLineArgs(int argc, char* argv[]) template int FoldDriver::GetandSetData() { - std::vector input_length = GetTensorLengthsFromCmdLine(); + std::vector input_length = inflags.GetValueTensor("DimLengths").lengths; output_size = GetVectorInt32tFromCmdLine("outputSize"); kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); @@ -161,8 +161,8 @@ int FoldDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", 'F', "1", "Run Fold Forward (Default=1) or both Forward and Backward (0)", "int"); - inflags.AddInputFlag( - "DimLengths", 'D', "3,12,12", "The dimensional lengths of the input tensor", "string"); + inflags.AddTensorFlag( + "DimLengths", 'D', "3x12x12", "The dimensional lengths of the input tensor"); inflags.AddInputFlag("outputSize", 'o', "4,5", "Output Size (Default=2,3)", "str"); inflags.AddInputFlag("kernelSize", 'k', "2,2", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride 
(Default=1,1)", "str"); diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp index e9a3665eaa..8e4be26f3f 100644 --- a/driver/unfold_driver.hpp +++ b/driver/unfold_driver.hpp @@ -132,7 +132,7 @@ int UnfoldDriver::ParseCmdLineArgs(int argc, char* argv[]) template int UnfoldDriver::GetandSetData() { - std::vector input_length = GetTensorLengthsFromCmdLine(); + std::vector input_length = inflags.GetValueTensor("DimLengths").lengths; kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); stride = GetVectorInt32tFromCmdLine("stride"); @@ -167,8 +167,8 @@ int UnfoldDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int"); - inflags.AddInputFlag( - "DimLengths", 'D', "2,5,3,4", "The dimensional lengths of the input tensor", "string"); + inflags.AddTensorFlag( + "DimLengths", 'D', "2x5x3x4", "The dimensional lengths of the input tensor"); inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str"); inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bd057795a3..6ed6638122 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -258,10 +258,10 @@ set( MIOpen_Source solver/conv_winoRxS.cpp solver/conv_winoRxS_fused.cpp solver/fft.cpp - solver/fold/unfold_forward.cpp - solver/fold/unfold_backward.cpp solver/fold/fold_forward.cpp solver/fold/fold_backward.cpp + solver/fold/unfold_forward.cpp + solver/fold/unfold_backward.cpp solver/gemm.cpp solver/gemm_bwd.cpp solver/gemm_wrw.cpp diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index ce34de1a16..7cf628c170 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -36,7 +36,6 @@ namespace fold { NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = inputDesc.GetType(); - auto output_dtype = outputDesc.GetType(); auto size = inputDesc.GetElementSize(); auto in_dims = inputDesc.GetLengths(); @@ -44,7 +43,6 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const ss << "Unfold_fwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_dims"; for(auto val : in_dims) @@ -62,7 +60,6 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = dinputDesc.GetType(); - auto output_dtype = doutputDesc.GetType(); auto size = dinputDesc.GetElementSize(); auto in_dims = dinputDesc.GetLengths(); @@ -70,7 +67,6 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const ss << "Unfold_bwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_grad_dims"; for(auto val : in_dims) @@ -88,7 +84,6 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = inputDesc.GetType(); - auto output_dtype = outputDesc.GetType(); auto size = inputDesc.GetElementSize(); auto in_dims = inputDesc.GetLengths(); auto out_dims = outputDesc.GetLengths(); @@ -97,7 +92,6 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const ss << "Fold_fwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_dims"; for(auto val : in_dims) @@ -120,7 +114,6 @@ 
NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const { auto input_dtype = dinputDesc.GetType(); - auto output_dtype = doutputDesc.GetType(); auto size = dinputDesc.GetElementSize(); auto in_dims = dinputDesc.GetLengths(); auto out_dims = doutputDesc.GetLengths(); @@ -129,7 +122,6 @@ NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const ss << "Fold_bwd"; ss << "i_dtype" << input_dtype; - ss << "o_dtype" << output_dtype; ss << "size" << size; ss << "in_grad_dims"; for(auto val : in_dims) diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 1fe957d408..d07362ace3 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -75,35 +75,33 @@ ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& conte ls.push_back(l); } - { - auto kernel = KernelInfo{}; - kernel.kernel_file = "MIOpenUnfold.cpp"; - kernel.kernel_name = "UnfoldForward4D"; - - const auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, - {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, - {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, - {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, - }; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - - size_t xlocalsize = LOCAL_SIZE; - size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); - size_t ylocalsize = 1; - size_t ygridsize = 1; - size_t zlocalsize = 1; - size_t zgridsize = 1; - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 6bd24d931f..5d96199515 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -29,9 +29,21 @@ #include "tensor_holder.hpp" #include +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace fold { + +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + struct FoldForwardTestFloat32 : FoldFwdTest { }; @@ -56,10 +68,13 @@ struct FoldBackwardTestBFloat16 : FoldBwdTest { }; }; // namespace fold + using namespace 
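Note: FoldBwd above deliberately builds its kernel from MIOpenUnfold.cpp with kernel_name UnfoldForward4D — fold backward is computed by running unfold forward on the incoming gradient. This reflects the adjoint relationship between the two operators, sketched here informally:

// Fold sums overlapping patches back into an image; unfold extracts them.
// Mathematically (not a statement about which kernel each solver launches):
//   grad_input of Fold(x)   == UnfoldForward(grad_output)
//   grad_input of Unfold(x) == FoldForward(grad_output)   // overlaps summed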
fold; + TEST_P(FoldForwardTestFloat32, FoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -76,7 +91,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestFloat16, FoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -93,7 +109,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestBFloat16, FoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -110,7 +127,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -127,7 +145,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -144,7 +163,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 7e71c5ce2f..0f9f9e040b 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -30,9 +30,6 @@ #include "random.hpp" #include "tensor_holder.hpp" #include "verify.hpp" -#include -#include -#include #include #include #include diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index b97c96d567..22a67c4657 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -29,9 +29,21 @@ #include "tensor_holder.hpp" #include +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace unfold { + +std::string GetFloatArg() +{ + const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + struct UnfoldForwardTestFloat32 : UnfoldFwdTest { }; @@ -59,7 +71,8 @@ struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest using namespace unfold; TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -76,7 +89,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -93,7 +107,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == 
"--float"))) { RunTest(); Verify(); @@ -110,7 +125,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -127,7 +143,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); @@ -144,7 +161,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) { - if(miopen::IsEnabled(ENV(MIOPEN_TEST_ALL))) + if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || + (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) { RunTest(); Verify(); From 526f7728430b21ba072e37c218d67b2ebef49126 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 08:44:04 +0000 Subject: [PATCH 20/46] githook format --- src/fold/problem_description.cpp | 28 ++++++++++++++-------------- test/gtest/fold.cpp | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp index 7cf628c170..a59b460ed7 100644 --- a/src/fold/problem_description.cpp +++ b/src/fold/problem_description.cpp @@ -35,9 +35,9 @@ namespace fold { NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = inputDesc.GetType(); - auto size = inputDesc.GetElementSize(); - auto in_dims = inputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); std::ostringstream ss; @@ -59,9 +59,9 @@ NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = dinputDesc.GetType(); - auto size = dinputDesc.GetElementSize(); - auto in_dims = dinputDesc.GetLengths(); + auto input_dtype = dinputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); std::ostringstream ss; @@ -83,10 +83,10 @@ NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = inputDesc.GetType(); - auto size = inputDesc.GetElementSize(); - auto in_dims = inputDesc.GetLengths(); - auto out_dims = outputDesc.GetLengths(); + auto input_dtype = inputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + auto out_dims = outputDesc.GetLengths(); std::ostringstream ss; @@ -113,10 +113,10 @@ NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const { - auto input_dtype = dinputDesc.GetType(); - auto size = dinputDesc.GetElementSize(); - auto in_dims = dinputDesc.GetLengths(); - auto out_dims = doutputDesc.GetLengths(); + auto input_dtype = dinputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + auto out_dims = doutputDesc.GetLengths(); std::ostringstream ss; diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 5d96199515..7368c6920e 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -33,7 +33,7 @@ 
MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) namespace fold { - + std::string GetFloatArg() { const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); From 2a3d2b0565d0aec02064ef1c9c94373ad66def20 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 08:46:15 +0000 Subject: [PATCH 21/46] cpu_fold -> cpu_unfold --- test/{cpu_fold.hpp => cpu_unfold.hpp} | 0 test/gtest/fold.hpp | 2 +- test/gtest/unfold.hpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename test/{cpu_fold.hpp => cpu_unfold.hpp} (100%) diff --git a/test/cpu_fold.hpp b/test/cpu_unfold.hpp similarity index 100% rename from test/cpu_fold.hpp rename to test/cpu_unfold.hpp diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 0f9f9e040b..a92b09d6a7 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -24,7 +24,7 @@ * *******************************************************************************/ #include "../driver/tensor_driver.hpp" -#include "cpu_fold.hpp" +#include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" #include "random.hpp" diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 3a839024b2..51d0c9cf30 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -24,7 +24,7 @@ * *******************************************************************************/ #include "../driver/tensor_driver.hpp" -#include "cpu_fold.hpp" +#include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" #include "random.hpp" From 27e26c35b5d2fedee5bee62202e300f4d2665d40 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 09:08:46 +0000 Subject: [PATCH 22/46] update code as comments --- driver/mloUnfoldHost.hpp | 6 +++--- test/cpu_unfold.hpp | 6 +++--- test/gtest/fold.hpp | 4 +++- test/gtest/unfold.hpp | 4 +++- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index fcfd5f4a6b..6178946bd1 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -64,7 +64,7 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, L *= l; ls.push_back(l); } - [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -72,7 +72,7 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, int32_t padding_w = padding[1]; int32_t dilation_h = dilation[0]; int32_t dilation_w = dilation[1]; - [[maybe_unused]] int32_t LH = ls[0]; + int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -124,7 +124,7 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, int spatial_dim_size = input_size - 2; const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - [[maybe_unused]] int32_t P = 1, L = 1; + int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { diff --git a/test/cpu_unfold.hpp b/test/cpu_unfold.hpp index 373cc30917..6a3b5f3d2e 100644 --- a/test/cpu_unfold.hpp +++ b/test/cpu_unfold.hpp @@ -68,7 +68,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, ls.push_back(l); } - [[maybe_unused]] int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -76,7 +76,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t padding_w = padding[1]; int32_t 
dilation_h = dilation[0]; int32_t dilation_w = dilation[1]; - [[maybe_unused]] int32_t LH = ls[0]; + int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -129,7 +129,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - [[maybe_unused]] int32_t P = 1, L = 1; + int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index a92b09d6a7..02d9e42e17 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -113,7 +113,8 @@ struct FoldTestCase }; std::vector FoldTestConfigs() -{ // n c d h w padding +{ + // clang-format: off return { {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, @@ -123,6 +124,7 @@ std::vector FoldTestConfigs() {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, true}, }; + // clang-format: on } template diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 51d0c9cf30..4bb790b1f5 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -112,7 +112,8 @@ struct UnfoldTestCase }; std::vector UnfoldTestConfigs() -{ // n c d h w padding +{ + // clang-format: off return { {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true}, {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true}, @@ -121,6 +122,7 @@ std::vector UnfoldTestConfigs() {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true}, {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true}, }; + // clang-format: on } template From 366e350ebfef0c3722f81a8e8a400648454628ab Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 22 Jul 2024 09:09:33 +0000 Subject: [PATCH 23/46] githook format --- driver/mloUnfoldHost.hpp | 30 +++++++++++++++--------------- test/cpu_unfold.hpp | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 6178946bd1..3334cd55fe 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -65,18 +65,18 @@ int32_t mloUnFoldFwd4DRunHost(Tgpu* input, ls.push_back(l); } int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; @@ -120,10 +120,10 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, auto input_grad_dims = miopen::deref(dinputDesc).GetLengths(); auto input_size = 
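To make the ls computation in these hosts concrete, here is the count worked through for the first entry of UnfoldTestConfigs above:

// Per spatial dim: l = (size + 2*padding - dilation*(kernel-1) - 1)/stride + 1.
// For (N,C,H,W) = (2,5,3,4), kernel (2,3), stride (1,1), padding (0,0), dilation (1,1):
//   LH = (3 + 0 - 1*(2-1) - 1)/1 + 1 = 2
//   LW = (4 + 0 - 1*(3-1) - 1)/1 + 1 = 2,   L = LH*LW = 4,   P = kh*kw = 6
// so unfold maps (2,5,3,4) to (N, C*P, L) = (2, 30, 4), and the solvers launch
// AlignUp(N * C * P * L, LOCAL_SIZE) work-items over that flattened index space.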
miopen::deref(dinputDesc).GetSize(); - const int LOCAL_SIZE = 256; - int spatial_dim_size = input_size - 2; - const int32_t N = static_cast(input_grad_dims[0]); - const int32_t C = static_cast(input_grad_dims[1]); + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); int32_t P = 1, L = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) diff --git a/test/cpu_unfold.hpp b/test/cpu_unfold.hpp index 6a3b5f3d2e..46f7552083 100644 --- a/test/cpu_unfold.hpp +++ b/test/cpu_unfold.hpp @@ -69,18 +69,18 @@ void cpu_unfold_fwd_4d(tensor input_tensor, } int32_t kernel_size_h = kernel_size[0]; - int32_t kernel_size_w = kernel_size[1]; - int32_t stride_h = stride[0]; - int32_t stride_w = stride[1]; - int32_t padding_h = padding[0]; - int32_t padding_w = padding[1]; - int32_t dilation_h = dilation[0]; - int32_t dilation_w = dilation[1]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; int32_t LH = ls[0]; - int32_t LW = ls[1]; - int32_t H = static_cast(input_dims[2]); - int32_t W = static_cast(input_dims[3]); - int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; par_ford(work_size)([&](int gid) { int ncp = gid / L, l = gid % L; int nc = ncp / P, p = ncp % P; From 2b3bd1f1d8602274aefdfbcb501b9751d4ccf806 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 24 Jul 2024 09:26:07 +0000 Subject: [PATCH 24/46] githook format --- driver/driver.hpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 4b1c831ad9..a22e10e572 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,14 +169,15 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " - "fold[bfp16|fp16]\n"); + printf( + "Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " + "pool[fp16], lrn[fp16], " + "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " + "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,9 +207,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adam" && arg != "adamfp16" && arg != "ampadam" && 
arg != "reduceextreme" && arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && - arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "unfold" && arg != "unfoldfp16" && - arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && - arg != "--version") + arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "unfold" && + arg != "unfoldfp16" && arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && + arg != "foldbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); From 5c506b4269a16b480645c5c917d3df70d9dc6de7 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 29 Jul 2024 08:26:20 +0000 Subject: [PATCH 25/46] Update gtest code --- test/cpu_unfold.hpp | 9 +++------ test/gtest/fold.cpp | 26 +++++++++++++------------- test/gtest/unfold.cpp | 26 +++++++++++++------------- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/test/cpu_unfold.hpp b/test/cpu_unfold.hpp index 46f7552083..201917d4e0 100644 --- a/test/cpu_unfold.hpp +++ b/test/cpu_unfold.hpp @@ -43,7 +43,7 @@ void cpu_unfold_fwd_4d(tensor input_tensor, { auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc); auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc); - auto input_size = input_tensor.desc.GetSize(); + auto input_size = input_tensor.desc.GetNumDims(); auto input_dims = input_tensor.desc.GetLengths(); auto input = input_tensor.data.data(); @@ -68,7 +68,6 @@ void cpu_unfold_fwd_4d(tensor input_tensor, ls.push_back(l); } - int32_t kernel_size_h = kernel_size[0]; int32_t kernel_size_w = kernel_size[1]; int32_t stride_h = stride[0]; int32_t stride_w = stride[1]; @@ -76,7 +75,6 @@ void cpu_unfold_fwd_4d(tensor input_tensor, int32_t padding_w = padding[1]; int32_t dilation_h = dilation[0]; int32_t dilation_w = dilation[1]; - int32_t LH = ls[0]; int32_t LW = ls[1]; int32_t H = static_cast(input_dims[2]); int32_t W = static_cast(input_dims[3]); @@ -117,7 +115,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, { auto input_grad_tv = miopen::get_inner_expanded_tv<4>(ref_dinput_tensor.desc); auto output_grad_tv = miopen::get_inner_expanded_tv<3>(doutput_tensor.desc); - auto input_size = ref_dinput_tensor.desc.GetSize(); + auto input_size = ref_dinput_tensor.desc.GetNumDims(); auto input_grad_dims = ref_dinput_tensor.desc.GetLengths(); auto input_grad = ref_dinput_tensor.data.data(); @@ -129,7 +127,7 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - int32_t P = 1, L = 1; + int32_t P = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { @@ -138,7 +136,6 @@ void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1; - L *= l; ls.push_back(l); } diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 7368c6920e..15ae2572c4 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -36,7 +36,7 @@ namespace fold { std::string GetFloatArg() { - const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); if(tmp.empty()) { return ""; @@ -73,8 +73,8 @@ using namespace fold; TEST_P(FoldForwardTestFloat32, FoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + 
if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -91,8 +91,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestFloat16, FoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -109,8 +109,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldForwardTestBFloat16, FoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); @@ -127,8 +127,8 @@ INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -145,8 +145,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -163,8 +163,8 @@ INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index 22a67c4657..dc7cbdce38 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -36,7 +36,7 @@ namespace unfold { std::string GetFloatArg() { - const auto& tmp = miopen::GetStringEnv(ENV(MIOPEN_TEST_FLOAT_ARG)); + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); if(tmp.empty()) { return ""; @@ -71,8 +71,8 @@ struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest using namespace unfold; TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -89,8 +89,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -107,8 +107,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); @@ -125,8 +125,8 @@ 
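Note on the gating introduced here: with MIOPEN_TEST_ALL unset every suite runs, while under a full MIOPEN_TEST_ALL=1 sweep each suite runs only when MIOPEN_TEST_FLOAT_ARG matches its dtype (--float, --half, --bfloat16), so one sweep exercises each precision exactly once. An illustrative invocation (the test binary name is an assumption):

MIOPEN_TEST_ALL=1 MIOPEN_TEST_FLOAT_ARG=--half ./bin/test_unfold --gtest_filter='*Unfold*'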
INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) { RunTest(); Verify(); @@ -143,8 +143,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) { RunTest(); Verify(); @@ -161,8 +161,8 @@ INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) { - if(miopen::IsUnset(ENV(MIOPEN_TEST_ALL)) || - (miopen::IsEnabled(ENV(MIOPEN_TEST_ALL)) && (GetFloatArg() == "--float"))) + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) { RunTest(); Verify(); From e2631d8bc488bcfee0d3af508075fcea6afb097c Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 29 Jul 2024 09:17:33 +0000 Subject: [PATCH 26/46] githook format --- driver/driver.hpp | 19 ++++++++++--------- driver/mloUnfoldHost.hpp | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 0bace22ab1..902389b977 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,13 +169,14 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " + printf( + "Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " + "pool[fp16], lrn[fp16], " + "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " + "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " "fold[bfp16|fp16], getitem[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -207,8 +208,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "unfold" && arg != "unfoldfp16" && arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && - arg != "foldbfp16" && arg != "getitem" && - arg != "getitemfp16" && arg != "getitembfp16" && arg != "--version") + arg != "foldbfp16" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp index 
f2e1d3f6e6..6204ea2bd9 100644 --- a/driver/mloUnfoldHost.hpp +++ b/driver/mloUnfoldHost.hpp @@ -121,7 +121,7 @@ int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, int spatial_dim_size = input_size - 2; const int32_t N = static_cast(input_grad_dims[0]); const int32_t C = static_cast(input_grad_dims[1]); - int32_t P = 1; + int32_t P = 1; std::vector ls; for(int i = 0; i < spatial_dim_size; ++i) { From 7bb583b51b954f1b65f1177edea21e9178ece556 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 31 Jul 2024 08:28:38 +0000 Subject: [PATCH 27/46] git hook format --- src/kernels/tensor_view.hpp | 10 ++-------- src/solver/fold/fold_backward.cpp | 5 +++-- src/solver/fold/fold_forward.cpp | 5 +++-- src/solver/fold/unfold_backward.cpp | 5 +++-- src/solver/fold/unfold_forward.cpp | 5 +++-- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index 1e6491fadf..1b29099c2b 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,11 +72,7 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(uint64_t n, - uint64_t c, - uint64_t d, - uint64_t h, - uint64_t w) + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) { static_assert(N == 5); layout[0] = n; @@ -95,9 +91,7 @@ struct tensor_layout_t layout[3] = w; } - constexpr tensor_layout_t(uint64_t n, - uint64_t h, - uint64_t w) + constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) { static_assert(N == 3); layout[0] = n; diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index 6d0e0ab021..b952e5375c 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -42,8 +42,9 @@ namespace solver { namespace fold { -bool FoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::FoldBwdProblemDescription& problem) const +bool FoldBwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::FoldBwdProblemDescription& problem) const { return true; } diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 63f6130761..17fb11180c 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -43,8 +43,9 @@ namespace solver { namespace fold { -bool FoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::FoldFwdProblemDescription& problem) const +bool FoldFwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::FoldFwdProblemDescription& problem) const { return true; } diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index 02b44a5339..da11969c64 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -42,8 +42,9 @@ namespace solver { namespace fold { -bool UnfoldBwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::UnfoldBwdProblemDescription& problem) const +bool UnfoldBwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::UnfoldBwdProblemDescription& problem) const { return true; } diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index d0e3f53e8c..54e39809d6 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -42,8 +42,9 @@ namespace 
solver { namespace fold { -bool UnfoldFwd::IsApplicable([[maybe_unused]] const ExecutionContext& /*context*/, - [[maybe_unused]] const miopen::fold::UnfoldFwdProblemDescription& problem) const +bool UnfoldFwd::IsApplicable( + [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const miopen::fold::UnfoldFwdProblemDescription& problem) const { return true; } From a6256e752b3cdf4372c5a6473cb17c897e57bf70 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 31 Jul 2024 10:42:15 +0000 Subject: [PATCH 28/46] add MIOPEN_INTERNALS_EXPORT --- src/include/miopen/fold.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 25f4aefa8d..454e067c5e 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -23,8 +23,8 @@ * SOFTWARE. * *******************************************************************************/ -#ifndef MIOPEN_INSTANCE_NORM_HPP_ -#define MIOPEN_INSTANCE_NORM_HPP_ +#ifndef MIOPEN_FOLD_HPP_ +#define MIOPEN_FOLD_HPP_ #include @@ -33,7 +33,7 @@ namespace miopen { struct Handle; struct TensorDescriptor; -miopenStatus_t UnfoldForward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, const TensorDescriptor& outputDesc, @@ -47,7 +47,7 @@ miopenStatus_t UnfoldForward(Handle& handle, const int32_t* dilation, int32_t dilation_size); -miopenStatus_t UnfoldBackward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, Data_t dinput, const TensorDescriptor& doutputDesc, @@ -61,7 +61,7 @@ miopenStatus_t UnfoldBackward(Handle& handle, const int32_t* dilation, int32_t dilation_size); -miopenStatus_t FoldForward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, const TensorDescriptor& outputDesc, @@ -75,7 +75,7 @@ miopenStatus_t FoldForward(Handle& handle, const int32_t* dilation, int32_t dilation_size); -miopenStatus_t FoldBackward(Handle& handle, +MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldBackward(Handle& handle, const TensorDescriptor& dinputDesc, Data_t dinput, const TensorDescriptor& doutputDesc, @@ -89,4 +89,4 @@ miopenStatus_t FoldBackward(Handle& handle, const int32_t* dilation, int32_t dilation_size); } // namespace miopen -#endif // MIOPEN_INSTANCE_NORM_HPP_ +#endif // MIOPEN_FOLD_HPP_ From d359c1f64946135113709e8d0c6ba8665e50cb71 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Wed, 31 Jul 2024 10:42:37 +0000 Subject: [PATCH 29/46] githook format --- src/include/miopen/fold.hpp | 96 ++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp index 454e067c5e..9a8b46bb56 100644 --- a/src/include/miopen/fold.hpp +++ b/src/include/miopen/fold.hpp @@ -34,59 +34,59 @@ struct Handle; struct TensorDescriptor; MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* 
kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldBackward(Handle& handle, - const TensorDescriptor& dinputDesc, - Data_t dinput, - const TensorDescriptor& doutputDesc, - ConstData_t doutput, - const int32_t* kernel_size, - int32_t kernel_size_size, - const int32_t* stride, - int32_t stride_size, - const int32_t* padding, - int32_t padding_size, - const int32_t* dilation, - int32_t dilation_size); + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); } // namespace miopen #endif // MIOPEN_FOLD_HPP_ From 1652dc4def449cba1bc4b3c28adb6574b4175a84 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 02:58:25 +0000 Subject: [PATCH 30/46] resolve conflict --- driver/driver.hpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 4b328682e4..8307602c79 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,25 +169,13 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); -<<<<<<< HEAD - printf( - "Supported Base Arguments: conv[fp16|int8|bfp16|fp8|bfp8], CBAInfer[fp16], " - "pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop[fp16], reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, unfold[bfp16|fp16], " - "fold[bfp16|fp16], getitem[bfp16|fp16]\n"); -======= 
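Given the exported declarations above, a host-side call might look like the following sketch (handle, descriptor, and buffer setup omitted; all names besides the API itself are placeholders):

// Sketch only: drive unfold forward through the internal C++ API of fold.hpp.
// Assumes <vector> is included and handle/inputDesc/input/outputDesc/output exist.
std::vector<int32_t> kernel_size{2, 3}, stride{1, 1}, padding{0, 0}, dilation{1, 1};
miopen::UnfoldForward(handle,
                      inputDesc,  input,   // (N, C, H, W)
                      outputDesc, output,  // (N, C*kh*kw, L)
                      kernel_size.data(), 2,
                      stride.data(),      2,
                      padding.data(),     2,
                      dilation.data(),    2);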
printf("Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); ->>>>>>> origin + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } From 1f6e4a2762ddceb0c5a57d512b7ad2aaaff6aa45 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 02:59:15 +0000 Subject: [PATCH 31/46] githook format --- driver/driver.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index 8307602c79..ddb6667323 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], unfold[bfp16|fp16], fold[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } From 2f9bce7af944b3ca5b03f565aeb69dbf1caae5fe Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 04:56:36 +0000 Subject: [PATCH 32/46] fix git merge dup --- src/solver.cpp | 30 ++++++------------------------ test/gtest/fold.cpp | 2 -- test/gtest/unfold.cpp | 2 -- 3 files changed, 6 insertions(+), 28 deletions(-) diff --git a/src/solver.cpp b/src/solver.cpp index 6839a5f122..47686cd789 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -654,11 +654,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); - Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); - Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); - Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); - Register(registry, ++id, Primitive::Fold, fold::FoldBwd{}.SolverDbId()); - + Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId()); @@ -676,25 +672,11 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Primitive::Fusion, fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); - - Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); - Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); - Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId()); - - Register(registry, ++id, Primitive::Mha, mha::MhaForward{}.SolverDbId()); - Register(registry, ++id, Primitive::Mha, mha::MhaBackward{}.SolverDbId()); - - Register(registry, ++id, Primitive::Cat, cat::CatForward{}.SolverDbId()); - Register(registry, ++id, 
Primitive::Adam, adam::Adam{}.SolverDbId()); - Register(registry, ++id, Primitive::Item, getitem::GetitemBackward{}.SolverDbId()); - - Register(registry, ++id, Primitive::Adam, adam::TransformersAdamW{}.SolverDbId()); - - Register(registry, - ++id, - Primitive::Fusion, - fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), - miopenConvolutionAlgoWinograd); + + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Fold, fold::FoldBwd{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 15ae2572c4..a07d0ea8d8 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -25,8 +25,6 @@ *******************************************************************************/ #include "fold.hpp" -#include "miopen/bfloat16.hpp" -#include "tensor_holder.hpp" #include MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index dc7cbdce38..0c523b1b7f 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -25,8 +25,6 @@ *******************************************************************************/ #include "unfold.hpp" -#include "miopen/bfloat16.hpp" -#include "tensor_holder.hpp" #include MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) From fa6f15a010a1168f31cf6b1f12c291dce2ae8160 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Fri, 2 Aug 2024 04:57:15 +0000 Subject: [PATCH 33/46] githook format --- src/solver.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/solver.cpp b/src/solver.cpp index 47686cd789..282a0930b3 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -654,7 +654,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Softmax, softmax::Softmax{}.SolverDbId()); Register(registry, ++id, Primitive::Softmax, softmax::AttnSoftmax{}.SolverDbId()); - + Register(registry, ++id, Primitive::Reduce, reduce::ArgminForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MaxForward{}.SolverDbId()); Register(registry, ++id, Primitive::Reduce, reduce::MinForward{}.SolverDbId()); @@ -672,7 +672,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Primitive::Fusion, fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); - + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); From 45ed5c12ab5ceb7c1ccfe4565d953ed4176a8603 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 03:47:35 +0000 Subject: [PATCH 34/46] update tensor_view and kernel code --- src/include/miopen/tensor_view_utils.hpp | 2 +- src/kernels/MIOpenUnfold.cpp | 8 ++--- src/kernels/tensor_view.hpp | 38 +++++------------------- 3 files changed, 13 insertions(+), 35 deletions(-) diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 4a7c0b51ad..4ac9196a8d 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -38,7 +38,7 @@ inline tensor_view_t get_inner_expanded_tv(const TensorDescriptor Desc) auto dims = Desc.GetLengths(); auto strides = Desc.GetStrides(); - 
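Note: moving the fold/unfold registrations to the tail of SolverRegistrar is not cosmetic. Solver ids are assigned by the ++id running counter in registration order, so inserting entries mid-function would renumber every solver registered after them and invalidate previously stored ids — hence the "New solvers should be added to the end of the function!" rule stated above.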
tensor_view_t tensor_view; + tensor_view_t tensor_view{}; for(size_t i = 0; i < N; ++i) { if(i < dims.size()) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp index a1c8cfd9f4..0e3f33e2f6 100644 --- a/src/kernels/MIOpenUnfold.cpp +++ b/src/kernels/MIOpenUnfold.cpp @@ -76,10 +76,10 @@ __device__ void unfoldForward4D(const DTYPE* __restrict__ input, DTYPE x = 0; if(0 <= h && h < H && 0 <= w && w < W) { - tensor_layout_t<4> input_layout(n, c, h, w); + tensor_layout_t<4> input_layout({n, c, h, w}); x = input[input_tv.get_tensor_view_idx(input_layout)]; } - tensor_layout_t<3> output_layout(n, c * P + p, l); + tensor_layout_t<3> output_layout({n, c * P + p, l}); output[output_tv.get_tensor_view_idx(output_layout)] = x; } @@ -180,12 +180,12 @@ __device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, if(lw < 0 || LW <= lw) continue; tensor_layout_t<3> output_grad_layout( - n, c * P + (ph * kernel_size_w + pw), lh * LW + lw); + {n, c * P + (ph * kernel_size_w + pw), lh * LW + lw}); sum += CVT_FLOAT2ACCUM( output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); } } - tensor_layout_t<4> input_grad_layout(n, c, h, w); + tensor_layout_t<4> input_grad_layout({n, c, h, w}); input_grad[input_grad_tv.get_tensor_view_idx(input_grad_layout)] = CVT_ACCUM2FLOAT(sum); } diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index 1b29099c2b..ecc075ac12 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -72,38 +74,14 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) - { - static_assert(N == 5); - layout[0] = n; - layout[1] = c; - layout[2] = d; - layout[3] = h; - layout[4] = w; - } - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) + constexpr tensor_layout_t(std::initializer_list layout_) { - static_assert(N == 4); - layout[0] = n; - layout[1] = c; - layout[2] = h; - layout[3] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) - { - static_assert(N == 3); - layout[0] = n; - layout[1] = h; - layout[2] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t w) - { - static_assert(N == 2); - layout[0] = n; - layout[1] = w; + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } } uint64_t layout[N]; From ba2020be6cb6b7cb0868e77adab34d72844ec29a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 03:48:02 +0000 Subject: [PATCH 35/46] githook format --- src/kernels/tensor_view.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index ecc075ac12..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -74,7 +74,6 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(std::initializer_list layout_) { static_assert(N > 0); From 87edcbd014915137ed737f784819e4a80b782e3b Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 08:48:51 +0000 Subject: [PATCH 36/46] remove duplicate miopen ops and update doc --- docs/reference/index.rst | 2 +- include/miopen/miopen.h | 701 --------------------------------------- 2 files changed, 1 insertion(+), 702 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index be56db13f9..a4cc9470a1 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -33,6 +33,6 @@ The 
MIOpen API library is structured as follows: * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`SGD <../doxygen/html/group___s_g_d>` (experimental) * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) - * :doc:`Fold <./group__fold>` (experimental) + * :doc:`Fold <./group___f_o_l_d>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index ee3585cbf7..7eab6a77c5 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7621,707 +7621,6 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API -#ifdef MIOPEN_BETA_API -// GetItem APIs -/** @addtogroup getitem - * - * @{ - */ -/*! @brief Helper function to query the minimum workspace size required by the getitem call - * - * @param [in] handle MIOpen Handle - * @param [in] indexCount Number of input tensor indexs - * @param [in] indexDescs Tensor descriptor of input tensor indexs - * @param [out] sizeInBytes Pointer to data to return the minimum workspace size - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenGetGetitemWorkspaceSize(miopenHandle_t handle, - uint32_t indexCount, - const miopenTensorDescriptor_t* indexDescs, - size_t* sizeInBytes); - -/*! @brief Execute a getitem backward layer - * - * Backward of getitem for tensor indexing, slicing, masking. - * - * @param [in] handle MIOpen handle - * @param [in] workspace Address of the allocated workspace data - * @param [in] workspaceSizeInBytes Size in bytes of the allocated workspace data - * @param [in] dyDesc Tensor descriptor of input tensor dy - * @param [in] dy Source data tensor dy - * @param [in] indexCount Number of input tensor indexs - * @param [in] indexDescs Tensor descriptor of input tensor indexs(All indexs same - * size) - * @param [in] indexs Source data tensor indexs - * @param [in] dxDesc Tensor descriptor of output tensor dx - * @param [out] dx Data tensor dx(It must be initialized to 0) - * @param [in] errorDesc Tensor descriptor of output tensor error - * @param [out] error Data tensor error(It must be initialized to 0) - * @param [in] dimCount Number of dimensions - * @param [in] dims Dimensions - * @param [in] sliceCount Number of slices - * @param [in] slices Slices - * @param [in] offset Offset of output tensor dx - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, - void* workspace, - size_t workspaceSizeInBytes, - const miopenTensorDescriptor_t dyDesc, - const void* dy, - uint32_t indexCount, - const miopenTensorDescriptor_t* indexDescs, - const void* const* indexs, - const miopenTensorDescriptor_t dxDesc, - void* dx, - const miopenTensorDescriptor_t errorDesc, - void* error, - uint32_t dimCount, - const int32_t* dims, - uint32_t sliceCount, - const int32_t* slices, - uint32_t offset); - -/** @} */ -// CLOSEOUT GETITEM DOXYGEN GROUP -#endif // MIOPEN_BETA_API - -#ifdef MIOPEN_BETA_API -// FusedAdam APIs -/** @addtogroup SGD - * - * @{ - */ -/*! @brief Perform Fused Adam optimization for a single tensor (Adaptive Moment Estimation). - * - * This function implements the Fused Adam optimization algorithm. Adam, short for Adaptive Moment - * Estimation, extends the RMSProp optimizer. 
It combines the advantages of AdaGrad and RMSProp by - * adaptively adjusting learning rates for each parameter using the first and second moments of - * gradients. Fused Adam optimization efficiently combines multiple operations into a single kernel, - * reducing memory access overhead and improving performance. - * - * Additionally, Fused Adam can be utilized in both adam w and Automatic Mixed Precision (AMP), - * enabling accelerated model training and reduced memory consumption. AMP supports FP16 - * computation, optimizing model calculations using a mixture of FP32 and FP16 precision to enhance - * training speed. When utilizing AMP, FoundInf, ScaleGrad, and step tensors should be employed. In - * AMP mode, the execution of Adam is determined based on the FoundInf value. State Step accepts - * both int values and int tensors. If a Step tensor is employed, the step received as an int is - * disregarded, and if Adam is executed, the step tensor is incremented by 1. - * - * @code - * // Execute Adam - * miopenFusedAdam(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused maxExpAvgSqDesc because amsgrad is false - * NULL, - * NULL, // Unused stateStep Tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute AdamW - * miopenFusedAdam(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused maxExpAvgSqDesc because amsgrad is false - * NULL, - * NULL, // Unused stateStep Tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * true, // adamw - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute AMP Adam - * miopenFusedAdam(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused maxExpAvgSqDesc because amsgrad is false - * NULL, - * stateStepDesc, - * stateStep, - * -1, // Ignore step value because stateStep Tensor is used - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * gradScaleDesc, - * gradScale, - * foundInfDesc, - * foundInf); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramDesc Tensor descriptor for the input parameter tensor (input) - * @param param Input parameter tensor (input) - * @param gradDesc Tensor descriptor for the input gradient tensor (input) - * @param grad Input gradient tensor (input) - * @param expAvgDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvg Input exponential moving average tensor (input) - * @param expAvgSqDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSq Input exponential moving average squared tensor (input) - * @param maxExpAvgSqDesc Tensor descriptor for the input maximum exponential moving average - * squared tensor. Used when amsgrad is true (input, optional) - * @param maxExpAvgSq Input maximum exponential moving average squared tensor. 
Used when - * amsgrad is true (input, optional) - * @param stateStepDesc Tensor descriptor for the input state step tensor (input) - * @param stateStep Input state step tensor (input) - * @param state_step Input state step. used when the step tensor is null (input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input) - * @param maximize Flag indicating whether to maximize the objective with respect to the - * parameters (input) - * @param adamw If true, the operation becomes AdamW (input) - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating the presence of inf or NaN in gradients. If true, - * skips operation and step update (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenFusedAdam(miopenHandle_t handle, - const miopenTensorDescriptor_t paramDesc, - void* param, - const miopenTensorDescriptor_t gradDesc, - const void* grad, - const miopenTensorDescriptor_t expAvgDesc, - void* expAvg, - const miopenTensorDescriptor_t expAvgSqDesc, - void* expAvgSq, - const miopenTensorDescriptor_t maxExpAvgSqDesc, - void* maxExpAvgSq, - const miopenTensorDescriptor_t stateStepDesc, - void* stateStep, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const bool amsgrad, - const bool maximize, - const bool adamw, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/*! @brief Execute single tensor Adam optimization and receive the result in a separate output - * tensor. - * - * This function is equivalent to miopenFusedAdam but receives the result in a separate output - * tensor. 
- * @see miopenFusedAdam - * - * @code - * // Execute Adam - * miopenFusedAdamWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * NULL, // Unused paramOutFloat16 tensor because is not amp - * NULL, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * NULL, // Unused maxExpAvgSqIn tensor because amsgrad is false - * NULL, - * NULL, // Unused maxExpAvgSqOut tensor because amsgrad is false - * NULL, - * NULL, // Unused stateStepIn tensor because use step integer argument - * NULL, - * NULL, // Unused stateStepOut tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute Amp Adam - * miopenFusedAdamWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * paramOutFloat16Desc, // paramOutFloat16 tensor is optional in amp - * paramOutFloat16, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * NULL, // Unused maxExpAvgSqIn tensor because amsgrad is false - * NULL, - * NULL, // Unused maxExpAvgSqOut tensor because amsgrad is false - * NULL, - * stateStepInDesc, - * stateStepIn, - * stateStepOutDesc, - * stateStepOut - * -1, // Ignore step value because stateStep Tensor is used - * lr, beta1, beta2, weight_decay, eps, - * false, // amsgrad - * false, // maximize - * false, // adamw - * gradScaleDesc, - * gradScale, - * foundInfDesc, - * foundInf); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramInDesc Tensor descriptor for the input parameter tensor (input) - * @param paramIn Input parameter tensor (input) - * @param paramOutDesc Tensor descriptor for the output parameter tensor (input) - * @param paramOut Output parameter tensor (output) - * @param paramOutFloat16Desc Tensor descriptor for the output parameter tensor float16 (input, - * optional) - * @param paramOutFloat16 Output parameter tensor (output, optional) - * @param gradInDesc Tensor descriptor for the input gradient tensor (input) - * @param gradIn Input gradient tensor (input) - * @param expAvgInDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvgIn Input exponential moving average tensor (input) - * @param expAvgOutDesc Tensor descriptor for the output exponential moving average tensor - * (input) - * @param expAvgOut Output exponential moving average tensor (output) - * @param expAvgSqInDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSqIn Input exponential moving average squared tensor (input) - * @param expAvgSqOutDesc Tensor descriptor for the output exponential moving average squared - * tensor (input) - * @param expAvgSqOut Output exponential moving average squared tensor (output) - * @param maxExpAvgSqInDesc Tensor descriptor for the input maximum exponential moving average - * squared tensor. Used when amsgrad is true (input, optional) - * @param maxExpAvgSqIn Input maximum exponential moving average squared tensor. 
Used when - * amsgrad is true (input, optional) - * @param maxExpAvgSqOutDesc Tensor descriptor for the output maximum exponential moving average - * squared tensor. Used when amsgrad is true (input, optional) - * @param maxExpAvgSqOut Output maximum exponential moving average squared tensor. Used when - * amsgrad is true (output, optional) - * @param stateStepInDesc Tensor descriptor for the input state step tensor (input, optional) - * @param stateStepIn Input state step tensor (input, optional) - * @param stateStepOutDesc Tensor descriptor for the output state step tensor (input, optional) - * @param stateStepOut Output state step tensor that stores the updated step value. (output, - * optional) - * @param state_step Input state step, It is used when the step tensor is null. (input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param amsgrad Flag indicating whether to use the AMSGrad variant of Adam (input) - * @param maximize Flag indicating whether to maximize the objective with respect to the - * parameters (input) - * @param adamw If it is true, the operation becomes AdamW (input) - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating presence of inf or nan in gradients. If true, skips - * operation and step update. (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenFusedAdamWithOutput(miopenHandle_t handle, - const miopenTensorDescriptor_t paramInDesc, - void* paramIn, - const miopenTensorDescriptor_t paramOutDesc, - void* paramOut, - const miopenTensorDescriptor_t paramOutFloat16Desc, - void* paramOutFloat16, - const miopenTensorDescriptor_t gradInDesc, - const void* gradIn, - const miopenTensorDescriptor_t expAvgInDesc, - void* expAvgIn, - const miopenTensorDescriptor_t expAvgOutDesc, - void* expAvgOut, - const miopenTensorDescriptor_t expAvgSqInDesc, - void* expAvgSqIn, - const miopenTensorDescriptor_t expAvgSqOutDesc, - void* expAvgSqOut, - const miopenTensorDescriptor_t maxExpAvgSqInDesc, - void* maxExpAvgSqIn, - const miopenTensorDescriptor_t maxExpAvgSqOutDesc, - void* maxExpAvgSqOut, - const miopenTensorDescriptor_t stateStepInDesc, - void* stateStepIn, - const miopenTensorDescriptor_t stateStepOutDesc, - void* stateStepOut, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const bool amsgrad, - const bool maximize, - const bool adamw, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/** @} */ -// CLOSEOUT SGD DOXYGEN GROUP -#endif // MIOPEN_BETA_API - -#ifdef MIOPEN_BETA_API -// TransformersAdamW APIs -/** @addtogroup SGD - * - * @{ - */ -/*! @brief Implements Adam algorithm with weight decay fix as introduced in - * Decoupled Weight Decay Regularization. - * This is the fused kernel version of AdamW included in the Hugging Face Transformers module. 
- * - * @see miopenFusedAdam - * - * @code - * // Execute Adam - * miopenTransformersAdamW(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * NULL, // Unused stateStep Tensor because use step integer argument - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * true, // correct_bias - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute AMP Adam - * miopenTransformersAdamW(handle, - * paramDesc, - * param, - * gradDesc, - * grad, - * expAvgDesc, - * expAvg, - * expAvgSqDesc, - * expAvgSq, - * stateStepDesc, - * stateStep, - * -1, // Ignore step value because stateStep Tensor is used - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * true, // correct_bias - * gradScaleDesc, - * gradScale, - * foundInfDesc, - * foundInf); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramDesc Tensor descriptor for the input parameter tensor (input) - * @param param Input parameter tensor (input) - * @param gradDesc Tensor descriptor for the input gradient tensor (input) - * @param grad Input gradient tensor (input) - * @param expAvgDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvg Input exponential moving average tensor (input) - * @param expAvgSqDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSq Input exponential moving average squared tensor (input) - * @param stateStepDesc Tensor descriptor for the input state step tensor (input) - * @param stateStep Input state step tensor (input) - * @param state_step Input state step. used when the step tensor is null (input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param correct_bias Whether or not to correct bias in Adam (for instance, in Bert TF - * repository they use False). - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating the presence of inf or NaN in gradients. If true, - * skips operation and step update (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t miopenTransformersAdamW(miopenHandle_t handle, - const miopenTensorDescriptor_t paramDesc, - void* param, - const miopenTensorDescriptor_t gradDesc, - const void* grad, - const miopenTensorDescriptor_t expAvgDesc, - void* expAvg, - const miopenTensorDescriptor_t expAvgSqDesc, - void* expAvgSq, - const miopenTensorDescriptor_t stateStepDesc, - void* stateStep, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const bool correct_bias, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/*! @brief Execute single tensor Adam optimization and receive the result in a separate output - * tensor. 
- * - * This function is equivalent to miopenTransformersAdam but receives the result in a separate - * output tensor. - * @see miopenTransformersAdamW - * @see miopenFusedAdamWithOutput - * - * @code - * // Execute Adam - * miopenTransformersAdamWWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * NULL, // Unused paramOutFloat16 tensor because is not amp - * NULL, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * NULL, // Unused stateStepIn tensor because use step int - * NULL, - * NULL, // Unused stateStepOut tensor because use step int - * NULL, - * step, - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * -1, // step_size - * true, // correct_bias - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * - * // Execute Amp Adam - * miopenTransformersAdamWWithOutput(handle, - * paramInDesc, - * paramIn, - * paramOutDesc, - * paramOut, - * paramOutFloat16Desc, // optional in amp - * paramOutFloat16, - * gradInDesc, - * gradIn, - * expAvgInDesc, - * expAvgIn, - * expAvgOutDesc, - * expAvgOut, - * expAvgInSqDesc, - * expAvgSqIn, - * expAvgSqIn, - * expAvgSqOutDesc, - * expAvgSqOut, - * stateStepInDesc, - * stateStepIn, - * stateStepOutDesc, - * stateStepOut - * -1, // Ignore step value because stateStep Tensor is used - * lr, - * beta1, - * beta2, - * weight_decay, - * eps, - * -1, // step_size - * true, // correct_bias - * NULL, // Unused gradScale Tensor because not amp - * NULL, - * NULL, // Unused foundInf Tensor because not amp - * NULL); - * @endcode - * - * @param handle MIOpen handle (input) - * @param paramInDesc Tensor descriptor for the input parameter tensor (input) - * @param paramIn Input parameter tensor (input) - * @param paramOutDesc Tensor descriptor for the output parameter tensor (input) - * @param paramOut Output parameter tensor (output) - * @param paramOutFloat16Desc Tensor descriptor for the output parameter tensor float16 (input, - * optional) - * @param paramOutFloat16 Output parameter tensor (output, optional) - * @param gradInDesc Tensor descriptor for the input gradient tensor (input) - * @param gradIn Input gradient tensor (input) - * @param expAvgInDesc Tensor descriptor for the input exponential moving average tensor - * (input) - * @param expAvgIn Input exponential moving average tensor (input) - * @param expAvgOutDesc Tensor descriptor for the output exponential moving average tensor - * (input) - * @param expAvgOut Output exponential moving average tensor (output) - * @param expAvgSqInDesc Tensor descriptor for the input exponential moving average squared - * tensor (input) - * @param expAvgSqIn Input exponential moving average squared tensor (input) - * @param expAvgSqOutDesc Tensor descriptor for the output exponential moving average squared - * tensor (input) - * @param expAvgSqOut Output exponential moving average squared tensor (output) - * @param stateStepInDesc Tensor descriptor for the input state step tensor (input, optional) - * @param stateStepIn Input state step tensor (input, optional) - * @param stateStepOutDesc Tensor descriptor for the output state step tensor (input, optional) - * @param stateStepOut Output state step tensor that stores the updated step value. (output, - * optional) - * @param state_step Input state step, It is used when the step tensor is null. 
(input) - * @param lr Learning rate (input) - * @param beta1 Coefficient used for computing the first moment running average of - * gradient (input) - * @param beta2 Coefficient used for computing the second moment running average of - * gradient (input) - * @param weight_decay Weight decay (input) - * @param eps Term added to the denominator to improve numerical stability (input) - * @param step_size Pre-calculated step_size, used for performance enhancement (input) - * @param correct_bias Whether or not to correct bias in Adam (for instance, in Bert TF - * repository they use False) (input) - * @param gradScaleDesc Tensor descriptor for the input grad scale tensor (input, optional) - * @param gradScale Input grad scale tensor (input, optional) - * @param foundInfDesc Tensor descriptor for the input found inf tensor (input, optional) - * @param foundInf Tensor indicating presence of inf or nan in gradients. If true, skips - * operation and step update. (input, optional) - * @return miopenStatus_t - */ -MIOPEN_EXPORT miopenStatus_t -miopenTransformersAdamWWithOutput(miopenHandle_t handle, - const miopenTensorDescriptor_t paramInDesc, - void* paramIn, - const miopenTensorDescriptor_t paramOutDesc, - void* paramOut, - const miopenTensorDescriptor_t paramOutFloat16Desc, - void* paramOutFloat16, - const miopenTensorDescriptor_t gradInDesc, - const void* gradIn, - const miopenTensorDescriptor_t expAvgInDesc, - void* expAvgIn, - const miopenTensorDescriptor_t expAvgOutDesc, - void* expAvgOut, - const miopenTensorDescriptor_t expAvgSqInDesc, - void* expAvgSqIn, - const miopenTensorDescriptor_t expAvgSqOutDesc, - void* expAvgSqOut, - const miopenTensorDescriptor_t stateStepInDesc, - void* stateStepIn, - const miopenTensorDescriptor_t stateStepOutDesc, - void* stateStepOut, - const unsigned int state_step, - const float lr, - const float beta1, - const float beta2, - const float weight_decay, - const float eps, - const float step_size, - const bool correct_bias, - const miopenTensorDescriptor_t gradScaleDesc, - const void* gradScale, - const miopenTensorDescriptor_t foundInfDesc, - const void* foundInf); - -/** @} */ -// CLOSEOUT SGD DOXYGEN GROUP -#endif // MIOPEN_BETA_API - #ifdef MIOPEN_BETA_API // Fold APIs /** @addtogroup FOLD From db7b9a8ff3ca7ef99076129f2c8fc5fbb4e04e7a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 5 Aug 2024 09:33:50 +0000 Subject: [PATCH 37/46] update spacing --- include/miopen/miopen.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 7eab6a77c5..fb0f7006db 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7643,7 +7643,7 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the * neighborhood (input) * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t + * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t inputDesc, @@ -7675,7 +7675,7 @@ MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the neighborhood (input) * @param dilation_size Size of the dilation array (input) -* @return miopenStatus_t +* @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t dinputDesc, @@ -7707,7 +7707,7 @@ MIOPEN_EXPORT 
miopenStatus_t miopenFoldBackward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the * neighborhood (input) * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t + * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, const miopenTensorDescriptor_t inputDesc, @@ -7739,7 +7739,7 @@ MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, * @param dilation Dilation array control the stride of the elements within the neighborhood (input) * @param dilation_size Size of the dilation array (input) - * @return miopenStatus_t + * @return miopenStatus_t */ MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, const miopenTensorDescriptor_t dinputDesc, From 857db5c5bfc3775a8574e0f7cfbf429137a7f98a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 03:54:39 +0000 Subject: [PATCH 38/46] empty commit From 0da1cc6a4fc4c38b027ac64bc14ca118de2c44b5 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 21:39:56 +0000 Subject: [PATCH 39/46] update gtest syntax --- test/gtest/fold.cpp | 48 ++++++++++++++++++++--------------------- test/gtest/unfold.cpp | 50 ++++++++++++++++++++++--------------------- 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index a07d0ea8d8..4de7f26e6a 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -42,34 +42,34 @@ std::string GetFloatArg() return tmp; } -struct FoldForwardTestFloat32 : FoldFwdTest +struct GPU_Fold_fwd_FP32 : FoldFwdTest { }; -struct FoldForwardTestFloat16 : FoldFwdTest +struct GPU_Fold_fwd_FP16 : FoldFwdTest { }; -struct FoldForwardTestBFloat16 : FoldFwdTest +struct GPU_Fold_fwd_BFP16 : FoldFwdTest { }; -struct FoldBackwardTestFloat32 : FoldBwdTest +struct GPU_Fold_bwd_FP32 : FoldBwdTest { }; -struct FoldBackwardTestFloat16 : FoldBwdTest +struct GPU_Fold_bwd_FP16 : FoldBwdTest { }; -struct FoldBackwardTestBFloat16 : FoldBwdTest +struct GPU_Fold_bwd_BFP16 : FoldBwdTest { }; }; // namespace fold using namespace fold; -TEST_P(FoldForwardTestFloat32, FoldForwardTest) +TEST_P(GPU_Fold_fwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -83,11 +83,11 @@ TEST_P(FoldForwardTestFloat32, FoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, - FoldForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_fwd_FP32, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldForwardTestFloat16, FoldForwardTest) +TEST_P(GPU_Fold_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -101,11 +101,11 @@ TEST_P(FoldForwardTestFloat16, FoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, - FoldForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_fwd_FP16, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldForwardTestBFloat16, FoldForwardTest) +TEST_P(GPU_Fold_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -119,11 +119,11 @@ TEST_P(FoldForwardTestBFloat16, FoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldForwardTestSet, - FoldForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_fwd_BFP16, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) +TEST_P(GPU_Fold_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || 
(env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -137,11 +137,11 @@ TEST_P(FoldBackwardTestFloat32, FoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, - FoldBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_bwd_FP32, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) +TEST_P(GPU_Fold_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -155,11 +155,11 @@ TEST_P(FoldBackwardTestFloat16, FoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, - FoldBackwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_bwd_FP16, testing::ValuesIn(FoldTestConfigs())); -TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) +TEST_P(GPU_Fold_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -173,6 +173,6 @@ TEST_P(FoldBackwardTestBFloat16, FoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(FoldBackwardTestSet, - FoldBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Fold_bwd_BFP16, testing::ValuesIn(FoldTestConfigs())); diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index 0c523b1b7f..8573e1573b 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -42,32 +42,34 @@ std::string GetFloatArg() return tmp; } -struct UnfoldForwardTestFloat32 : UnfoldFwdTest +struct GPU_Unfold_fwd_FP32 : UnfoldFwdTest { }; -struct UnfoldForwardTestFloat16 : UnfoldFwdTest +struct GPU_Unfold_fwd_FP16 : UnfoldFwdTest { }; -struct UnfoldForwardTestBFloat16 : UnfoldFwdTest +struct GPU_Unfold_fwd_BFP16 : UnfoldFwdTest { }; -struct UnfoldBackwardTestFloat32 : UnfoldBwdTest +struct GPU_Unfold_bwd_FP32 : UnfoldBwdTest { }; -struct UnfoldBackwardTestFloat16 : UnfoldBwdTest +struct GPU_Unfold_bwd_FP16 : UnfoldBwdTest { }; -struct UnfoldBackwardTestBFloat16 : UnfoldBwdTest +struct GPU_Unfold_bwd_BFP16 : UnfoldBwdTest { }; }; // namespace unfold + using namespace unfold; -TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) + +TEST_P(GPU_Unfold_fwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -81,11 +83,11 @@ TEST_P(UnfoldForwardTestFloat32, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_fwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) +TEST_P(GPU_Unfold_fwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -99,11 +101,11 @@ TEST_P(UnfoldForwardTestFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_fwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) +TEST_P(GPU_Unfold_fwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -117,11 +119,11 @@ TEST_P(UnfoldForwardTestBFloat16, UnfoldForwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldForwardTestSet, - UnfoldForwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_fwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) +TEST_P(GPU_Unfold_bwd_FP32, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && 
env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) @@ -135,11 +137,11 @@ TEST_P(UnfoldBackwardTestFloat32, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat32, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_bwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) +TEST_P(GPU_Unfold_bwd_FP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) @@ -153,11 +155,11 @@ TEST_P(UnfoldBackwardTestFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_bwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); -TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) +TEST_P(GPU_Unfold_bwd_BFP16, Test) { if(!MIOPEN_TEST_ALL || (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) @@ -171,6 +173,6 @@ TEST_P(UnfoldBackwardTestBFloat16, UnfoldBackwardTest) } }; -INSTANTIATE_TEST_SUITE_P(UnfoldBackwardTestSet, - UnfoldBackwardTestBFloat16, +INSTANTIATE_TEST_SUITE_P(Full, + GPU_Unfold_bwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); From 879c5c77b675c2bd03944f9912104a677ffbd2b9 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 6 Aug 2024 21:41:48 +0000 Subject: [PATCH 40/46] githook format --- test/gtest/fold.cpp | 24 ++++++------------------ test/gtest/unfold.cpp | 24 ++++++------------------ 2 files changed, 12 insertions(+), 36 deletions(-) diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp index 4de7f26e6a..b3868bb0ec 100644 --- a/test/gtest/fold.cpp +++ b/test/gtest/fold.cpp @@ -83,9 +83,7 @@ TEST_P(GPU_Fold_fwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_fwd_FP32, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_FP32, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_fwd_FP16, Test) { @@ -101,9 +99,7 @@ TEST_P(GPU_Fold_fwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_fwd_FP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_FP16, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_fwd_BFP16, Test) { @@ -119,9 +115,7 @@ TEST_P(GPU_Fold_fwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_fwd_BFP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_BFP16, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_bwd_FP32, Test) { @@ -137,9 +131,7 @@ TEST_P(GPU_Fold_bwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_bwd_FP32, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_FP32, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_bwd_FP16, Test) { @@ -155,9 +147,7 @@ TEST_P(GPU_Fold_bwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_bwd_FP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_FP16, testing::ValuesIn(FoldTestConfigs())); TEST_P(GPU_Fold_bwd_BFP16, Test) { @@ -173,6 +163,4 @@ TEST_P(GPU_Fold_bwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Fold_bwd_BFP16, - testing::ValuesIn(FoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_BFP16, testing::ValuesIn(FoldTestConfigs())); diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp index 8573e1573b..a5aead6d10 100644 --- a/test/gtest/unfold.cpp +++ b/test/gtest/unfold.cpp @@ -83,9 +83,7 @@ TEST_P(GPU_Unfold_fwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - 
GPU_Unfold_fwd_FP32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_fwd_FP16, Test) { @@ -101,9 +99,7 @@ TEST_P(GPU_Unfold_fwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_fwd_FP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_fwd_BFP16, Test) { @@ -119,9 +115,7 @@ TEST_P(GPU_Unfold_fwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_fwd_BFP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_bwd_FP32, Test) { @@ -137,9 +131,7 @@ TEST_P(GPU_Unfold_bwd_FP32, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_bwd_FP32, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_bwd_FP16, Test) { @@ -155,9 +147,7 @@ TEST_P(GPU_Unfold_bwd_FP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_bwd_FP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); TEST_P(GPU_Unfold_bwd_BFP16, Test) { @@ -173,6 +163,4 @@ TEST_P(GPU_Unfold_bwd_BFP16, Test) } }; -INSTANTIATE_TEST_SUITE_P(Full, - GPU_Unfold_bwd_BFP16, - testing::ValuesIn(UnfoldTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); From 7d07012d12a93ea0c6b0482efe661f10e62a6367 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Mon, 12 Aug 2024 03:07:48 +0000 Subject: [PATCH 41/46] githook format --- driver/driver.hpp | 23 ++++++++++++----------- src/solver.cpp | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/driver/driver.hpp b/driver/driver.hpp index c37f3be25a..844101230a 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,14 +169,15 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], unfold[bfp16|fp16], " - "fold[bfp16|fp16]\n"); + printf( + "Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " + "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " + "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -208,9 +209,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "transformersadamwfp16" && arg != 
"transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && - arg != "ropefp16" && arg != "ropebfp16" && arg != "unfold" && - arg != "unfoldfp16" && arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && - arg != "foldbfp16" && arg != "--version") + arg != "ropefp16" && arg != "ropebfp16" && arg != "unfold" && arg != "unfoldfp16" && + arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/src/solver.cpp b/src/solver.cpp index 7ee13a51f9..ae443f29d5 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -676,7 +676,7 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); - + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); Register(registry, ++id, Primitive::Fold, fold::FoldFwd{}.SolverDbId()); From 85c1ee04b6737e6ac02251ae836b2a4c4f3cb101 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 16:40:04 +0000 Subject: [PATCH 42/46] add not contiguous test cases for fold and unfold --- test/gtest/fold.hpp | 22 ++++++++++++++-------- test/gtest/unfold.hpp | 15 ++++++++++++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 02d9e42e17..9c7a9874eb 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -23,7 +23,6 @@ * SOFTWARE. * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" @@ -123,6 +122,13 @@ std::vector FoldTestConfigs() {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, true}, {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 12}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 13}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, false}, }; // clang-format: on } @@ -197,13 +203,10 @@ struct FoldFwdTest : public ::testing::TestWithParam // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
if(std::is_same::value) tolerance *= 8.0; - for(int i = 0; i < 10; ++i) - { - std::cout << "output[" << i << "]: " << output[i] << " ~ " << outputHost[i] - << std::endl; - } auto error_output = miopen::rms_range(outputHost, output); - EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" + ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); + + EXPECT_LT(error_output, tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } FoldTestCase config; @@ -289,10 +292,13 @@ struct FoldBwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_dinput = miopen::rms_range(dinputHost, dinput); - EXPECT_TRUE(error_dinput < tolerance) + ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); + + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" << error_dinput << "}, Tolerance: " << tolerance; } + FoldTestCase config; tensor dinput; diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 4bb790b1f5..2abb1daca4 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -23,7 +23,6 @@ * SOFTWARE. * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_unfold.hpp" #include "get_handle.hpp" #include "miopen/allocator.hpp" @@ -32,7 +31,6 @@ #include "verify.hpp" #include #include -#include #include #include #include @@ -121,6 +119,12 @@ std::vector UnfoldTestConfigs() {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true}, {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true}, {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true}, + {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, false}, + {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, false}, + {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, false}, }; // clang-format: on } @@ -204,9 +208,12 @@ struct UnfoldFwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_output = miopen::rms_range(outputHost, output); + ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); + EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } + UnfoldTestCase config; tensor input; @@ -297,7 +304,9 @@ struct UnfoldBwdTest : public ::testing::TestWithParam if(std::is_same::value) tolerance *= 8.0; auto error_dinput = miopen::rms_range(dinputHost, dinput); - EXPECT_TRUE(error_dinput < tolerance) + ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); + + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" << error_dinput << "}, Tolerance: " << tolerance; } From 66e5dcb4b6c10fdb5dcb7647b94b52c95b1240f1 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 16:40:48 +0000 Subject: [PATCH 43/46] githook format --- test/gtest/fold.hpp | 9 ++++----- test/gtest/unfold.hpp | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 9c7a9874eb..3c63862fca 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -207,7 +207,7 @@ struct FoldFwdTest : public ::testing::TestWithParam 
ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); EXPECT_LT(error_output, tolerance) << "Error forward output beyond tolerance Error: {" - << error_output << "}, Tolerance: " << tolerance; + << error_output << "}, Tolerance: " << tolerance; } FoldTestCase config; @@ -294,11 +294,10 @@ struct FoldBwdTest : public ::testing::TestWithParam auto error_dinput = miopen::rms_range(dinputHost, dinput); ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); - EXPECT_LT(error_dinput, tolerance) - << "Error backward input_grad beyond tolerance Error: {" << error_dinput - << "}, Tolerance: " << tolerance; + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; } - + FoldTestCase config; tensor dinput; diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 2abb1daca4..63d859f7f0 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -209,7 +209,7 @@ struct UnfoldFwdTest : public ::testing::TestWithParam tolerance *= 8.0; auto error_output = miopen::rms_range(outputHost, output); ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); - + EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" << error_output << "}, Tolerance: " << tolerance; } @@ -306,9 +306,8 @@ struct UnfoldBwdTest : public ::testing::TestWithParam auto error_dinput = miopen::rms_range(dinputHost, dinput); ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); - EXPECT_LT(error_dinput, tolerance) - << "Error backward input_grad beyond tolerance Error: {" << error_dinput - << "}, Tolerance: " << tolerance; + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; } UnfoldTestCase config; From 918091d242ad9cad4eedd5696c425b6009d7b38a Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 17:01:39 +0000 Subject: [PATCH 44/46] remove /*context*/ for solver --- src/solver/fold/fold_backward.cpp | 2 +- src/solver/fold/fold_forward.cpp | 2 +- src/solver/fold/unfold_backward.cpp | 2 +- src/solver/fold/unfold_forward.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp index b952e5375c..edfa5649a0 100644 --- a/src/solver/fold/fold_backward.cpp +++ b/src/solver/fold/fold_backward.cpp @@ -43,7 +43,7 @@ namespace solver { namespace fold { bool FoldBwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::FoldBwdProblemDescription& problem) const { return true; diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp index 17fb11180c..585a21a0e2 100644 --- a/src/solver/fold/fold_forward.cpp +++ b/src/solver/fold/fold_forward.cpp @@ -44,7 +44,7 @@ namespace solver { namespace fold { bool FoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::FoldFwdProblemDescription& problem) const { return true; diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp index da11969c64..b9b49c2799 100644 --- a/src/solver/fold/unfold_backward.cpp +++ b/src/solver/fold/unfold_backward.cpp @@ -43,7 +43,7 @@ namespace solver { namespace fold { bool UnfoldBwd::IsApplicable( - [[maybe_unused]] 
const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::UnfoldBwdProblemDescription& problem) const { return true; diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp index 54e39809d6..154c51c2fc 100644 --- a/src/solver/fold/unfold_forward.cpp +++ b/src/solver/fold/unfold_forward.cpp @@ -43,7 +43,7 @@ namespace solver { namespace fold { bool UnfoldFwd::IsApplicable( - [[maybe_unused]] const ExecutionContext& /*context*/, + [[maybe_unused]] const ExecutionContext&, [[maybe_unused]] const miopen::fold::UnfoldFwdProblemDescription& problem) const { return true; From 901d7b32448be557eeca4d3001ea648a71b477f0 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 17:02:32 +0000 Subject: [PATCH 45/46] remove gen_one --- test/gtest/fold.hpp | 2 -- test/gtest/unfold.hpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index 3c63862fca..e97758f720 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -146,7 +146,6 @@ struct FoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; input = tensor{in_dims, in_strides}.generate(gen_value); const int32_t N = static_cast(in_dims[0]); @@ -233,7 +232,6 @@ struct FoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; dinput = tensor{in_dims, in_strides}.generate(gen_zero); dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp index 63d859f7f0..78941443e9 100644 --- a/test/gtest/unfold.hpp +++ b/test/gtest/unfold.hpp @@ -142,7 +142,6 @@ struct UnfoldFwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; input = tensor{in_dims, in_strides}.generate(gen_value); @@ -238,7 +237,6 @@ struct UnfoldBwdTest : public ::testing::TestWithParam std::vector in_strides = config.ComputeStrides(in_dims); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; - [[maybe_unused]] auto gen_one = [&](auto...) { return 1; }; auto gen_zero = [&](auto...) { return 0; }; dinput = tensor{in_dims, in_strides}.generate(gen_zero); dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); From 89562483bfa20bc5e769b4633b1ccb26e1b8c2b4 Mon Sep 17 00:00:00 2001 From: Duong Le Date: Tue, 13 Aug 2024 17:02:52 +0000 Subject: [PATCH 46/46] githook format --- test/gtest/fold.hpp | 16 ++++++++-------- test/gtest/unfold.hpp | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp index e97758f720..f43f30f346 100644 --- a/test/gtest/fold.hpp +++ b/test/gtest/fold.hpp @@ -145,11 +145,11 @@ struct FoldFwdTest : public ::testing::TestWithParam std::vector in_dims = config.GetInput(); std::vector in_strides = config.ComputeStrides(in_dims); - auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    input          = tensor<T>{in_dims, in_strides}.generate(gen_value);
-    const int32_t N = static_cast<int32_t>(in_dims[0]);
-    int32_t C       = static_cast<int32_t>(in_dims[1]);
+    auto gen_value  = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
+    auto gen_zero   = [&](auto...) { return 0; };
+    input           = tensor<T>{in_dims, in_strides}.generate(gen_value);
+    const int32_t N = static_cast<int32_t>(in_dims[0]);
+    int32_t C       = static_cast<int32_t>(in_dims[1]);
     for(int32_t i : config.kernelSize)
     {
         C = C / i;
@@ -232,9 +232,9 @@ struct FoldBwdTest : public ::testing::TestWithParam<FoldTestCase>
     std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
 
     auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    dinput         = tensor<T>{in_dims, in_strides}.generate(gen_zero);
-    dinputHost     = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    auto gen_zero   = [&](auto...) { return 0; };
+    dinput          = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    dinputHost      = tensor<T>{in_dims, in_strides}.generate(gen_zero);
     const int32_t N = static_cast<int32_t>(in_dims[0]);
     int32_t C       = static_cast<int32_t>(in_dims[1]);
diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp
index 78941443e9..2631722400 100644
--- a/test/gtest/unfold.hpp
+++ b/test/gtest/unfold.hpp
@@ -142,8 +142,8 @@ struct UnfoldFwdTest : public ::testing::TestWithParam<UnfoldTestCase>
     std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
 
     auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    input          = tensor<T>{in_dims, in_strides}.generate(gen_value);
+    auto gen_zero = [&](auto...) { return 0; };
+    input         = tensor<T>{in_dims, in_strides}.generate(gen_value);
     int spatial_dim_size = in_dims.size() - 2;
     const int32_t N      = static_cast<int32_t>(in_dims[0]);
@@ -237,9 +237,9 @@ struct UnfoldBwdTest : public ::testing::TestWithParam<UnfoldTestCase>
     std::vector<size_t> in_strides = config.ComputeStrides(in_dims);
 
     auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign<T>(1e-2, 100); };
-    auto gen_zero  = [&](auto...) { return 0; };
-    dinput         = tensor<T>{in_dims, in_strides}.generate(gen_zero);
-    dinputHost     = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    auto gen_zero = [&](auto...) { return 0; };
+    dinput        = tensor<T>{in_dims, in_strides}.generate(gen_zero);
+    dinputHost    = tensor<T>{in_dims, in_strides}.generate(gen_zero);
     int spatial_dim_size = in_dims.size() - 2;
     const int32_t N      = static_cast<int32_t>(in_dims[0]);
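Across the drivers, kernels, and test fixtures in this series, the unfold output shape follows the usual im2col rule: a (N, C, H, W) input maps to (N, C * prod(kernel_size), L), with one column per sliding-window position. A self-contained sketch of that arithmetic, mirroring the P and L computations in the unfold kernels and fixtures above (the function name and signature are illustrative, not part of the MIOpen API):

    #include <cstdint>
    #include <vector>

    std::vector<size_t> UnfoldOutputDims(const std::vector<size_t>& in, // {N, C, H, W, ...}
                                         const std::vector<int32_t>& kernel,
                                         const std::vector<int32_t>& stride,
                                         const std::vector<int32_t>& padding,
                                         const std::vector<int32_t>& dilation)
    {
        size_t P = 1; // elements per sliding block
        size_t L = 1; // number of block positions
        for(size_t i = 0; i < kernel.size(); ++i)
        {
            P *= static_cast<size_t>(kernel[i]);
            const int32_t spatial = static_cast<int32_t>(in[i + 2]);
            // Standard sliding-window count per spatial dimension.
            const int32_t l =
                (spatial + 2 * padding[i] - dilation[i] * (kernel[i] - 1) - 1) / stride[i] + 1;
            L *= static_cast<size_t>(l);
        }
        return {in[0], in[1] * P, L};
    }

For the first UnfoldTestConfigs entry, a {2, 5, 3, 4} input with kernel {2, 3}, unit stride and dilation, and no padding, this gives P = 6, L = 2 * 2, i.e. an output of {2, 30, 4}; fold is the inverse mapping and reuses the same arithmetic to size its input.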
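The Verify steps gate on miopen::rms_range: the pass criterion is an RMS distance between the host reference and the GPU result, with the tolerance widened by 8x for bfloat16 because its mantissa is 3 bits (a factor of 2^3) shorter than fp16's. A stand-in sketch of a normalized RMS criterion of this kind, for illustration only; it is not the exact miopen::rms_range implementation:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scale-free RMS error: root-mean-square of elementwise differences,
    // normalized by the largest reference magnitude.
    double RmsError(const std::vector<double>& ref, const std::vector<double>& got)
    {
        double sumSq = 0.0;
        double mag   = 0.0;
        for(std::size_t i = 0; i < ref.size(); ++i)
        {
            const double d = ref[i] - got[i];
            sumSq += d * d;
            mag = std::max(mag, std::fabs(ref[i]));
        }
        const double rms = std::sqrt(sumSq / static_cast<double>(ref.size()));
        return rms / std::max(mag, 1e-38); // guard against an all-zero reference
    }

The ASSERT_EQ on miopen::range_distance added in patch 42 runs before this comparison so that a shape mismatch fails loudly instead of skewing the RMS value.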