diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 2574dbbf5e..90f6cc3df8 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -33,6 +33,7 @@ The MIOpen API library is structured as follows: * :doc:`Cat <../doxygen/html/group__cat>` (experimental) * :doc:`SGD <../doxygen/html/group___s_g_d>` (experimental) * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) + * :doc:`Fold <../doxygen/html/group___f_o_l_d>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index 19abd61597..7e35ad803c 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -57,6 +57,8 @@ add_executable(MIOpenDriver dm_t5layernorm.cpp dm_tensorop.cpp dm_transformers_adam_w.cpp + dm_fold.cpp + dm_unfold.cpp main.cpp registry_driver_maker.cpp rocrand_wrapper.cpp) diff --git a/driver/dm_fold.cpp b/driver/dm_fold.cpp new file mode 100644 index 0000000000..d7a8e2cb9a --- /dev/null +++ b/driver/dm_fold.cpp @@ -0,0 +1,39 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "fold_driver.hpp" +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "fold") + return new FoldDriver(); + if(base_arg == "foldfp16") + return new FoldDriver(); + if(base_arg == "foldbfp16") + return new FoldDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/dm_unfold.cpp b/driver/dm_unfold.cpp new file mode 100644 index 0000000000..3d7ed56a91 --- /dev/null +++ b/driver/dm_unfold.cpp @@ -0,0 +1,39 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "unfold_driver.hpp" +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "unfold") + return new UnfoldDriver(); + if(base_arg == "unfoldfp16") + return new UnfoldDriver(); + if(base_arg == "unfoldbfp16") + return new UnfoldDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index 196d48c2b6..844101230a 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -169,13 +169,15 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) [[noreturn]] inline void Usage() { printf("Usage: ./driver *base_arg* *other_args*\n"); - printf("Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " - "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " - "tensorop, reduce[fp16|fp64], layernorm[bfp16|fp16], sum[bfp16|fp16], " - "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " - "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " - "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16]\n"); + printf( + "Supported Base Arguments: conv[fp16|int8|bfp16], pool[fp16], lrn[fp16], " + "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm[fp16], ctc, dropout[fp16], " + "tensorop, reduce[fp16|fp64], 
layernorm[bfp16|fp16], sum[bfp16|fp16], " + "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " + "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " + "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], unfold[bfp16|fp16], " + "fold[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -207,7 +209,9 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && - arg != "ropefp16" && arg != "ropebfp16" && arg != "--version") + arg != "ropefp16" && arg != "ropebfp16" && arg != "unfold" && arg != "unfoldfp16" && + arg != "unfoldbfp16" && arg != "fold" && arg != "foldfp16" && arg != "foldbfp16" && + arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/fold_driver.hpp b/driver/fold_driver.hpp new file mode 100644 index 0000000000..c034beeaee --- /dev/null +++ b/driver/fold_driver.hpp @@ -0,0 +1,473 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in all
 + * copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_FOLD_DRIVER_HPP +#define GUARD_MIOPEN_FOLD_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloUnfoldHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include "util_driver.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class FoldDriver : public Driver +{ +public: + FoldDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&dinputDesc); + miopenCreateTensorDescriptor(&doutputDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetTensorLengthsFromCmdLine(); + std::vector GetVectorInt32tFromCmdLine(std::string long_name); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~FoldDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(dinputDesc); + miopenDestroyTensorDescriptor(doutputDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + + miopenTensorDescriptor_t doutputDesc; + miopenTensorDescriptor_t dinputDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + + std::unique_ptr doutput_dev; + std::unique_ptr dinput_dev; + + std::vector input; + std::vector output; + + std::vector doutput; + std::vector dinput; + + std::vector output_host; + + std::vector 
dinput_host; + + std::vector output_size; + std::vector kernel_size; + std::vector stride; + std::vector padding; + std::vector dilation; +}; + +template +int FoldDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int FoldDriver::GetandSetData() +{ + std::vector input_length = inflags.GetValueTensor("DimLengths").lengths; + + output_size = GetVectorInt32tFromCmdLine("outputSize"); + kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); + const int N = input_length[0]; + int C = input_length[1]; + for(int32_t i : kernel_size) + { + C = C / i; + } + + std::vector output_length = {N, C, output_size[0], output_size[1]}; + SetTensorNd(inputDesc, input_length, data_type); + SetTensorNd(outputDesc, output_length, data_type); + SetTensorNd(dinputDesc, input_length, data_type); + SetTensorNd(doutputDesc, output_length, data_type); + + return miopenStatusSuccess; +} + +template +int FoldDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag( + "forw", 'F', "1", "Run Fold Forward (Default=1) or both Forward and Backward (0)", "int"); + inflags.AddTensorFlag( + "DimLengths", 'D', "3x12x12", "The dimensional lengths of the input tensor"); + inflags.AddInputFlag("outputSize", 'o', "4,5", "Output Size (Default=2,3)", "str"); + inflags.AddInputFlag("kernelSize", 'k', "2,2", "Kernel Size (Default=2,3)", "str"); + inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); + inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); + inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', 
"0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +std::vector FoldDriver::GetTensorLengthsFromCmdLine() +{ + std::string lengthsStr = inflags.GetValueStr("DimLengths"); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +std::vector FoldDriver::GetVectorInt32tFromCmdLine(std::string long_name) +{ + std::string lengthsStr = inflags.GetValueStr(long_name); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(static_cast(len)); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(static_cast(len)); + + return (lengths); +} + +template +int FoldDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + + size_t doutput_sz = GetTensorSize(doutputDesc); + size_t dinput_sz = GetTensorSize(dinputDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + + doutput_dev = 
std::unique_ptr(new GPUMem(ctx, doutput_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, dinput_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0.0f)); + output = std::vector(output_sz, static_cast(0.0f)); + + doutput = std::vector(doutput_sz, static_cast(1.0f)); + dinput = std::vector(dinput_sz, static_cast(0.0f)); + + output_host = std::vector(output_sz, static_cast(0.0f)); + + dinput_host = std::vector(dinput_sz, static_cast(0.0f)); + + int status; + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + status = input_dev->ToGPU(GetStream(), input.data()); + + for(int i = 0; i < doutput_sz; i++) + { + doutput[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + } + status |= doutput_dev->ToGPU(GetStream(), doutput.data()); + status |= dinput_dev->ToGPU(GetStream(), dinput.data()); + + if(status != 0) + std::cout << "Fold Driver Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenFoldForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Fold Forward Elapsed: " << t.gettime_ms() / iter << " ms" + << std::endl; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Fold Forward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunForwardCPU() +{ + mloUnFoldBwd4DRunHost(output_host.data(), + outputDesc, + input.data(), + inputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +int FoldDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenFoldBackward(GetHandle(), + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Fold Backward Elapsed: " << t.gettime_ms() / iter << " ms" + << std::endl; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Fold Backward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int FoldDriver::RunBackwardCPU() +{ + mloUnFoldFwd4DRunHost(doutput.data(), + doutputDesc, + dinput_host.data(), + dinputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +Tref FoldDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int FoldDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_output = miopen::rms_range(output_host, output); + + if(!std::isfinite(error_output) || error_output > tolerance) + { + std::cout << "Forward Fold FAILED: {" << error_output << "} > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward Fold Verifies OK on CPU reference ({" << error_output << "} < " + << tolerance << ')' << std::endl; + } + return miopenStatusSuccess; +} + +template +int FoldDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward Fold FAILED: {" << error_dinput << "} > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Backward Fold Verifies OK on CPU reference ({" << error_dinput << "} < " + << tolerance << ')' << std::endl; + } + 
return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_FOLD_DRIVER_HPP diff --git a/driver/mloUnfoldHost.hpp b/driver/mloUnfoldHost.hpp new file mode 100644 index 0000000000..6204ea2bd9 --- /dev/null +++ b/driver/mloUnfoldHost.hpp @@ -0,0 +1,187 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include <../test/ford.hpp> +#include "tensor_view.hpp" +#include "miopen/tensor_view_utils.hpp" +#include +#include + +template +int32_t mloUnFoldFwd4DRunHost(Tgpu* input, + const miopenTensorDescriptor_t inputDesc, + Tcheck* ref_output, + const miopenTensorDescriptor_t ref_outputDesc, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) +{ + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(ref_outputDesc)); + auto input_dims = miopen::deref(inputDesc).GetLengths(); + auto input_size = miopen::deref(inputDesc).GetNumDims(); + + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - 
padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; + + Tgpu x = static_cast(0.0f); + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } + + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + ref_output[output_idx] = static_cast(x); + }); + + return miopenStatusSuccess; +} + +template +int32_t mloUnFoldBwd4DRunHost(Tcheck* ref_dinput, + const miopenTensorDescriptor_t dinputDesc, + Tgpu* doutput, + const miopenTensorDescriptor_t doutputDesc, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) +{ + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(dinputDesc)); + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(miopen::deref(doutputDesc)); + auto input_grad_dims = miopen::deref(dinputDesc).GetLengths(); + auto input_size = miopen::deref(dinputDesc).GetNumDims(); + + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t P = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + ls.push_back(l); + } + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H * W) + 
LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + float sum = 0.0f; + + for(int ph = 0; ph < kernel_size_h; ++ph) + { + for(int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = + output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; + sum += static_cast(doutput[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + ref_dinput[input_grad_idx] = static_cast(sum); + }); + + return miopenStatusSuccess; +} diff --git a/driver/unfold_driver.hpp b/driver/unfold_driver.hpp new file mode 100644 index 0000000000..8e4be26f3f --- /dev/null +++ b/driver/unfold_driver.hpp @@ -0,0 +1,479 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ *
 + * Permission is hereby granted, free of charge, to any person obtaining a copy
 + * of this software and associated documentation files (the "Software"), to deal
 + * in the Software without restriction, including without limitation the rights
 + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 + * copies of the Software, and to permit persons to whom the Software is
 + * furnished to do so, subject to the following conditions:
 + *
 + * The above copyright notice and this permission notice shall be included in all
 + * copies or substantial portions of the Software.
 + *
 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_UNFOLD_DRIVER_HPP +#define GUARD_MIOPEN_UNFOLD_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloUnfoldHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" +#include "util_driver.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class UnfoldDriver : public Driver +{ +public: + UnfoldDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&dinputDesc); + miopenCreateTensorDescriptor(&doutputDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + int GetandSetData() override; + std::vector GetTensorLengthsFromCmdLine(); + std::vector GetVectorInt32tFromCmdLine(std::string long_name); + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~UnfoldDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(dinputDesc); + miopenDestroyTensorDescriptor(doutputDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + + miopenTensorDescriptor_t doutputDesc; + miopenTensorDescriptor_t dinputDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + + std::unique_ptr doutput_dev; + std::unique_ptr dinput_dev; + + std::vector input; + std::vector output; + + std::vector doutput; + std::vector dinput; + + std::vector output_host; + + 
std::vector dinput_host; + + std::vector kernel_size; + std::vector stride; + std::vector padding; + std::vector dilation; +}; + +template +int UnfoldDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +int UnfoldDriver::GetandSetData() +{ + std::vector input_length = inflags.GetValueTensor("DimLengths").lengths; + + kernel_size = GetVectorInt32tFromCmdLine("kernelSize"); + stride = GetVectorInt32tFromCmdLine("stride"); + padding = GetVectorInt32tFromCmdLine("padding"); + dilation = GetVectorInt32tFromCmdLine("dilation"); + int spatial_dim_size = input_length.size() - 2; + const int N = input_length[0]; + const int C = input_length[1]; + + int P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int l = (input_length[i + 2] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector output_length = {N, (C * P), L}; + SetTensorNd(inputDesc, input_length, data_type); + SetTensorNd(outputDesc, output_length, data_type); + SetTensorNd(dinputDesc, input_length, data_type); + SetTensorNd(doutputDesc, output_length, data_type); + + return miopenStatusSuccess; +} + +template +int UnfoldDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag( + "forw", 'F', "1", "Run Unfold Forward (Default=1) or both Forward and Backward (0)", "int"); + inflags.AddTensorFlag( + "DimLengths", 'D', "2x5x3x4", "The dimensional lengths of the input tensor"); + inflags.AddInputFlag("kernelSize", 'k', "2,3", "Kernel Size (Default=2,3)", "str"); + inflags.AddInputFlag("stride", 's', "1,1", "Stride (Default=1,1)", "str"); + inflags.AddInputFlag("padding", 'p', "0,0", "Padding (Default=0,0)", "str"); + inflags.AddInputFlag("dilation", 'd', "1,1", "Dilation (Default=1,1)", "str"); + inflags.AddInputFlag("iter", 'i', "10", 
"Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "0", "Verify Each Layer (Default=0)", "int"); + inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time Each Layer, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +std::vector UnfoldDriver::GetTensorLengthsFromCmdLine() +{ + std::string lengthsStr = inflags.GetValueStr("DimLengths"); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +std::vector UnfoldDriver::GetVectorInt32tFromCmdLine(std::string long_name) +{ + std::string lengthsStr = inflags.GetValueStr(long_name); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(static_cast(len)); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(static_cast(len)); + + return (lengths); +} + +template +int UnfoldDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + + size_t doutput_sz = GetTensorSize(doutputDesc); + size_t dinput_sz = GetTensorSize(dinputDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + 
output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + + doutput_dev = std::unique_ptr(new GPUMem(ctx, doutput_sz, sizeof(Tgpu))); + dinput_dev = std::unique_ptr(new GPUMem(ctx, dinput_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0.0f)); + output = std::vector(output_sz, static_cast(0.0f)); + + doutput = std::vector(doutput_sz, static_cast(1.0f)); + dinput = std::vector(dinput_sz, static_cast(0.0f)); + + output_host = std::vector(output_sz, static_cast(0.0f)); + + dinput_host = std::vector(dinput_sz, static_cast(0.0f)); + + int status; + + for(int i = 0; i < input_sz; i++) + input[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + status = input_dev->ToGPU(GetStream(), input.data()); + + for(int i = 0; i < doutput_sz; i++) + { + doutput[i] = prng::gen_A_to_B(static_cast(0.0), static_cast(1.0)); + } + status |= doutput_dev->ToGPU(GetStream(), doutput.data()); + status |= dinput_dev->ToGPU(GetStream(), dinput.data()); + + if(status != 0) + std::cout << "Unfold Driver Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int UnfoldDriver::RunForwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenUnfoldForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Unfold Forward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Unfold Forward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + std::cerr << "Error copying (out_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int UnfoldDriver::RunForwardCPU() +{ + mloUnFoldFwd4DRunHost(input.data(), + inputDesc, + output_host.data(), + outputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +int UnfoldDriver::RunBackwardGPU() +{ + float kernel_total_time = 0; + float kernel_first_time = 0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenUnfoldBackward(GetHandle(), + dinputDesc, + dinput_dev->GetMem(), + doutputDesc, + doutput_dev->GetMem(), + kernel_size.data(), + kernel_size.size(), + stride.data(), + stride.size(), + padding.data(), + padding.size(), + dilation.data(), + dilation.size()); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + std::cout << "Wall-clock Time Unfold Backward Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + std::cout << "GPU Kernel Time Unfold Backward Elapsed: " << kernel_average_time << " ms" + << std::endl; + } + + if(dinput_dev->FromGPU(GetStream(), dinput.data()) != 0) + std::cerr << "Error copying (dinput_dev) from GPU, size: " << dinput_dev->GetSize() + << std::endl; + + return miopenStatusSuccess; +} + +template +int UnfoldDriver::RunBackwardCPU() +{ + mloUnFoldBwd4DRunHost(dinput_host.data(), + dinputDesc, + doutput.data(), + doutputDesc, + kernel_size, + stride, + padding, + dilation); + return miopenStatusSuccess; +} + +template +Tref UnfoldDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int UnfoldDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_output = miopen::rms_range(output_host, output); + + if(!std::isfinite(error_output) || error_output > tolerance) + { + std::cout << "Forward Unfold FAILED: {" << error_output << "} > " << tolerance << std::endl; + return EC_VerifyFwd; + } + else + { + std::cout << "Forward Unfold Verifies OK on CPU reference ({" << error_output << "} < " + << tolerance << ')' << std::endl; + } + return miopenStatusSuccess; +} + +template +int UnfoldDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error_dinput = miopen::rms_range(dinput_host, dinput); + + if(!std::isfinite(error_dinput) || error_dinput > tolerance) + { + std::cout << "Backward Unfold FAILED: {" << error_dinput << "} > " << tolerance + << std::endl; + return EC_VerifyBwd; + } + else + { + std::cout << "Backward Unfold Verifies OK on CPU reference ({" << error_dinput << "} < " + << tolerance << ')' <<
std::endl; + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_UNFOLD_DRIVER_HPP diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 6c66f04867..8a6a30177e 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7622,6 +7622,144 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// Fold APIs +/** @addtogroup FOLD + * + * @{ + */ +/*! @brief Execute a fold forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for data input tensor input (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for data output tensor output (input) + * @param output Data tensor output (output) + * @param kernel_size Size of the sliding box array (input) + * @param kernel_size_size Size of the kernel_size array (input) + * @param stride Stride array of the sliding box (input) + * @param stride_size Size of the stride array (input) + * @param padding Padding array to be added on input (input) + * @param padding_size Size of the padding array (input) + * @param dilation Dilation array control the stride of the elements within the + * neighborhood (input) + * @param dilation_size Size of the dilation array (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenFoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size); + +/*!
@brief Execute a fold backward layer + * + * @param handle MIOpen handle (input) + * @param dinputDesc Tensor descriptor for data input grad tensor (input) + * @param dinput Data tensor input grad (output) + * @param doutputDesc Tensor descriptor for data output grad tensor (input) + * @param doutput Data tensor output grad (input) + * @param kernel_size Size of the sliding box array (input) + * @param kernel_size_size Size of the kernel_size array (input) + * @param stride Stride array of the sliding box (input) + * @param stride_size Size of the stride array (input) + * @param padding Padding array to be added on input (input) + * @param padding_size Size of the padding array (input) + * @param dilation Dilation array control the stride of the elements within the + * neighborhood (input) + * @param dilation_size Size of the dilation array (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenFoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size); + +/*!
@brief Execute an unfold forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for data input tensor input (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for data output tensor output (input) + * @param output Data tensor output (output) + * @param kernel_size Size of the sliding box array (input) + * @param kernel_size_size Size of the kernel_size array (input) + * @param stride Stride array of the sliding box (input) + * @param stride_size Size of the stride array (input) + * @param padding Padding array to be added on input (input) + * @param padding_size Size of the padding array (input) + * @param dilation Dilation array control the stride of the elements within the + * neighborhood (input) + * @param dilation_size Size of the dilation array (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size); + +/*! 
@brief Execute an unfold backward layer + * + * @param handle MIOpen handle (input) + * @param dinputDesc Tensor descriptor for data input grad tensor (input) + * @param dinput Data tensor input grad (output) + * @param doutputDesc Tensor descriptor for data output grad tensor (input) + * @param doutput Data tensor output grad (input) + * @param kernel_size Size of the sliding box array (input) + * @param kernel_size_size Size of the kernel_size array (input) + * @param stride Stride array of the sliding box (input) + * @param stride_size Size of the stride array (input) + * @param padding Padding array to be added on input (input) + * @param padding_size Size of the padding array (input) + * @param dilation Dilation array control the stride of the elements within the + neighborhood (input) + * @param dilation_size Size of the dilation array (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size); + +/** @} */ +// CLOSEOUT FOLD DOXYGEN GROUP +#endif + #ifdef MIOPEN_BETA_API // RotaryPositionalEmbeddings APIs /** @addtogroup RotaryPositionalEmbeddings diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d2f84f43b8..dbba92f114 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -122,6 +122,8 @@ set( MIOpen_Source expanduser.cpp find_controls.cpp find_db.cpp + fold_api.cpp + fold/problem_description.cpp fused_api.cpp fusion.cpp fusion/problem_description.cpp @@ -287,6 +289,10 @@ set( MIOpen_Source solver/conv_ck_igemm_fwd_bias_res_add_activ_fused.cpp solver/conv_ocl_dir2Dfwd_fused.cpp solver/conv_winoRxS_fused.cpp + 
solver/fold/fold_forward.cpp + solver/fold/fold_backward.cpp + solver/fold/unfold_forward.cpp + solver/fold/unfold_backward.cpp solver/groupnorm/forward_groupnorm.cpp solver/getitem/backward_getitem.cpp solver/layernorm/backward_t5layernorm.cpp @@ -552,6 +558,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/conv7x7c3h224w224k64u2v2p3q3f1.s kernels/xform_out.s kernels/gcnAsmBNBwdTrainSpatial.s + kernels/MIOpenUnfold.cpp kernels/MIOpenTensorKernels.cl kernels/MIOpenTensorKernelsHip.cpp kernels/MIOpenSubTensorOpWithScalarKernel.cl @@ -634,6 +641,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN adam.cpp addlayernorm.cpp cat.cpp + fold.cpp groupnorm.cpp getitem.cpp kernel_cache.cpp diff --git a/src/fold.cpp b/src/fold.cpp new file mode 100644 index 0000000000..0c30529c99 --- /dev/null +++ b/src/fold.cpp @@ -0,0 +1,240 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t UnfoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + const auto problem = fold::UnfoldFwdProblemDescription{inputDesc, + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"UnfoldFwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t UnfoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int32_t kernel_size_size, 
+ const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + const auto problem = fold::UnfoldBwdProblemDescription{dinputDesc, + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"UnfoldBwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t FoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + const auto problem = fold::FoldFwdProblemDescription{inputDesc, + outputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.input = input; + tmp.output = output; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size 
= kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"FoldFwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t FoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + const auto problem = fold::FoldBwdProblemDescription{dinputDesc, + doutputDesc, + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size}; + + const auto invoke_params = [&]() { + auto tmp = fold::InvokeParams{}; + tmp.type = InvokeType::Run; + tmp.dinputDesc = &dinputDesc; + tmp.doutputDesc = &doutputDesc; + tmp.dinput = dinput; + tmp.doutput = doutput; + tmp.kernel_size = kernel_size; + tmp.stride = stride; + tmp.padding = padding; + tmp.dilation = dilation; + tmp.kernel_size_size = kernel_size_size; + tmp.stride_size = stride_size; + tmp.padding_size = padding_size; + tmp.dilation_size = dilation_size; + return tmp; + }(); + + const auto algo = AlgorithmName{"FoldBwd"}; + const auto solvers = solver::SolverContainer{}; + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/fold/problem_description.cpp b/src/fold/problem_description.cpp new file mode 100644 index 0000000000..a59b460ed7 --- /dev/null +++ b/src/fold/problem_description.cpp @@ -0,0 +1,146 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include + +namespace miopen { + +namespace fold { + +NetworkConfig UnfoldFwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Unfold_fwd"; + ss << "i_dtype" << input_dtype; + ss << "size" << size; + ss << "in_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig UnfoldBwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = dinputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Unfold_bwd"; + ss << "i_dtype" << input_dtype; + ss << "size" << size; + ss << "in_grad_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig FoldFwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = inputDesc.GetType(); + auto size = inputDesc.GetElementSize(); + auto in_dims = inputDesc.GetLengths(); + auto out_dims = outputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Fold_fwd"; + ss << "i_dtype" << input_dtype; + ss << "size" << size; + ss << "in_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "out_dims"; + for(auto val : out_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << 
kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig FoldBwdProblemDescription::MakeNetworkConfig() const +{ + auto input_dtype = dinputDesc.GetType(); + auto size = dinputDesc.GetElementSize(); + auto in_dims = dinputDesc.GetLengths(); + auto out_dims = doutputDesc.GetLengths(); + + std::ostringstream ss; + + ss << "Fold_bwd"; + ss << "i_dtype" << input_dtype; + ss << "size" << size; + ss << "in_grad_dims"; + for(auto val : in_dims) + { + ss << "_" << val; + } + ss << "out_grad_dims"; + for(auto val : out_dims) + { + ss << "_" << val; + } + ss << "kernel_size_" << kernel_size[0] << "_" << kernel_size[1]; + ss << "stride_" << stride[0] << "_" << stride[1]; + ss << "padding_" << padding[0] << "_" << padding[1]; + ss << "dilation_" << dilation[0] << "_" << dilation[1]; + + return NetworkConfig{ss.str()}; +} + +} // namespace fold + +} // namespace miopen diff --git a/src/fold_api.cpp b/src/fold_api.cpp new file mode 100644 index 0000000000..6c15380db6 --- /dev/null +++ b/src/fold_api.cpp @@ -0,0 +1,156 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include + +extern "C" miopenStatus_t miopenUnfoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + return miopen::try_([&] { + miopen::UnfoldForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} + +extern "C" miopenStatus_t miopenUnfoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + return miopen::try_([&] { + miopen::UnfoldBackward(miopen::deref(handle), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + 
kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} + +extern "C" miopenStatus_t miopenFoldForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + return miopen::try_([&] { + miopen::FoldForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} + +extern "C" miopenStatus_t miopenFoldBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dinputDesc, + void* dinput, + const miopenTensorDescriptor_t doutputDesc, + const void* doutput, + const int32_t* kernel_size, + const int32_t kernel_size_size, + const int32_t* stride, + const int32_t stride_size, + const int32_t* padding, + const int32_t padding_size, + const int32_t* dilation, + const int32_t dilation_size) +{ + return miopen::try_([&] { + miopen::FoldBackward(miopen::deref(handle), + miopen::deref(dinputDesc), + DataCast(dinput), + miopen::deref(doutputDesc), + DataCast(doutput), + kernel_size, + kernel_size_size, + stride, + stride_size, + padding, + padding_size, + dilation, + dilation_size); + }); +} diff --git a/src/include/miopen/fold.hpp b/src/include/miopen/fold.hpp new file mode 100644 index 0000000000..9a8b46bb56 --- /dev/null +++ b/src/include/miopen/fold.hpp @@ -0,0 +1,92 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MIOPEN_FOLD_HPP_ +#define MIOPEN_FOLD_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t UnfoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t FoldBackward(Handle& handle, + const TensorDescriptor& dinputDesc, + Data_t dinput, + const TensorDescriptor& doutputDesc, + ConstData_t doutput, + const int32_t* kernel_size, + int32_t kernel_size_size, + const int32_t* stride, + int32_t stride_size, + const int32_t* padding, + int32_t padding_size, + const int32_t* dilation, + int32_t dilation_size); +} // namespace miopen +#endif // MIOPEN_FOLD_HPP_ diff --git a/src/include/miopen/fold/invoke_params.hpp b/src/include/miopen/fold/invoke_params.hpp new file mode 100644 index 0000000000..246ccfb401 
--- /dev/null +++ b/src/include/miopen/fold/invoke_params.hpp @@ -0,0 +1,67 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/common.hpp>
+#include <miopen/invoke_params.hpp>
+#include <miopen/tensor.hpp>
+
+#include <cstdint>
+
+namespace miopen {
+
+namespace fold {
+
+struct InvokeParams : public miopen::InvokeParams
+{
+    InvokeParams() = default;
+
+    const TensorDescriptor* inputDesc  = nullptr;
+    const TensorDescriptor* outputDesc = nullptr;
+    ConstData_t input                  = nullptr;
+    Data_t output                      = nullptr;
+
+    const TensorDescriptor* dinputDesc  = nullptr;
+    const TensorDescriptor* doutputDesc = nullptr;
+    Data_t dinput                       = nullptr;
+    ConstData_t doutput                 = nullptr;
+
+    const int32_t* kernel_size = nullptr;
+    const int32_t* stride      = nullptr;
+    const int32_t* padding     = nullptr;
+    const int32_t* dilation    = nullptr;
+    int32_t kernel_size_size   = 0;
+    int32_t stride_size        = 0;
+    int32_t padding_size       = 0;
+    int32_t dilation_size      = 0;
+
+    std::size_t GetWorkspaceSize() const { return 0; }
+    Data_t GetWorkspace() const { return nullptr; }
+};
+
+} // namespace fold
+
+} // namespace miopen
diff --git a/src/include/miopen/fold/problem_description.hpp b/src/include/miopen/fold/problem_description.hpp
new file mode 100644
index 0000000000..165aba97b6
--- /dev/null
+++ b/src/include/miopen/fold/problem_description.hpp
@@ -0,0 +1,421 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+#pragma once
+
+#include "miopen/errors.hpp"
+#include <miopen/problem_description_base.hpp>
+#include <miopen/activ.hpp>
+#include <miopen/tensor.hpp>
+#include <miopen/tensor_view_utils.hpp>
+
+#include <cassert>
+#include <string>
+
+namespace miopen {
+
+struct NetworkConfig;
+
+namespace fold {
+
+bool checkSameLength(const TensorDescriptor& x, const TensorDescriptor& y);
+
+struct UnfoldFwdProblemDescription : ProblemDescriptionBase
+{
+    UnfoldFwdProblemDescription(const TensorDescriptor& inputDesc_,
+                                const TensorDescriptor& outputDesc_,
+                                const int32_t* kernel_size_,
+                                const int32_t kernel_size_size_,
+                                const int32_t* stride_,
+                                const int32_t stride_size_,
+                                const int32_t* padding_,
+                                const int32_t padding_size_,
+                                const int32_t* dilation_,
+                                const int32_t dilation_size_)
+        : inputDesc(inputDesc_),
+          outputDesc(outputDesc_),
+          kernel_size(kernel_size_),
+          kernel_size_size(kernel_size_size_),
+          stride(stride_),
+          stride_size(stride_size_),
+          padding(padding_),
+          padding_size(padding_size_),
+          dilation(dilation_),
+          dilation_size(dilation_size_)
+    {
+        IsValidSize();
+        IsValidType();
+    }
+
+    bool IsValidSize() const
+    {
+        if(inputDesc.GetNumDims() != 4)
+        {
+#if MIOPEN_BUILD_DEV || !MIOPEN_NDEBUG
+            MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input tensor should be 4D.");
+#else
+            return false;
+#endif
+        }
+        int32_t spatial_dim_size = inputDesc.GetNumDims() - 2;
+        if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size ||
+           padding_size != spatial_dim_size || dilation_size != spatial_dim_size)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D");
+        }
+        auto input_dims = inputDesc.GetLengths();
+        const int32_t N = static_cast<int32_t>(input_dims[0]);
+        const int32_t C = static_cast<int32_t>(input_dims[1]);
+        int32_t P = 1, L = 1;
+        std::vector<int32_t> ls;
+        for(int i = 0; i < spatial_dim_size; ++i)
+        {
+            P *= kernel_size[i];
+            int32_t l = (static_cast<int32_t>(input_dims[i + 2]) + 2 * padding[i] -
+                         dilation[i] * (kernel_size[i] - 1) - 1) /
+                            stride[i] +
+                        1;
+            L *= l;
+
            ls.push_back(l);
+        }
+        std::vector<size_t> output_dims_desired{
+            static_cast<size_t>(N), static_cast<size_t>(C * P), static_cast<size_t>(L)};
+        auto output_dims = outputDesc.GetLengths();
+        if(output_dims != output_dims_desired)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output dimension");
+        }
+        return true;
+    }
+
+    bool IsValidType() const
+    {
+        if(inputDesc.GetType() != outputDesc.GetType())
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "Unfold: The input tensor and output tensor has mismatch type.");
+        }
+        return true;
+    }
+
+    const TensorDescriptor& GetInputDesc() const { return inputDesc; }
+    const TensorDescriptor& GetOutputDesc() const { return outputDesc; }
+
+    NetworkConfig MakeNetworkConfig() const override;
+
+public:
+    TensorDescriptor inputDesc;
+    TensorDescriptor outputDesc;
+    const int32_t* kernel_size;
+    const int32_t kernel_size_size;
+    const int32_t* stride;
+    const int32_t stride_size;
+    const int32_t* padding;
+    const int32_t padding_size;
+    const int32_t* dilation;
+    const int32_t dilation_size;
+};
+
+struct UnfoldBwdProblemDescription : ProblemDescriptionBase
+{
+    UnfoldBwdProblemDescription(const TensorDescriptor& dinputDesc_,
+                                const TensorDescriptor& doutputDesc_,
+                                const int32_t* kernel_size_,
+                                const int32_t kernel_size_size_,
+                                const int32_t* stride_,
+                                const int32_t stride_size_,
+                                const int32_t* padding_,
+                                const int32_t padding_size_,
+                                const int32_t* dilation_,
+                                const int32_t dilation_size_)
+        : dinputDesc(dinputDesc_),
+          doutputDesc(doutputDesc_),
+          kernel_size(kernel_size_),
+          kernel_size_size(kernel_size_size_),
+          stride(stride_),
+          stride_size(stride_size_),
+          padding(padding_),
+          padding_size(padding_size_),
+          dilation(dilation_),
+          dilation_size(dilation_size_)
+    {
+        IsValidSize();
+        IsValidType();
+    }
+
+    bool IsValidSize() const
+    {
+        if(dinputDesc.GetNumDims() != 4)
+        {
+            MIOPEN_THROW(miopenStatusBadParm, "Unfold: The input gradient tensor should be 4D.");
+        }
+        int32_t spatial_dim_size = dinputDesc.GetNumDims() - 2;
+        if(kernel_size_size != 
spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) + { + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Argument length should be 2D"); + } + auto input_dims = dinputDesc.GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector output_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; + auto output_dims = doutputDesc.GetLengths(); + if(output_dims != output_dims_desired) + { + MIOPEN_THROW(miopenStatusBadParm, "Unfold: Invalid output gradient dimension"); + } + return true; + } + + bool IsValidType() const + { + if(dinputDesc.GetType() != doutputDesc.GetType()) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Unfold: The input gradient tensor and output gradient tensor has mismatch type."); + } + return true; + } + + const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } + const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor dinputDesc; + TensorDescriptor doutputDesc; + const int32_t* kernel_size; + const int32_t kernel_size_size; + const int32_t* stride; + const int32_t stride_size; + const int32_t* padding; + const int32_t padding_size; + const int32_t* dilation; + const int32_t dilation_size; +}; + +struct FoldFwdProblemDescription : ProblemDescriptionBase +{ + FoldFwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const int32_t* kernel_size_, + const int32_t kernel_size_size_, + const int32_t* stride_, + const int32_t stride_size_, + const int32_t* padding_, + const int32_t 
padding_size_, + const int32_t* dilation_, + const int32_t dilation_size_) + : inputDesc(inputDesc_), + outputDesc(outputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(outputDesc.GetNumDims() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, "Fold: The output tensor should be 4D."); + } + int32_t spatial_dim_size = outputDesc.GetNumDims() - 2; + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) + { + MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); + } + auto input_dims = inputDesc.GetLengths(); + auto output_dims = outputDesc.GetLengths(); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector input_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; + if(input_dims != input_dims_desired) + { + MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input dimension"); + } + return true; + } + + bool IsValidType() const + { + if(inputDesc.GetType() != outputDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "Fold: The input tensor and output tensor has mismatch type."); + } + return true; + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor inputDesc; + TensorDescriptor 
outputDesc; + const int32_t* kernel_size; + const int32_t kernel_size_size; + const int32_t* stride; + const int32_t stride_size; + const int32_t* padding; + const int32_t padding_size; + const int32_t* dilation; + const int32_t dilation_size; +}; + +struct FoldBwdProblemDescription : ProblemDescriptionBase +{ + FoldBwdProblemDescription(const TensorDescriptor& dinputDesc_, + const TensorDescriptor& doutputDesc_, + const int32_t* kernel_size_, + const int32_t kernel_size_size_, + const int32_t* stride_, + const int32_t stride_size_, + const int32_t* padding_, + const int32_t padding_size_, + const int32_t* dilation_, + const int32_t dilation_size_) + : dinputDesc(dinputDesc_), + doutputDesc(doutputDesc_), + kernel_size(kernel_size_), + kernel_size_size(kernel_size_size_), + stride(stride_), + stride_size(stride_size_), + padding(padding_), + padding_size(padding_size_), + dilation(dilation_), + dilation_size(dilation_size_) + { + IsValidSize(); + IsValidType(); + } + + bool IsValidSize() const + { + if(doutputDesc.GetNumDims() != 4) + { + MIOPEN_THROW(miopenStatusBadParm, "Fold: The output gradient tensor should be 4D."); + } + int32_t spatial_dim_size = doutputDesc.GetNumDims() - 2; + if(kernel_size_size != spatial_dim_size || stride_size != spatial_dim_size || + padding_size != spatial_dim_size || dilation_size != spatial_dim_size) + { + MIOPEN_THROW(miopenStatusBadParm, "Fold: Argument length should be 2D"); + } + auto input_dims = dinputDesc.GetLengths(); + auto output_dims = doutputDesc.GetLengths(); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(output_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + std::vector input_dims_desired{ + static_cast(N), static_cast(C * P), static_cast(L)}; + 
if(input_dims != input_dims_desired) + { + MIOPEN_THROW(miopenStatusBadParm, "Fold: Invalid input gradient dimension"); + } + return true; + } + + bool IsValidType() const + { + if(dinputDesc.GetType() != doutputDesc.GetType()) + { + MIOPEN_THROW( + miopenStatusBadParm, + "Fold: The input gradient tensor and output gradient tensor has mismatch type."); + } + return true; + } + + const TensorDescriptor& GetDinputDesc() const { return dinputDesc; } + const TensorDescriptor& GetDoutputDesc() const { return doutputDesc; } + + NetworkConfig MakeNetworkConfig() const override; + +public: + TensorDescriptor dinputDesc; + TensorDescriptor doutputDesc; + const int32_t* kernel_size; + const int32_t kernel_size_size; + const int32_t* stride; + const int32_t stride_size; + const int32_t* padding; + const int32_t padding_size; + const int32_t* dilation; + const int32_t dilation_size; +}; + +} // namespace fold + +} // namespace miopen diff --git a/src/include/miopen/fold/solvers.hpp b/src/include/miopen/fold/solvers.hpp new file mode 100644 index 0000000000..1ff3ef7566 --- /dev/null +++ b/src/include/miopen/fold/solvers.hpp @@ -0,0 +1,101 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+#pragma once
+
+#include <miopen/solver.hpp>
+#include <miopen/fold/problem_description.hpp>
+
+#include <utility>
+
+namespace miopen {
+
+namespace solver {
+
+namespace fold {
+
+using UnfoldFwdSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::fold::UnfoldFwdProblemDescription>;
+
+struct UnfoldFwd final : UnfoldFwdSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<UnfoldFwd>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::fold::UnfoldFwdProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::fold::UnfoldFwdProblemDescription& problem) const override;
+};
+
+using UnfoldBwdSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::fold::UnfoldBwdProblemDescription>;
+
+struct UnfoldBwd final : UnfoldBwdSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<UnfoldBwd>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::fold::UnfoldBwdProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::fold::UnfoldBwdProblemDescription& problem) const override;
+};
+
+using FoldFwdSolverBase =
+    NonTunableSolverBase<ExecutionContext, miopen::fold::FoldFwdProblemDescription>;
+
+struct FoldFwd final : FoldFwdSolverBase
+{
+    const std::string& SolverDbId() const override { return GetSolverDbId<FoldFwd>(); }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::fold::FoldFwdProblemDescription& problem) const override;
+
+    ConvSolution GetSolution(const ExecutionContext& context,
+                             const 
miopen::fold::FoldFwdProblemDescription& problem) const override; +}; + +using FoldBwdSolverBase = + NonTunableSolverBase; + +struct FoldBwd final : FoldBwdSolverBase +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const override; +}; + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index fb81bafb5c..ca39b8e000 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,7 +60,9 @@ enum class Primitive Softmax, Adam, Item, - RoPE + RoPE, + Fold, + Unfold }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..4ac9196a8d 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -27,8 +27,8 @@ #ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_ #define MIOPEN_TENSOR_VIEW_UTIL_HPP_ -#include #include "../../kernels/tensor_view.hpp" +#include "miopen/tensor.hpp" namespace miopen { @@ -38,7 +38,7 @@ inline tensor_view_t get_inner_expanded_tv(const TensorDescriptor Desc) auto dims = Desc.GetLengths(); auto strides = Desc.GetStrides(); - tensor_view_t tensor_view; + tensor_view_t tensor_view{}; for(size_t i = 0; i < N; ++i) { if(i < dims.size()) diff --git a/src/kernels/MIOpenUnfold.cpp b/src/kernels/MIOpenUnfold.cpp new file mode 100644 index 0000000000..0e3f33e2f6 --- /dev/null +++ b/src/kernels/MIOpenUnfold.cpp @@ -0,0 +1,233 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +template +__device__ void unfoldForward4D(const DTYPE* __restrict__ input, + DTYPE* __restrict__ output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) +{ + /* + * input = {N, C, H, W}, output = {N, C * P, L} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * P * L, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ + + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; // position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; + + DTYPE x = 0; + if(0 <= h && h < H && 0 <= w && w < W) + { + tensor_layout_t<4> input_layout({n, c, h, w}); + x = input[input_tv.get_tensor_view_idx(input_layout)]; + } + tensor_layout_t<3> output_layout({n, c * P + p, l}); + output[output_tv.get_tensor_view_idx(output_layout)] = x; +} + +extern "C" __global__ void UnfoldForward4D(const FLOAT* __restrict__ input, + FLOAT* __restrict__ output, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<4> input_tv, + tensor_view_t<3> output_tv) +{ + unfoldForward4D(input, + output, + N, + C, + H, + W, + P, + L, 
+ LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); +} + +template +__device__ void unfoldBackward4D(const DTYPE* __restrict__ output_grad, + DTYPE* __restrict__ input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + /* + * output_grad = {N, C * P, L}, input_grad = {N, C, H, W} + * where P = kernel_size_h * kernel_size_w, L = # of blocks (see host code for + * formula) + * => gws = {ceil(N * C * H * W, LOCAL_SIZE)}, lws = {LOCAL_SIZE} + */ + + const int gid = threadIdx.x + blockIdx.x * blockDim.x; + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + FLOAT_ACCUM sum = 0.0f; + for(int ph = 0; ph < kernel_size_h; ++ph) + { + for(int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + tensor_layout_t<3> output_grad_layout( + {n, c * P + (ph * kernel_size_w + pw), lh * LW + lw}); + sum += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx(output_grad_layout)]); + } + } + tensor_layout_t<4> input_grad_layout({n, c, h, w}); + input_grad[input_grad_tv.get_tensor_view_idx(input_grad_layout)] = CVT_ACCUM2FLOAT(sum); +} + +extern "C" __global__ void UnfoldBackward4D(const FLOAT* __restrict__ output_grad, + FLOAT* __restrict__ input_grad, + int N, + int C, + int H, + int W, + int P, + int L, + int LH, + int LW, + int kernel_size_h, + int kernel_size_w, + int stride_h, + 
int stride_w, + int padding_h, + int padding_w, + int dilation_h, + int dilation_w, + tensor_view_t<3> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + unfoldBackward4D(output_grad, + input_grad, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -72,6 +74,15 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(std::initializer_list layout_) + { + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } + } + uint64_t layout[N]; }; diff --git a/src/solver.cpp b/src/solver.cpp index bbd13bd89f..ae443f29d5 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -664,7 +665,6 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Cat, cat::CatForward{}.SolverDbId()); Register(registry, ++id, Primitive::Adam, adam::Adam{}.SolverDbId()); - Register(registry, ++id, Primitive::Item, getitem::GetitemBackward{}.SolverDbId()); Register(registry, ++id, Primitive::Adam, adam::TransformersAdamW{}.SolverDbId()); @@ -676,6 +676,11 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); + + Register(registry, ++id, Primitive::Unfold, fold::UnfoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Unfold, fold::UnfoldBwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Fold, 
fold::FoldFwd{}.SolverDbId()); + Register(registry, ++id, Primitive::Fold, fold::FoldBwd{}.SolverDbId()); // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/fold/fold_backward.cpp b/src/solver/fold/fold_backward.cpp new file mode 100644 index 0000000000..edfa5649a0 --- /dev/null +++ b/src/solver/fold/fold_backward.cpp @@ -0,0 +1,160 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool FoldBwd::IsApplicable( + [[maybe_unused]] const ExecutionContext&, + [[maybe_unused]] const miopen::fold::FoldBwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution FoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::FoldBwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); + + const int32_t N = static_cast(output_grad_dims[0]); + const int32_t C = static_cast(output_grad_dims[1]); + int32_t spatial_dim_size = output_grad_dims.size() - 2; + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= problem.kernel_size[i]; + int32_t l = (static_cast(output_grad_dims[i + 2]) + 2 * problem.padding[i] - + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t 
ylocalsize = 1;
+    size_t ygridsize  = 1;
+    size_t zlocalsize = 1;
+    size_t zgridsize  = 1;
+    kernel.l_wk.push_back(xlocalsize);
+    kernel.l_wk.push_back(ylocalsize);
+    kernel.l_wk.push_back(zlocalsize);
+
+    kernel.g_wk.push_back(xgridsize);
+    kernel.g_wk.push_back(ygridsize);
+    kernel.g_wk.push_back(zgridsize);
+
+    result.construction_params.push_back(kernel);
+
+    result.invoker_factory = [N, C, P, L, ls](const std::vector<Kernel>& kernels) {
+        return [=](const Handle& handle_, const AnyInvokeParams& raw_params) {
+            decltype(auto) kernel = handle_.Run(kernels.front());
+            decltype(auto) params = raw_params.CastTo<miopen::fold::InvokeParams>();
+
+            auto input_grad_tv    = get_inner_expanded_tv<3>(deref(params.dinputDesc));
+            auto output_grad_tv   = get_inner_expanded_tv<4>(deref(params.doutputDesc));
+            auto input_grad_dims  = deref(params.dinputDesc).GetLengths();
+            auto output_grad_dims = deref(params.doutputDesc).GetLengths();
+
+            int32_t kernel_size_h = params.kernel_size[0];
+            int32_t kernel_size_w = params.kernel_size[1];
+            int32_t stride_h      = params.stride[0];
+            int32_t stride_w      = params.stride[1];
+            int32_t padding_h     = params.padding[0];
+            int32_t padding_w     = params.padding[1];
+            int32_t dilation_h    = params.dilation[0];
+            int32_t dilation_w    = params.dilation[1];
+            int32_t LH            = ls[0];
+            int32_t LW            = ls[1];
+            int32_t H             = static_cast<int32_t>(output_grad_dims[2]);
+            int32_t W             = static_cast<int32_t>(output_grad_dims[3]);
+
+            kernel(params.doutput,
+                   params.dinput,
+                   N,
+                   C,
+                   H,
+                   W,
+                   P,
+                   L,
+                   LH,
+                   LW,
+                   kernel_size_h,
+                   kernel_size_w,
+                   stride_h,
+                   stride_w,
+                   padding_h,
+                   padding_w,
+                   dilation_h,
+                   dilation_w,
+                   output_grad_tv,
+                   input_grad_tv);
+        };
+    };
+
+    return result;
+}
+
+} // namespace fold
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/solver/fold/fold_forward.cpp b/src/solver/fold/fold_forward.cpp
new file mode 100644
index 0000000000..585a21a0e2
--- /dev/null
+++ b/src/solver/fold/fold_forward.cpp
@@ -0,0 +1,166 @@
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool FoldFwd::IsApplicable( + [[maybe_unused]] const ExecutionContext&, + [[maybe_unused]] const miopen::fold::FoldFwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution FoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::FoldFwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + auto input_dims = problem.GetInputDesc().GetLengths(); + + auto output_dims = problem.GetOutputDesc().GetLengths(); + const int32_t N = static_cast(output_dims[0]); + const int32_t C = static_cast(output_dims[1]); + int32_t H = static_cast(output_dims[2]); + int32_t W = static_cast(output_dims[3]); + + { + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldBackward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * H * W, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + 
kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [N, C](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_tv = get_inner_expanded_tv<3>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + auto input_dims = deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); + + int32_t spatial_dim_size = output_dims.size() - 2; + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= params.kernel_size[i]; + int32_t l = (static_cast(output_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(output_dims[2]); + int32_t W = static_cast(output_dims[3]); + + kernel(params.input, + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/fold/unfold_backward.cpp b/src/solver/fold/unfold_backward.cpp new file mode 100644 index 0000000000..b9b49c2799 --- /dev/null +++ b/src/solver/fold/unfold_backward.cpp @@ -0,0 +1,163 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool UnfoldBwd::IsApplicable( + [[maybe_unused]] const ExecutionContext&, + [[maybe_unused]] const miopen::fold::UnfoldBwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution UnfoldBwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::UnfoldBwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetDinputDesc().GetType()); + auto dtype = problem.GetDoutputDesc().GetType(); + auto input_grad_dims = problem.GetDinputDesc().GetLengths(); + auto output_grad_dims = problem.GetDoutputDesc().GetLengths(); + + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + + { + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldBackward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * H * W, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; + size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + 
kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [N, C, H, W](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.dinputDesc)); + auto output_grad_tv = get_inner_expanded_tv<3>(deref(params.doutputDesc)); + auto input_grad_dims = deref(params.dinputDesc).GetLengths(); + auto output_grad_dims = deref(params.doutputDesc).GetLengths(); + + int spatial_dim_size = input_grad_dims.size() - 2; + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= params.kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * params.padding[i] - + params.dilation[i] * (params.kernel_size[i] - 1) - 1) / + params.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + + kernel(params.doutput, + params.dinput, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/fold/unfold_forward.cpp b/src/solver/fold/unfold_forward.cpp new file mode 100644 index 0000000000..154c51c2fc --- /dev/null +++ b/src/solver/fold/unfold_forward.cpp @@ -0,0 +1,160 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOCAL_SIZE 256 + +namespace miopen { + +namespace solver { + +namespace fold { + +bool UnfoldFwd::IsApplicable( + [[maybe_unused]] const ExecutionContext&, + [[maybe_unused]] const miopen::fold::UnfoldFwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution UnfoldFwd::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::fold::UnfoldFwdProblemDescription& problem) const +{ + std::ignore = context; + auto result = ConvSolution{miopenStatusSuccess}; + + auto in_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + auto input_dims = problem.GetInputDesc().GetLengths(); + + auto output_dims = problem.GetOutputDesc().GetLengths(); + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + int spatial_dim_size = input_dims.size() - 2; + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= problem.kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * problem.padding[i] - + problem.dilation[i] * (problem.kernel_size[i] - 1) - 1) / + problem.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + auto kernel = KernelInfo{}; + kernel.kernel_file = "MIOpenUnfold.cpp"; + kernel.kernel_name = "UnfoldForward4D"; + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + }; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + + size_t xlocalsize = LOCAL_SIZE; + size_t xgridsize = AlignUp(N * C * P * L, LOCAL_SIZE); + size_t ylocalsize = 1; + size_t ygridsize = 1; 
+ size_t zlocalsize = 1; + size_t zgridsize = 1; + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + + result.invoker_factory = [N, C, P, L, ls](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<3>(deref(params.outputDesc)); + auto input_dims = deref(params.inputDesc).GetLengths(); + auto output_dims = deref(params.outputDesc).GetLengths(); + + int32_t kernel_size_h = params.kernel_size[0]; + int32_t kernel_size_w = params.kernel_size[1]; + int32_t stride_h = params.stride[0]; + int32_t stride_w = params.stride[1]; + int32_t padding_h = params.padding[0]; + int32_t padding_w = params.padding[1]; + int32_t dilation_h = params.dilation[0]; + int32_t dilation_w = params.dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + + kernel(params.input, + params.output, + N, + C, + H, + W, + P, + L, + LH, + LW, + kernel_size_h, + kernel_size_w, + stride_h, + stride_w, + padding_h, + padding_w, + dilation_h, + dilation_w, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace fold + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_unfold.hpp b/test/cpu_unfold.hpp new file mode 100644 index 0000000000..201917d4e0 --- /dev/null +++ b/test/cpu_unfold.hpp @@ -0,0 +1,193 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_UNFOLD_HPP +#define GUARD_CPU_UNFOLD_HPP + +#pragma once + +#include "miopen/tensor.hpp" +#include "tensor_holder.hpp" +#include "tensor_view.hpp" +#include "miopen/tensor_view_utils.hpp" + +template +void cpu_unfold_fwd_4d(tensor input_tensor, + tensor& ref_output_tensor, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) +{ + auto input_tv = miopen::get_inner_expanded_tv<4>(input_tensor.desc); + auto output_tv = miopen::get_inner_expanded_tv<3>(ref_output_tensor.desc); + auto input_size = input_tensor.desc.GetNumDims(); + auto input_dims = input_tensor.desc.GetLengths(); + + auto input = input_tensor.data.data(); + auto output = ref_output_tensor.data.data(); + + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + + const int32_t N = static_cast(input_dims[0]); + const int32_t C = static_cast(input_dims[1]); + + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_dims[2]); + int32_t W = static_cast(input_dims[3]); + int work_size = (((N * C * P * L) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int ncp = gid / L, l = gid % L; + int nc = ncp / P, p = ncp % P; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + int lh = l / LW, lw = l % LW; // sliding window position + int ph = p / kernel_size_w, pw = p % kernel_size_w; 
// position inside kernel + int h = lh * stride_h - padding_h + ph * dilation_h; + int w = lw * stride_w - padding_w + pw * dilation_w; + + T x = static_cast(0.0f); + if(0 <= h && h < H && 0 <= w && w < W) + { + long input_idx = input_tv.stride[3] * w + input_tv.stride[2] * h + + input_tv.stride[1] * c + input_tv.stride[0] * n; + x = input[input_idx]; + } + + long output_idx = + output_tv.stride[2] * l + output_tv.stride[1] * (c * P + p) + output_tv.stride[0] * n; + output[output_idx] = x; + }); +} + +template +void cpu_unfold_bwd_4d(tensor& ref_dinput_tensor, + tensor doutput_tensor, + const std::vector kernel_size, + const std::vector stride, + const std::vector padding, + const std::vector dilation) +{ + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(ref_dinput_tensor.desc); + auto output_grad_tv = miopen::get_inner_expanded_tv<3>(doutput_tensor.desc); + auto input_size = ref_dinput_tensor.desc.GetNumDims(); + auto input_grad_dims = ref_dinput_tensor.desc.GetLengths(); + + auto input_grad = ref_dinput_tensor.data.data(); + auto output_grad = doutput_tensor.data.data(); + + const int LOCAL_SIZE = 256; + int spatial_dim_size = input_size - 2; + + const int32_t N = static_cast(input_grad_dims[0]); + const int32_t C = static_cast(input_grad_dims[1]); + + int32_t P = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= kernel_size[i]; + int32_t l = (static_cast(input_grad_dims[i + 2]) + 2 * padding[i] - + dilation[i] * (kernel_size[i] - 1) - 1) / + stride[i] + + 1; + ls.push_back(l); + } + + int32_t kernel_size_h = kernel_size[0]; + int32_t kernel_size_w = kernel_size[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + int32_t padding_h = padding[0]; + int32_t padding_w = padding[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t LH = ls[0]; + int32_t LW = ls[1]; + int32_t H = static_cast(input_grad_dims[2]); + int32_t W = static_cast(input_grad_dims[3]); + int work_size = (((N * C * H 
* W) + LOCAL_SIZE - 1) / LOCAL_SIZE) * LOCAL_SIZE; + par_ford(work_size)([&](int gid) { + int nch = gid / W, w = gid % W; + int nc = nch / H, h = nch % H; + int n = nc / C, c = nc % C; + if(n >= N) + return; + + float sum = 0.0f; + + for(int ph = 0; ph < kernel_size_h; ++ph) + { + for(int pw = 0; pw < kernel_size_w; ++pw) + { + int lhsh = h - ph * dilation_h + padding_h; + int lwsw = w - pw * dilation_w + padding_w; + if(lhsh % stride_h != 0) + continue; + if(lwsw % stride_w != 0) + continue; + int lh = lhsh / stride_h; + int lw = lwsw / stride_w; + if(lh < 0 || LH <= lh) + continue; + if(lw < 0 || LW <= lw) + continue; + long output_grad_idx = + output_grad_tv.stride[2] * (lh * LW + lw) + + output_grad_tv.stride[1] * (c * P + (ph * kernel_size_w + pw)) + + output_grad_tv.stride[0] * n; + sum += static_cast(output_grad[output_grad_idx]); + } + } + + long input_grad_idx = input_grad_tv.stride[3] * w + input_grad_tv.stride[2] * h + + input_grad_tv.stride[1] * c + input_grad_tv.stride[0] * n; + input_grad[input_grad_idx] = static_cast(sum); + }); +} +#endif diff --git a/test/gtest/fold.cpp b/test/gtest/fold.cpp new file mode 100644 index 0000000000..b3868bb0ec --- /dev/null +++ b/test/gtest/fold.cpp @@ -0,0 +1,166 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "fold.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace fold { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GPU_Fold_fwd_FP32 : FoldFwdTest +{ +}; + +struct GPU_Fold_fwd_FP16 : FoldFwdTest +{ +}; + +struct GPU_Fold_fwd_BFP16 : FoldFwdTest +{ +}; + +struct GPU_Fold_bwd_FP32 : FoldBwdTest +{ +}; + +struct GPU_Fold_bwd_FP16 : FoldBwdTest +{ +}; + +struct GPU_Fold_bwd_BFP16 : FoldBwdTest +{ +}; +}; // namespace fold + +using namespace fold; + +TEST_P(GPU_Fold_fwd_FP32, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_FP32, testing::ValuesIn(FoldTestConfigs())); + +TEST_P(GPU_Fold_fwd_FP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_FP16, testing::ValuesIn(FoldTestConfigs())); + +TEST_P(GPU_Fold_fwd_BFP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + 
RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_fwd_BFP16, testing::ValuesIn(FoldTestConfigs())); + +TEST_P(GPU_Fold_bwd_FP32, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_FP32, testing::ValuesIn(FoldTestConfigs())); + +TEST_P(GPU_Fold_bwd_FP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_FP16, testing::ValuesIn(FoldTestConfigs())); + +TEST_P(GPU_Fold_bwd_BFP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fold_bwd_BFP16, testing::ValuesIn(FoldTestConfigs())); diff --git a/test/gtest/fold.hpp b/test/gtest/fold.hpp new file mode 100644 index 0000000000..f43f30f346 --- /dev/null +++ b/test/gtest/fold.hpp @@ -0,0 +1,308 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTN OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTN WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "cpu_unfold.hpp" +#include "get_handle.hpp" +#include "miopen/allocator.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include + +struct FoldTestCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + std::vector outputSize; + std::vector kernelSize; + std::vector stride; + std::vector padding; + std::vector dilation; + bool isContiguous = true; + friend std::ostream& operator<<(std::ostream& os, const FoldTestCase& tc) + { + os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W; + os << " output_size:"; + for(const auto& outs : tc.outputSize) + os << outs << " "; + os << " kernel_size:"; + for(const auto& ks : tc.kernelSize) + os << ks << " "; + os << "stride:"; + for(const auto& s : tc.stride) + os << s << " "; + os << "padding:"; + for(const auto& p : tc.padding) + os << p << " "; + os << "dilation:"; + for(const auto& d : tc.dilation) + os << d << " "; + os << "isContiguous:" << std::boolalpha << tc.isContiguous; + return os; + } + + std::vector GetInput() + { + if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, D, H, W}); + } + else if((N != 0) && (C != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, H, W}); + } + else if((N != 0) && (C != 0) && (W != 0)) + { + return std::vector({N, C, W}); + } + else if((N != 0) && (W != 0)) + { + return std::vector({N, W}); + } + else if((N != 0)) + { + return std::vector({N}); + } + else + { + std::cout << "Error Input Tensor Lengths\n" << std::endl; + return std::vector({0}); + } + } + + std::vector ComputeStrides(std::vector inputDim) const + { + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + 
if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; + } +}; + +std::vector FoldTestConfigs() +{ + // clang-format: off + return { + {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 12}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 13}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, true}, + {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, true}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {4, 5}, {2, 2}, {1, 1}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {6, 11}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 12}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {7, 13}, {2, 2}, {2, 3}, {0, 0}, {1, 1}, false}, + {3, 3 * 3 * 4, 0, 0, 3 * 4, {5, 7}, {3, 4}, {1, 1}, {0, 0}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {2, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, false}, + {3, 3 * 2 * 2, 0, 0, 3 * 4, {5, 7}, {2, 2}, {1, 1}, {0, 0}, {2, 3}, false}, + }; + // clang-format: on +} + +template +struct FoldFwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); + + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_zero = [&](auto...) 
{ return 0; }; + input = tensor{in_dims, in_strides}.generate(gen_value); + const int32_t N = static_cast(in_dims[0]); + int32_t C = static_cast(in_dims[1]); + for(int32_t i : config.kernelSize) + { + C = C / i; + } + + std::vector out_dims{static_cast(N), + static_cast(C), + static_cast(config.outputSize[0]), + static_cast(config.outputSize[1])}; + + output = tensor{out_dims}.generate(gen_zero); + outputHost = tensor{out_dims}.generate(gen_zero); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::FoldForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_bwd_4d( + outputHost, input, config.kernelSize, config.stride, config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + auto error_output = miopen::rms_range(outputHost, output); + ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); + + EXPECT_LT(error_output, tolerance) << "Error forward output beyond tolerance Error: {" + << error_output << "}, Tolerance: " << tolerance; + } + FoldTestCase config; + + tensor input; + tensor output; + + tensor outputHost; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; +}; + +template +struct FoldBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); + + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_zero = [&](auto...) { return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + const int32_t N = static_cast(in_dims[0]); + int32_t C = static_cast(in_dims[1]); + for(int32_t i : config.kernelSize) + { + C = C / i; + } + + std::vector out_dims{static_cast(N), + static_cast(C), + static_cast(config.outputSize[0]), + static_cast(config.outputSize[1])}; + + doutput = tensor{out_dims}.generate(gen_value); + + dinput_dev = handle.Write(dinput.data); + doutput_dev = handle.Write(doutput.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::FoldBackward(handle, + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_fwd_4d( + doutput, dinputHost, config.kernelSize, config.stride, 
config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + auto error_dinput = miopen::rms_range(dinputHost, dinput); + ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); + + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; + } + + FoldTestCase config; + + tensor dinput; + tensor doutput; + + tensor dinputHost; + + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr doutput_dev; +}; diff --git a/test/gtest/unfold.cpp b/test/gtest/unfold.cpp new file mode 100644 index 0000000000..a5aead6d10 --- /dev/null +++ b/test/gtest/unfold.cpp @@ -0,0 +1,166 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "unfold.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace unfold { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GPU_Unfold_fwd_FP32 : UnfoldFwdTest +{ +}; + +struct GPU_Unfold_fwd_FP16 : UnfoldFwdTest +{ +}; + +struct GPU_Unfold_fwd_BFP16 : UnfoldFwdTest +{ +}; + +struct GPU_Unfold_bwd_FP32 : UnfoldBwdTest +{ +}; + +struct GPU_Unfold_bwd_FP16 : UnfoldBwdTest +{ +}; + +struct GPU_Unfold_bwd_BFP16 : UnfoldBwdTest +{ +}; +}; // namespace unfold + +using namespace unfold; + +TEST_P(GPU_Unfold_fwd_FP32, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(GPU_Unfold_fwd_FP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(GPU_Unfold_fwd_BFP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && 
env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_fwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(GPU_Unfold_bwd_FP32, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_FP32, testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(GPU_Unfold_bwd_FP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_FP16, testing::ValuesIn(UnfoldTestConfigs())); + +TEST_P(GPU_Unfold_bwd_BFP16, Test) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Full, GPU_Unfold_bwd_BFP16, testing::ValuesIn(UnfoldTestConfigs())); diff --git a/test/gtest/unfold.hpp b/test/gtest/unfold.hpp new file mode 100644 index 0000000000..2631722400 --- /dev/null +++ b/test/gtest/unfold.hpp @@ -0,0 +1,319 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTN OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTN WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "cpu_unfold.hpp" +#include "get_handle.hpp" +#include "miopen/allocator.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +struct UnfoldTestCase +{ + size_t N; + size_t C; + size_t D; + size_t H; + size_t W; + std::vector kernelSize; + std::vector stride; + std::vector padding; + std::vector dilation; + bool isContiguous = true; + friend std::ostream& operator<<(std::ostream& os, const UnfoldTestCase& tc) + { + os << "N:" << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H << " W:" << tc.W + << " kernel_size:"; + for(const auto& ks : tc.kernelSize) + os << ks << " "; + os << "stride:"; + for(const auto& s : tc.stride) + os << s << " "; + os << "padding:"; + for(const auto& p : tc.padding) + os << p << " "; + os << "dilation:"; + for(const auto& d : tc.dilation) + os << d << " "; + os << "isContiguous:" << std::boolalpha << tc.isContiguous; + return os; + } + + std::vector GetInput() + { + if((N != 0) && (C != 0) && (D != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, D, H, W}); + } + else if((N != 0) && (C != 0) && (H != 0) && (W != 0)) + { + return std::vector({N, C, H, W}); + } + else if((N != 0) && (C != 0) && (W != 0)) + { + return std::vector({N, C, W}); + } + else if((N != 0) && (W != 0)) + { + return std::vector({N, W}); + } + else if((N != 0)) + { + return std::vector({N}); + } + else + { + std::cout << "Error Input Tensor Lengths\n" << std::endl; + return std::vector({0}); + } + } + + std::vector ComputeStrides(std::vector inputDim) const + { + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; + } +}; + 
+std::vector UnfoldTestConfigs() +{ + // clang-format: off + return { + {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, true}, + {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, true}, + {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, true}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, true}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, true}, + {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, true}, + {2, 5, 0, 3, 4, {2, 3}, {1, 1}, {0, 0}, {1, 1}, false}, + {1, 3, 0, 10, 12, {4, 5}, {1, 1}, {0, 0}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {3, 2}, {0, 0}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {3, 2}, {1, 1}, false}, + {11, 13, 0, 17, 19, {3, 3}, {1, 1}, {0, 0}, {3, 2}, false}, + {11, 13, 0, 33, 37, {4, 3}, {2, 3}, {5, 2}, {3, 5}, false}, + }; + // clang-format: on +} + +template +struct UnfoldFwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); + + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_zero = [&](auto...) 
{ return 0; }; + input = tensor{in_dims, in_strides}.generate(gen_value); + + int spatial_dim_size = in_dims.size() - 2; + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= config.kernelSize[i]; + int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; + + output = tensor{out_dims}.generate(gen_zero); + outputHost = tensor{out_dims}.generate(gen_zero); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::UnfoldForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_fwd_4d( + input, outputHost, config.kernelSize, config.stride, config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + auto error_output = miopen::rms_range(outputHost, output); + ASSERT_EQ(miopen::range_distance(outputHost), miopen::range_distance(output)); + + EXPECT_TRUE(error_output < tolerance) << "Error forward output beyond tolerance Error: {" + << error_output << "}, Tolerance: " << tolerance; + } + + UnfoldTestCase config; + + tensor input; + tensor output; + + tensor outputHost; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; +}; + +template +struct UnfoldBwdTest : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + config = GetParam(); + + std::vector in_dims = config.GetInput(); + std::vector in_strides = config.ComputeStrides(in_dims); + + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + auto gen_zero = [&](auto...) { return 0; }; + dinput = tensor{in_dims, in_strides}.generate(gen_zero); + dinputHost = tensor{in_dims, in_strides}.generate(gen_zero); + + int spatial_dim_size = in_dims.size() - 2; + const int32_t N = static_cast(in_dims[0]); + const int32_t C = static_cast(in_dims[1]); + int32_t P = 1, L = 1; + std::vector ls; + for(int i = 0; i < spatial_dim_size; ++i) + { + P *= config.kernelSize[i]; + int32_t l = (static_cast(in_dims[i + 2]) + 2 * config.padding[i] - + config.dilation[i] * (config.kernelSize[i] - 1) - 1) / + config.stride[i] + + 1; + L *= l; + ls.push_back(l); + } + + std::vector out_dims{ + static_cast(N), static_cast(C * P), static_cast(L)}; + + doutput = tensor{out_dims}.generate(gen_value); + + dinput_dev = handle.Write(dinput.data); + doutput_dev = handle.Write(doutput.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + status = miopen::UnfoldBackward(handle, + dinput.desc, + dinput_dev.get(), + doutput.desc, + doutput_dev.get(), + config.kernelSize.data(), + static_cast(config.kernelSize.size()), + 
config.stride.data(), + static_cast(config.stride.size()), + config.padding.data(), + static_cast(config.padding.size()), + config.dilation.data(), + static_cast(config.dilation.size())); + + cpu_unfold_bwd_4d( + dinputHost, doutput, config.kernelSize, config.stride, config.padding, config.dilation); + + EXPECT_EQ(status, miopenStatusSuccess); + dinput.data = handle.Read(dinput_dev, dinput.data.size()); + } + + void Verify() + { + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + double tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. + if(std::is_same::value) + tolerance *= 8.0; + auto error_dinput = miopen::rms_range(dinputHost, dinput); + ASSERT_EQ(miopen::range_distance(dinputHost), miopen::range_distance(dinput)); + + EXPECT_LT(error_dinput, tolerance) << "Error backward input_grad beyond tolerance Error: {" + << error_dinput << "}, Tolerance: " << tolerance; + } + UnfoldTestCase config; + + tensor dinput; + tensor doutput; + + tensor dinputHost; + + miopen::Allocator::ManageDataPtr dinput_dev; + miopen::Allocator::ManageDataPtr doutput_dev; +};