diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 90e29ffaa9..9594e00ef0 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,3 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) + * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index cd663eb8b4..385580e2e1 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -32,6 +32,7 @@ add_executable(MIOpenDriver dm_activ.cpp dm_adam.cpp dm_addlayernorm.cpp + dm_avgpool.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp new file mode 100644 index 0000000000..ff7d04edd5 --- /dev/null +++ b/driver/avgpool_driver.hpp @@ -0,0 +1,575 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP +#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloAvgPoolHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class AvgPoolDriver : public Driver +{ +public: + AvgPoolDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + std::vector GetInputTensorDimsFromCmd(const char* param); + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AvgPoolDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + + std::unique_ptr input_dev; 
+ std::unique_ptr output_dev; + std::unique_ptr input_grad_dev; + std::unique_ptr output_grad_dev; + + std::vector input; + std::vector output; + std::vector output_host; + std::vector input_grad; + std::vector input_grad_host; + std::vector output_grad; + std::vector ksize; + std::vector stride; + std::vector padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + std::vector in_dim; +}; + +template +int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +{ + std::string lengthsStr = inflags.GetValueStr(param); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +int AvgPoolDriver::GetandSetData() +{ + in_dim = GetInputTensorDimsFromCmd("input_dims"); + int ksp_dim = in_dim.size() - 2; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); + + if(ksize.size() != ksp_dim) + { + int ref = ksp_dim - ksize.size(); + while((ref--) != 0) + ksize.push_back(ksize[0]); + } + if(stride.size() != ksp_dim) + { + int ref = ksp_dim - stride.size(); + while((ref--) != 0) + stride.push_back(stride[0]); + } + if(padding.size() != ksp_dim) + { + int ref = ksp_dim - padding.size(); + while((ref--) != 0) + padding.push_back(padding[0]); + } + + 
ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); + count_include_pad = static_cast(inflags.GetValueInt("count_include_pad")); + divisor_override = inflags.GetValueInt("divisor_override"); + + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = std::vector{N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = std::vector{N, C, OH, OW}; + } + SetTensorNd(inputDesc, in_dim, data_type); + SetTensorNd(outputDesc, out_dim, data_type); + SetTensorNd(outputGradDesc, out_dim, data_type); + SetTensorNd(inputGradDesc, in_dim, data_type); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "2,3,7,9", + "The dimensional lengths of the input tensor: N,C,D1,D2,... 
Example: 2,3,7,9.", + "string"); + inflags.AddInputFlag( + "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); + inflags.AddInputFlag( + "stride", + 's', + "1,1", + "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", + "string"); + inflags.AddInputFlag("padding", + 'p', + "0,0", + "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", + "string"); + inflags.AddInputFlag("ceil_mode", + 'c', + "1", + "When 1, will use ceil instead of floor to compute the output shape.", + "int"); + inflags.AddInputFlag("count_include_pad", + 'P', + "0", + "When 1, will include the zero-padding in the averaging calculation.", + "int"); + inflags.AddInputFlag("divisor_override", + 'd', + "0", + "If specified, it will be used as divisor, otherwise size of the pooling " + "region will be used.", + "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + + input = std::vector(input_sz, static_cast(0)); + output = std::vector(output_sz, static_cast(0)); + output_host = std::vector(output_sz, static_cast(0)); + + input_grad = std::vector(input_sz, static_cast(0)); + input_grad_host = std::vector(input_sz, 
static_cast(0)); + output_grad = std::vector(output_sz, static_cast(0)); + + int status; + + for(int i = 0; i < input_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + } + status = input_dev->ToGPU(q, input.data()); + + status |= output_dev->ToGPU(q, output.data()); + + status |= input_grad_dev->ToGPU(q, input_grad.data()); + + for(int i = 0; i < output_sz; i++) + { + output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + status |= output_grad_dev->ToGPU(q, output_grad.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? 
(kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + output_dev->FromGPU(GetStream(), output.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolForward2dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolForward3dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? 
padding[2] : padding[1], + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + input_grad_dev->FromGPU(GetStream(), input_grad.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +Tref AvgPoolDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AvgPoolDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(output_host, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(input_grad_host, input_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/dm_avgpool.cpp b/driver/dm_avgpool.cpp new file mode 100644 index 0000000000..ec0e457056 --- /dev/null +++ b/driver/dm_avgpool.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "avgpool_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "avgpool") + return new AvgPoolDriver(); + if(base_arg == "avgpoolfp16") + return new AvgPoolDriver(); + if(base_arg == "avgpoolbfp16") + return new AvgPoolDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index b23df690d1..bd42f6ee13 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,7 +206,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "--version") + arg != 
"reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && + arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp new file mode 100644 index 0000000000..6980ce968e --- /dev/null +++ b/driver/mloAvgPoolHost.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MLO_AVGPOOLHOST_H_ +#define MLO_AVGPOOLHOST_H_ + +#include +#include + +template +int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + 
if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t 
hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 
0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } + return 0; +} + +template +int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* ksize, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = 
ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } + return 0; +} + +#endif // MLO_AVGPOOLHOST_H_ diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 3b9bbeccc1..ea44de92d5 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,6 +70,7 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation + * @defgroup avgpool * 
*/ @@ -7621,6 +7622,76 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// avgpool APIs +/** @addtogroup avgpool + * + * @{ + */ + +/*! @brief Execute an avgpool forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override); + +/*! 
@brief Execute an avgpool backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override); +/** @} */ +// CLOSEOUT avgpool DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77acf3f7d3..ee36c92967 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,6 +89,8 @@ set( MIOpen_Source adam_api.cpp addlayernorm_api.cpp api/find2_0_commons.cpp + avgpool_api.cpp + avgpool/problem_description.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -191,6 +193,10 @@ set( MIOpen_Source solver/activ/fwd_1.cpp solver/adam/adam.cpp solver/adam/transformers_adam_w.cpp + solver/avgpool/backward_avgpool_2d.cpp + solver/avgpool/backward_avgpool_3d.cpp + solver/avgpool/forward_avgpool_2d.cpp + solver/avgpool/forward_avgpool_3d.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -482,6 +488,7 @@ if( 
MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} kernels/MIOpenAdam.cpp + kernels/MIOpenAvgPool.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl @@ -626,6 +633,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp adam.cpp addlayernorm.cpp + avgpool.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/avgpool.cpp b/src/avgpool.cpp new file mode 100644 index 0000000000..323f01c90e --- /dev/null +++ b/src/avgpool.cpp @@ -0,0 +1,136 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = + avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + + tmp.input = input; + tmp.output = output; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::BwdProblemDescription{ + outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + 
tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp new file mode 100644 index 0000000000..96ecb4bb72 --- /dev/null +++ b/src/avgpool/problem_description.cpp @@ -0,0 +1,95 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +namespace miopen { + +namespace avgpool { + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); + auto input_stride = inputDesc.GetStrides(); + auto output_stride = outputDesc.GetStrides(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_fwd"; + ss << "-input_dtype" << input_dtype; + ss << "-Is" << input_size; + ss << "-Os" << output_size; + ss << "-Si" << input_stride; + ss << "-So" << output_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + auto input_grad_size = inputGradDesc.GetLengths(); + auto output_grad_size = outputGradDesc.GetLengths(); + auto input_grad_stride = inputGradDesc.GetStrides(); + auto output_grad_stride = outputGradDesc.GetStrides(); + + auto input_dtype = inputGradDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_bwd"; + ss << "-input_dtype" << input_dtype; + ss << "-dIs" << input_grad_size; + ss << "-dOs" << output_grad_size; + ss << "-dSi" << input_grad_stride; + ss << "-dSo" << output_grad_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; + + return NetworkConfig{ss.str()}; +} + +} // namespace avgpool + +} // namespace miopen diff 
--git a/src/avgpool_api.cpp b/src/avgpool_api.cpp new file mode 100644 index 0000000000..32e1f12f92 --- /dev/null +++ b/src/avgpool_api.cpp @@ -0,0 +1,190 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include +#include +#include + +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + const bool count_include_pad, + const int32_t divisor_override, + const bool is_fwd) +{ + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "avgpoolfp16"; + } + else if(dtype == miopenFloat) + { + ss << "avgpoolfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "avgpoolbfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); + ss << " -Is " << miopen::deref(xDesc).GetLengths(); + ss << " -Os " << miopen::deref(oDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -Cp " << count_include_pad; + ss << " -Do " << divisor_override; + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} + +extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); + return miopen::try_([&] { + miopen::AvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + }); +} + +extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + outputGradDesc, + output_grad, + inputGradDesc, + input_grad, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, divisor_override, false); + return miopen::try_([&] { + miopen::AvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad), + KD, + KH, + KW, + 
SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); + }); +} diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp new file mode 100644 index 0000000000..00a2717ff6 --- /dev/null +++ b/src/include/miopen/avgpool.hpp @@ -0,0 +1,71 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_AVGPOOL_HPP_ +#define MIOPEN_AVGPOOL_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, + bool count_include_pad, + int32_t divisor_override); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, + bool count_include_pad, + int32_t divisor_override); +} // namespace miopen +#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp new file mode 100644 index 0000000000..e8bd9256ac --- /dev/null +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -0,0 +1,93 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include "miopen/common.hpp" +#include +#include + +namespace miopen { +namespace avgpool { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t ksize = nullptr; + + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + + BwdInvokeParams() = default; + + const TensorDescriptor* outputGradDesc = nullptr; + const TensorDescriptor* inputGradDesc = nullptr; + + ConstData_t output_grad = nullptr; + Data_t input_grad = nullptr; + ConstData_t ksize = nullptr; + + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace avgpool +} // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp new file mode 100644 index 0000000000..2dee6a30ea --- /dev/null +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -0,0 +1,133 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace avgpool { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) + : count_include_pad(count_include_pad_), divisor_override(divisor_override_) + { + if(divisor_override < 0) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); + } + } + +protected: + bool count_include_pad; + int32_t divisor_override; +}; + +struct FwdProblemDescription : ProblemDescription +{ + FwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) + : ProblemDescription(count_include_pad_, divisor_override_), + inputDesc(inputDesc_), + outputDesc(outputDesc_) + { + IsValidLength(); + } + + auto GetInputDesc() const { return inputDesc; } + auto GetOutputDesc() const { return outputDesc; } + auto GetNtotal() const { return outputDesc.GetElementSize(); } + + bool IsValidLength() const + { + auto input_dims = inputDesc.GetLengths().size(); + if(outputDesc.GetLengths()[0] != inputDesc.GetLengths()[0] || + outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || + outputDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input and output tensor sizes do not match."); + } + + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +protected: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; +}; + +struct BwdProblemDescription : ProblemDescription +{ + BwdProblemDescription(const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& inputGradDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) + : ProblemDescription(count_include_pad_, divisor_override_), + 
outputGradDesc(outputGradDesc_), + inputGradDesc(inputGradDesc_) + { + IsValidLength(); + } + + auto GetOutputGradDesc() const { return outputGradDesc; } + auto GetInputGradDesc() const { return inputGradDesc; } + auto GetNtotal() const { return inputGradDesc.GetElementSize(); } + + bool IsValidLength() const + { + auto input_dims = inputGradDesc.GetLengths().size(); + if(outputGradDesc.GetLengths()[0] != inputGradDesc.GetLengths()[0] || + outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || + outputGradDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad and output grad tensor sizes do not match."); + } + + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +protected: + TensorDescriptor outputGradDesc; + TensorDescriptor inputGradDesc; +}; + +} // namespace avgpool + +} // namespace miopen diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp new file mode 100644 index 0000000000..5577b9fad6 --- /dev/null +++ b/src/include/miopen/avgpool/solvers.hpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include +#include +#include "miopen/kernel_build_params.hpp" +#include "miopen/kernel_info.hpp" + +namespace miopen { + +namespace solver { + +const auto make_hip_kernel = [](std::vector localsize, + std::vector gridsize, + std::string kernel_file, + std::string kernel_name, + KernelBuildParameters build_params) { + while(localsize.size() < 3) + localsize.push_back(1); + while(gridsize.size() < 3) + gridsize.push_back(1); + for(int i = 0; i < localsize.size(); ++i) + gridsize[i] = AlignUp(gridsize[i], localsize[i]); + return KernelInfo{ + build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; +}; + +namespace avgpool { + +using AvgPoolForward = + NonTunableSolverBase; + +using AvgPoolBackward = + NonTunableSolverBase; + +// FORWARD +struct AvgPoolForward2d final : AvgPoolForward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; +}; + +struct AvgPoolForward3d final : AvgPoolForward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool 
IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; +}; + +// BACKWARD +struct AvgPoolBackward2d final : AvgPoolBackward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; +}; + +struct AvgPoolBackward3d final : AvgPoolBackward +{ + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + bool IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; + + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; +}; + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp index 81c15f6bea..194afd79ac 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -59,7 +59,8 @@ enum class Primitive Mha, Softmax, Adam, - Item + Item, + AvgPool }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..050d431844 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -29,6 +29,7 @@ #include #include "../../kernels/tensor_view.hpp" +#include "miopen/tensor.hpp" namespace miopen { diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp new file mode 100644 index 0000000000..76355d5729 --- /dev/null +++ b/src/kernels/MIOpenAvgPool.cpp @@ -0,0 +1,574 @@ 
+/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef INPUT_TYPE +#define INPUT_TYPE float +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE float +#endif + +template +__device__ void avgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + FLOAT_ACCUM m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, 
oh, ow))] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + avgPoolForward2d(input, + output, + N, + C, + H, + W, + OH, + OW, + R, + S, + sh, + sw, + ph, + pw, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + FLOAT_ACCUM sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = 
min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + avgPoolForward3d(input, + output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t r = 
0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + avgPoolBackward2d(output_grad, + input_grad, + N, + C, + H, + W, + OH, + OW, + R, + S, + sh, + sw, + ph, + pw, + count_include_pad, + divisor_override, + output_grad_tv, + input_grad_tv); +} + +template +__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + 
int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* 
__restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + avgPoolBackward3d(output_grad, + input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, + count_include_pad, + divisor_override, + output_grad_tv, + input_grad_tv); +} diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..d64dbf21f9 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,6 +72,46 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + { + static_assert(N == 5); + layout[0] = n; + layout[1] = c; + layout[2] = d; + layout[3] = h; + layout[4] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) + { + static_assert(N == 4); + layout[0] = n; + layout[1] = c; + layout[2] = h; + layout[3] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) + { + static_assert(N == 3); + layout[0] = n; + layout[1] = h; + layout[2] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t w) + { + static_assert(N == 2); + layout[0] = n; + layout[1] = w; + } + + constexpr tensor_layout_t(uint64_t n) + { + static_assert(N == 1); + layout[0] = n; + } + uint64_t layout[N]; }; diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp new file mode 100644 index 0000000000..73adabb8e7 --- /dev/null +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -0,0 +1,158 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + return false; + } + else if(dtype == miopenHalf) + { + if(in_over_out < 2 && in_nelems >= 11075584) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + { + return true; + } + } + return false; +} + +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + if(problem.GetInputGradDesc().GetNumDims() != 4 || + problem.GetOutputGradDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution +AvgPoolBackward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ 
+ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto H = deref(params.inputGradDesc).GetLengths()[2]; + auto W = deref(params.inputGradDesc).GetLengths()[3]; + auto OH = deref(params.outputGradDesc).GetLengths()[2]; + auto OW = deref(params.outputGradDesc).GetLengths()[3]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + H, + W, + OH, + OW, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp new file mode 100644 index 0000000000..4815803ad3 --- /dev/null +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -0,0 +1,171 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_3D 256 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto N = problem.GetOutputGradDesc().GetLengths()[0]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 8 && N >= 6)) + { + return true; + } + return false; + } + else if(dtype == miopenHalf) + { + if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || + (in_over_out <= 8 && in_nelems >= 944111616)) + { + return true; + } + } + return false; +} + +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + if(problem.GetInputGradDesc().GetNumDims() != 5 || + problem.GetOutputGradDesc().GetNumDims() != 5) + { + return false; + } + if(!IsOverRocmBwd3d(problem)) + { + return false; + } + return true; +} + +ConvSolution +AvgPoolBackward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = 
miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto D = deref(params.inputGradDesc).GetLengths()[2]; + auto H = deref(params.inputGradDesc).GetLengths()[3]; + auto W = deref(params.inputGradDesc).GetLengths()[4]; + auto OD = deref(params.outputGradDesc).GetLengths()[2]; + auto OH = deref(params.outputGradDesc).GetLengths()[3]; + auto OW = deref(params.outputGradDesc).GetLengths()[4]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; 
+} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp new file mode 100644 index 0000000000..1c51feb54b --- /dev/null +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -0,0 +1,160 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_2D 256 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) + { + return true; + } + } + return false; +} + +bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + { + return false; + } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } + return true; +} + +ConvSolution +AvgPoolForward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = 
problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + + size_t N = deref(params.inputDesc).GetLengths()[0]; + size_t C = deref(params.inputDesc).GetLengths()[1]; + size_t H = deref(params.inputDesc).GetLengths()[2]; + size_t W = deref(params.inputDesc).GetLengths()[3]; + size_t OH = deref(params.outputDesc).GetLengths()[2]; + size_t OW = deref(params.outputDesc).GetLengths()[3]; + + kernel(params.input, + params.output, + N, + C, + H, + W, + OH, + OW, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp new file mode 100644 index 0000000000..6f70a07419 --- /dev/null +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -0,0 +1,171 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright 
(c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
+ *******************************************************************************/
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include "miopen/invoke_params.hpp"
+#include "miopen/tensor_view_utils.hpp"
+#include
+
+#include
+#include
+#include
+#include
+
+#define LOCAL_SIZE_FWD_3D 256
+
+namespace miopen {
+
+namespace solver {
+
+namespace avgpool {
+
+bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem)
+{
+    auto dtype       = problem.GetOutputDesc().GetType();
+    auto in_nelems   = problem.GetInputDesc().GetElementSize();
+    auto out_nelems  = problem.GetOutputDesc().GetElementSize();
+    auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1];
+    auto N           = problem.GetOutputDesc().GetLengths()[0];
+    auto in_over_out = static_cast<float>(in_nelems) / out_nelems;
+
+    // NOTE(review): thresholds below appear empirically tuned per dtype — presumably
+    // selecting this solver over the ROCm pooling path; confirm against benchmark data.
+    if(dtype == miopenFloat)
+    {
+        if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4))
+        {
+            return true;
+        }
+    }
+    else if(dtype == miopenHalf)
+    {
+        if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192))
+        {
+            return true;
+        }
+    }
+    else if(dtype == miopenBFloat16)
+    {
+        if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000)
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool AvgPoolForward3d::IsApplicable(const ExecutionContext&,
+                                    const miopen::avgpool::FwdProblemDescription& problem) const
+{
+    if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5)
+    {
+        return false;
+    }
+    if(!IsOverRocmFwd3d(problem))
+    {
+        return false;
+    }
+    return true;
+}
+
+ConvSolution
+AvgPoolForward3d::GetSolution(const ExecutionContext& context,
+                              const miopen::avgpool::FwdProblemDescription& problem) const
+{
+    std::ignore = context;
+
+    auto result =
ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? "ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto D = deref(params.inputDesc).GetLengths()[2]; + auto H = deref(params.inputDesc).GetLengths()[3]; + auto W = deref(params.inputDesc).GetLengths()[4]; + auto OD = deref(params.outputDesc).GetLengths()[2]; + auto OH = deref(params.outputDesc).GetLengths()[3]; + auto OW = deref(params.outputDesc).GetLengths()[4]; + + kernel(params.input, + params.output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace 
avgpool + +} // namespace solver + +} // namespace miopen diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp new file mode 100644 index 0000000000..5b91033633 --- /dev/null +++ b/test/cpu_avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_AVGPOOL_HPP +#define GUARD_CPU_AVGPOOL_HPP + +#include "tensor_holder.hpp" +#include + +template +void cpu_avgpool_forward_2d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + float val = m / 
divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_forward_3d(tensor input, + tensor& output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = 
max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_backward_2d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = ksize[0]; + int32_t S = ksize[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + 
+ int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } +} + +template +void cpu_avgpool_backward_3d(tensor output_grad, + tensor& input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + tensor ksize, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + 
int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } +} + +#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp new file mode 100644 index 0000000000..3ab32be510 --- /dev/null +++ b/test/gtest/avgpool.cpp @@ -0,0 +1,175 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "avgpool.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace avgpool { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd +{ +}; + +} // namespace avgpool +using namespace avgpool; + +// FORWARD TEST +TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + 
+INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); + +// BACKWARD TEST +TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp new file mode 100644 index 0000000000..94898d32b6 --- /dev/null +++ b/test/gtest/avgpool.hpp @@ -0,0 +1,451 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "../driver/tensor_driver.hpp" +#include "cpu_avgpool.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct AvgPoolTestCase +{ + std::vector input_dims; + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + + friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) + { + return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size + << " stride:" << tc.stride << " padding:" << tc.padding + << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad + << " divisor_override:" << tc.divisor_override; + } + + std::vector GetInput() const { return input_dims; } +}; + +inline std::vector AvgPoolTestConfigsFwdFp32() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdFp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdBfp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp32() +{ + return { + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp16() +{ + 
return { + {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdBfp16() +{ + return { + {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +// FORWARD TEST +template +struct AvgPoolTestFwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_dim = avgpool_config.GetInput(); + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + ksize = tensor{in_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + auto gen_input_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + input = tensor{in_dim}.generate(gen_input_value); + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = {N, C, OH, OW}; + } + + output = tensor{out_dim}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dim}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + auto dims = input.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_forward_2d(input, + ref_output, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_forward_3d(input, + ref_output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + 
padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? padding[2] : padding[1], + count_include_pad, + divisor_override); + fflush(stdout); + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(ref_output, output); + + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor input; + tensor output; + tensor ref_output; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; + +// BACKWARD TEST +template +struct AvgPoolTestBwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_grad_dim = avgpool_config.GetInput(); + N = in_grad_dim[0]; + C = in_grad_dim[1]; + D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; + H = in_grad_dim.size() == 5 ? in_grad_dim[3] : in_grad_dim[2]; + W = in_grad_dim.size() == 5 ? 
in_grad_dim[4] : in_grad_dim[3]; + ksize = tensor{in_grad_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_grad_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_grad_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + std::vector out_grad_dim; + if(in_grad_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_grad_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_grad_dim = {N, C, OH, OW}; + } + auto gen_output_grad_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); + + input_grad = tensor{in_grad_dim}; + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dim}; + std::fill( + ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); + + output_grad_dev = handle.Write(output_grad.data); + input_grad_dev = handle.Write(input_grad.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + auto dims = input_grad.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_backward_2d(output_grad, + ref_input_grad, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_backward_3d(output_grad, + ref_input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolBackward(handle, + output_grad.desc, + output_grad_dev.get(), + input_grad.desc, + input_grad_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? 
padding[2] : padding[1], + count_include_pad, + divisor_override); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_input_grad, input_grad); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor output_grad; + tensor input_grad; + tensor ref_input_grad; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr output_grad_dev; + miopen::Allocator::ManageDataPtr input_grad_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +};