From d197108888a5a37ed7d866f2cf1a25eb9611be8e Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 5 Aug 2024 18:11:48 +0700 Subject: [PATCH 01/29] init --- docs/reference/index.rst | 1 + include/miopen/miopen.h | 77 ++++++++++++++++++++++++++ src/avgpool.cpp | 0 src/avgpool_api.cpp | 113 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 src/avgpool.cpp create mode 100644 src/avgpool_api.cpp diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 90e29ffaa9..9594e00ef0 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,3 +35,4 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) + * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 3b9bbeccc1..fda8817e3a 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,6 +70,7 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation + * @defgroup avgpool * */ @@ -7621,6 +7622,82 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// avgpool APIs +/** @addtogroup avgpool + * + * @{ + */ + +/*! 
@brief Execute an avgpool forward layer + * + * @param handle MIOpen handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param outputDesc Tensor descriptor for output tensor (input) + * @param output Data tensor output (output) + * @param strideDesc Tensor descriptor for stride tensor (input) + * @param stride Data tensor stride (output) + * @param paddingDesc Tensor descriptor for padding tensor (input) + * @param padding Data tensor padding (output) + * @param kinforDesc Tensor descriptor for kinfor tensor (input) + * @param kinfor Data tensor kinfor (output) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override); + +/*! 
@brief Execute an avgpool backward layer + * + * @param handle MIOpen handle (input) + * @param outputGradDesc Tensor descriptor for output grad tensor (input) + * @param output_grad Data tensor output grad (input) + * @param inputGradDesc Tensor descriptor for input grad tensor (input) + * @param input_grad Data tensor input grad (output) + * @param strideDesc Tensor descriptor for stride tensor (input) + * @param stride Data tensor stride (output) + * @param paddingDesc Tensor descriptor for padding tensor (input) + * @param padding Data tensor padding (output) + * @param kinforDesc Tensor descriptor for kinfor tensor (input) + * @param kinfor Data tensor kinfor (output) + * @param count_include_pad When True, will include the zero-padding in the averaging + * calculation (input) + * @param divisor_override If non-zero, will use this value as the divisor, otherwise will + * use the number of elements in the pooling window (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override); +/** @} */ +// CLOSEOUT avgpool DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef __cplusplus } #endif diff --git a/src/avgpool.cpp b/src/avgpool.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp new file mode 100644 index 0000000000..643d494cee --- /dev/null +++ b/src/avgpool_api.cpp @@ -0,0 +1,113 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + bool log_target) +{ + const auto problem = avgpool::UnreducedProblemDescription{ + inputDesc, targetDesc, outputGradDesc, log_target, false}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.targetGradDesc = &targetGradDesc; + + tmp.input = input; + tmp.target = target; + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.target_grad = target_grad; + + tmp.log_target = log_target; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& windowInforDesc, + ConstData_t window_infor, + bool log_target) +{ + const auto problem = avgpool::ReducedProblemDescription{ + inputDesc, targetDesc, outputGradDesc, divisor, log_target, false}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.targetDesc = &targetDesc; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.targetGradDesc = &targetGradDesc; + + tmp.input = input; + tmp.target = target; + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + 
tmp.target_grad = target_grad; + + tmp.divisor = divisor; + tmp.log_target = log_target; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen From 3c90908ea196e4051d85cd9fe916788d3cce71ac Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 5 Aug 2024 23:30:39 +0700 Subject: [PATCH 02/29] skeleton code --- src/CMakeLists.txt | 8 + src/avgpool.cpp | 134 +++++++++ src/avgpool/problem_description.cpp | 85 ++++++ src/avgpool_api.cpp | 217 +++++++++----- src/include/miopen/avgpool.hpp | 65 ++++ src/include/miopen/avgpool/invoke_params.hpp | 85 ++++++ .../miopen/avgpool/problem_description.hpp | 215 ++++++++++++++ src/include/miopen/avgpool/solvers.hpp | 281 ++++++++++++++++++ src/include/miopen/solver_id.hpp | 3 +- src/kernels/MIOpenAvgPool.cpp | 0 src/solver/avgpool/backward_avgpool_2d.cpp | 0 src/solver/avgpool/backward_avgpool_3d.cpp | 0 src/solver/avgpool/forward_avgpool_2d.cpp | 0 src/solver/avgpool/forward_avgpool_3d.cpp | 0 14 files changed, 1013 insertions(+), 80 deletions(-) create mode 100644 src/avgpool/problem_description.cpp create mode 100644 src/include/miopen/avgpool.hpp create mode 100644 src/include/miopen/avgpool/invoke_params.hpp create mode 100644 src/include/miopen/avgpool/problem_description.hpp create mode 100644 src/include/miopen/avgpool/solvers.hpp create mode 100644 src/kernels/MIOpenAvgPool.cpp create mode 100644 src/solver/avgpool/backward_avgpool_2d.cpp create mode 100644 src/solver/avgpool/backward_avgpool_3d.cpp create mode 100644 src/solver/avgpool/forward_avgpool_2d.cpp create mode 100644 src/solver/avgpool/forward_avgpool_3d.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 77acf3f7d3..ee36c92967 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,6 +89,8 @@ set( MIOpen_Source adam_api.cpp addlayernorm_api.cpp 
api/find2_0_commons.cpp + avgpool_api.cpp + avgpool/problem_description.cpp batch_norm.cpp batch_norm_api.cpp batchnorm/problem_description.cpp @@ -191,6 +193,10 @@ set( MIOpen_Source solver/activ/fwd_1.cpp solver/adam/adam.cpp solver/adam/transformers_adam_w.cpp + solver/avgpool/backward_avgpool_2d.cpp + solver/avgpool/backward_avgpool_3d.cpp + solver/avgpool/forward_avgpool_2d.cpp + solver/avgpool/forward_avgpool_3d.cpp solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp @@ -482,6 +488,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN ${GPU_BATCHED_TRANSPOSE_KERNEL_HIP} ${GPU_GENERAL_TENSOR_REORDER_KERNEL_HIP_SOURCE} kernels/MIOpenAdam.cpp + kernels/MIOpenAvgPool.cpp kernels/MIOpenCat.cpp kernels/MIOpenCheckNumerics.cpp kernels/MIOpenBatchNormActivBwdPerAct.cl @@ -626,6 +633,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN activ.cpp adam.cpp addlayernorm.cpp + avgpool.cpp cat.cpp groupnorm.cpp getitem.cpp diff --git a/src/avgpool.cpp b/src/avgpool.cpp index e69de29bb2..15bea1f9d8 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -0,0 +1,134 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include +#include +#include +#include +#include +#include +#include + +namespace miopen { + +miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::FwdProblemDescription{inputDesc, + outputDesc, + strideDesc, + paddingDesc, + kinforDesc, + count_include_pad, + divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; + tmp.strideDesc = &strideDesc; + tmp.paddingDesc = &paddingDesc; + tmp.kinforDesc = &kinforDesc; + + tmp.input = input; + tmp.output = output; + tmp.stride = stride; + tmp.padding = padding; + tmp.kinfor = kinfor; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolForward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& 
outputGradDesc, + ConstData_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + const auto problem = avgpool::BwdProblemDescription{outputGradDesc, + inputGradDesc, + strideDesc, + paddingDesc, + kinforDesc, + count_include_pad, + divisor_override}; + + const auto invoke_params = [&]() { + auto tmp = avgpool::BwdInvokeParams{}; + tmp.outputGradDesc = &outputGradDesc; + tmp.inputGradDesc = &inputGradDesc; + tmp.strideDesc = &strideDesc; + tmp.paddingDesc = &paddingDesc; + tmp.kinforDesc = &kinforDesc; + + tmp.output_grad = output_grad; + tmp.input_grad = input_grad; + tmp.stride = stride; + tmp.padding = padding; + tmp.kinfor = kinfor; + tmp.count_include_pad = count_include_pad; + tmp.divisor_override = divisor_override; + + return tmp; + }(); + const auto algo = AlgorithmName{"AvgPoolBackward"}; + const auto solvers = solver::SolverContainer{}; + + solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + + return miopenStatusSuccess; +} + +} // namespace miopen diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp new file mode 100644 index 0000000000..dd2144f429 --- /dev/null +++ b/src/avgpool/problem_description.cpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include +#include + +#include + +namespace miopen { + +namespace avgpool { + +NetworkConfig FwdProblemDescription::MakeNetworkConfig() const +{ + size_t numel = GetNtotal(); + size_t num_batches = inputDesc.GetLengths()[0]; + size_t num_classes = GetC(); + size_t num_dims = inputDesc.GetNumDims(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_unreduce"; + ss << "is_fwd" << is_fwd; + ss << "contiguous" << contiguous; + ss << "input_dtype" << input_dtype; + ss << "numel" << numel; + ss << "num_dims" << num_dims; + ss << "num_batches" << num_batches; + ss << "num_classes" << num_classes; + + return NetworkConfig{ss.str()}; +} + +NetworkConfig BwdProblemDescription::MakeNetworkConfig() const +{ + size_t numel = GetNtotal(); + size_t num_batches = inputDesc.GetLengths()[0]; + size_t num_classes = GetC(); + size_t num_dims = inputDesc.GetNumDims(); + + auto input_dtype = inputDesc.GetType(); + + std::ostringstream ss; + + ss << "avgpool_reduce"; + ss << "is_fwd" << is_fwd; + ss << "input_dtype" << input_dtype; + ss << "divisor" << divisor; + ss << "numel" << numel; + ss << "num_dims" << num_dims; + ss << "num_batches" << num_batches; + ss << "num_classes" << num_classes; + + return NetworkConfig{ss.str()}; +} + +} // namespace avgpool + +} // namespace miopen diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 643d494cee..4e62bd5e7b 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -23,91 +23,150 @@ * SOFTWARE. 
* *******************************************************************************/ -#include -#include -#include -#include -#include -#include -#include -namespace miopen { +#include +#include +#include +#include +#include -miopenStatus_t AvgPoolForward(Handle& handle, - const TensorDescriptor& inputDesc, - ConstData_t input, - const TensorDescriptor& outputDesc, - Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - bool log_target) +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) { - const auto problem = avgpool::UnreducedProblemDescription{ - inputDesc, targetDesc, outputGradDesc, log_target, false}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.targetDesc = &targetDesc; - tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - tmp.targetGradDesc = &targetGradDesc; - - tmp.input = input; - tmp.target = target; - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.target_grad = target_grad; - - tmp.log_target = log_target; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolForward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); - - return miopenStatusSuccess; + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; } -miopenStatus_t AvgPoolBackward(Handle& handle, - const TensorDescriptor& outputGradDesc, - ConstData_t output_grad, - const TensorDescriptor& inputGradDesc, - Data_t input_grad, - const TensorDescriptor& windowInforDesc, - ConstData_t window_infor, - bool log_target) +static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t oDesc, + const bool count_include_pad, + const int32_t divisor_override, + const bool is_fwd) { - const auto problem = avgpool::ReducedProblemDescription{ - inputDesc, targetDesc, 
outputGradDesc, divisor, log_target, false}; - - const auto invoke_params = [&]() { - auto tmp = avgpool::BwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.targetDesc = &targetDesc; - tmp.outputGradDesc = &outputGradDesc; - tmp.inputGradDesc = &inputGradDesc; - tmp.targetGradDesc = &targetGradDesc; - - tmp.input = input; - tmp.target = target; - tmp.output_grad = output_grad; - tmp.input_grad = input_grad; - tmp.target_grad = target_grad; - - tmp.divisor = divisor; - tmp.log_target = log_target; - - return tmp; - }(); - const auto algo = AlgorithmName{"AvgPoolBackward"}; - const auto solvers = solver::SolverContainer{}; - - solvers.ExecutePrimitive(handle, problem, algo, invoke_params); + if(miopen::IsLoggingCmd()) + { + std::stringstream ss; + auto dtype = miopen::deref(xDesc).GetType(); + if(dtype == miopenHalf) + { + ss << "avgpoolfp16"; + } + else if(dtype == miopenFloat) + { + ss << "avgpoolfp32"; + } + else if(dtype == miopenBFloat16) + { + ss << "avgpoolbfp16"; + } + + MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); + ss << " -Is " << miopen::deref(xDesc).GetLengths(); + ss << " -Os " << miopen::deref(oDesc).GetLengths(); + ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -Cp " << count_include_pad; + ss << " -Do " << divisor_override; + ss << " -F " << ((is_fwd) ? 
"1" : "2"); + + MIOPEN_LOG_DRIVER_CMD(ss.str()); + } +} - return miopenStatusSuccess; +extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, + const miopenTensorDescriptor_t inputDesc, + const void* input, + const miopenTensorDescriptor_t outputDesc, + void* output, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + inputDesc, + input, + outputDesc, + output, + strideDesc, + stride, + paddingDesc, + padding, + kinforDesc, + kinfor, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); + return miopen::try_([&] { + miopen::AvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + miopen::deref(strideDesc), + DataCast(stride), + miopen::deref(paddingDesc), + DataCast(padding), + miopen::deref(kinforDesc), + DataCast(kinfor), + count_include_pad, + divisor_override); + }); } -} // namespace miopen +extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t outputGradDesc, + const void* output_grad, + const miopenTensorDescriptor_t inputGradDesc, + void* input_grad, + const miopenTensorDescriptor_t strideDesc, + const void* stride, + const miopenTensorDescriptor_t paddingDesc, + const void* padding, + const miopenTensorDescriptor_t kinforDesc, + const void* kinfor, + const bool count_include_pad, + const int32_t divisor_override) +{ + MIOPEN_LOG_FUNCTION(handle, + outputGradDesc, + output_grad, + inputGradDesc, + input_grad, + strideDesc, + stride, + paddingDesc, + padding, + kinforDesc, + kinfor, + count_include_pad, + divisor_override); + + LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, 
divisor_override, false); + return miopen::try_([&] { + miopen::AvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad), + miopen::deref(strideDesc), + DataCast(stride), + miopen::deref(paddingDesc), + DataCast(padding), + miopen::deref(kinforDesc), + DataCast(kinfor), + count_include_pad, + divisor_override); + }); +} diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp new file mode 100644 index 0000000000..1a46b974b2 --- /dev/null +++ b/src/include/miopen/avgpool.hpp @@ -0,0 +1,65 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_AVGPOOL_HPP_ +#define MIOPEN_AVGPOOL_HPP_ + +#include + +namespace miopen { + +struct Handle; +struct TensorDescriptor; + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, + const TensorDescriptor& inputDesc, + ConstData_t input, + const TensorDescriptor& outputDesc, + Data_t output, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + bool count_include_pad, + int32_t divisor_override); + +MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, + const TensorDescriptor& outputGradDesc, + Data_t output_grad, + const TensorDescriptor& inputGradDesc, + Data_t input_grad, + const TensorDescriptor& strideDesc, + ConstData_t stride, + const TensorDescriptor& paddingDesc, + ConstData_t padding, + const TensorDescriptor& kinforDesc, + ConstData_t kinfor, + bool count_include_pad, + int32_t divisor_override); +} // namespace miopen +#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp new file mode 100644 index 0000000000..de2e87ea1b --- /dev/null +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -0,0 +1,85 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include "miopen/common.hpp" +#include +#include + +namespace miopen { +namespace avgpool { + +struct FwdInvokeParams : public miopen::InvokeParams +{ + + FwdInvokeParams() = default; + + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + const TensorDescriptor* strideDesc = nullptr; + const TensorDescriptor* paddingDesc = nullptr; + const TensorDescriptor* kinfor = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t stride = nullptr; + ConstData_t padding = nullptr; + ConstData_t kinfo = nullptr; + + const bool count_include_pad = false; + const int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +struct BwdInvokeParams : public miopen::InvokeParams +{ + + BwdInvokeParams() = default; + + const TensorDescriptor* outputGradDesc = nullptr; + const TensorDescriptor* inputGradDesc = nullptr; + const TensorDescriptor* strideDesc = nullptr; + const TensorDescriptor* paddingDesc = nullptr; + const TensorDescriptor* kinfor = nullptr; + + ConstData_t output_grad = nullptr; + Data_t input_grad = nullptr; + ConstData_t stride = nullptr; + ConstData_t padding = nullptr; + ConstData_t kinfo = nullptr; + + const bool count_include_pad = false; + const int32_t divisor_override = 0; + + std::size_t GetWorkspaceSize() const { return 0; } + Data_t GetWorkspace() const { return nullptr; } +}; + +} // namespace avgpool +} // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp new file mode 100644 index 0000000000..2b3ec555db --- /dev/null +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -0,0 +1,215 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 
Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace miopen { + +struct NetworkConfig; + +namespace avgpool { + +struct ProblemDescription : ProblemDescriptionBase +{ + ProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + bool is_fwd_) + : inputDesc(inputDesc_), + targetDesc(targetDesc_), + weightDesc(weightDesc_), + outputDesc(outputDesc_), + ignore_index(ignore_index_), + is_fwd(is_fwd_) + { + } + + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetTargetDesc() const { return targetDesc; } + const TensorDescriptor& GetWeightDesc() const { return weightDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } + int32_t GetIgnoreIndex() const { return ignore_index; } + + bool IsValidLength() const + { + if(targetDesc.GetLengths()[0] != inputDesc.GetLengths()[0]) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + + for(int32_t i = 1; i < targetDesc.GetNumDims(); ++i) + { + if(targetDesc.GetLengths()[i] != inputDesc.GetLengths()[i + 1]) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + } + } + if(weightDesc.GetLengths()[0] != inputDesc.GetLengths()[1]) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); + } + if(inputDesc.GetLengths().size() > 5) + { + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Do not support Input Tensor dims > 5."); + } + return true; + } + + bool IsValidStride() const + { + auto isRightStride = [](TensorDescriptor td) { + auto strides = td.GetStrides(); + auto lengths = td.GetLengths(); + std::vector> p; + p.reserve(td.GetNumDims()); + std::transform(strides.begin(), + strides.end(), + lengths.begin(), + std::back_inserter(p), + [](size_t 
a, size_t b) { return std::make_pair(a, b); }); + std::sort(p.begin(), p.end()); + for(int i = 1; i < p.size(); ++i) + { + if(p[i].first != p[i - 1].first * p[i - 1].second) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor strides do not valid."); + } + return true; + }; + return isRightStride(inputDesc) && isRightStride(targetDesc) && isRightStride(outputDesc) && + isRightStride(weightDesc); + } + + bool IsSameType() const + { + if(inputDesc.GetType() != weightDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "NLLLoss: Input and Weight tensors types do not match."); + } + return true; + } + + bool IsAllContiguous() const + { + auto isContiguous = [](TensorDescriptor td) { + size_t s = 1; + for(int i = td.GetNumDims() - 1; i >= 0; --i) + { + if(s != td.GetStrides()[i]) + { + return false; + } + s *= td.GetLengths()[i]; + } + return true; + }; + return isContiguous(inputDesc) && isContiguous(targetDesc) && isContiguous(weightDesc) && + isContiguous(outputDesc); + } + +protected: + TensorDescriptor inputDesc; + TensorDescriptor targetDesc; + TensorDescriptor weightDesc; + TensorDescriptor outputDesc; + + int32_t ignore_index; + bool is_fwd; + + NetworkConfig MakeForwardNetworkConfig() const; +}; + +struct UnreduceProblemDescription : ProblemDescription +{ + UnreduceProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + bool is_fwd_) + : ProblemDescription( + inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + { + IsSameType(); + IsValidLength(); + IsValidStride(); + } + + size_t GetNtotal() const { return outputDesc.GetElementSize(); } + size_t GetC() const { return weightDesc.GetElementSize(); } + + NetworkConfig MakeNetworkConfig() const override; + +private: + NetworkConfig MakeForwardNetworkConfig() const; +}; + +struct ReduceProblemDescription : ProblemDescription +{ + 
ReduceProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& targetDesc_, + const TensorDescriptor& weightDesc_, + const TensorDescriptor& outputDesc_, + int32_t ignore_index_, + float divisor_, + bool is_fwd_) + : ProblemDescription( + inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + { + divisor = divisor_; + IsSameType(); + IsValidLength(); + IsValidStride(); + } + + size_t GetNtotal() const { return targetDesc.GetElementSize(); } + size_t GetC() const { return weightDesc.GetElementSize(); } + + bool IsValidLength() const + { + if(outputDesc.GetNumDims() != 1 || outputDesc.GetLengths()[0] != 1) + MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Output Tensor size must be (1)."); + if(!ProblemDescription::IsValidLength()) + return false; + return true; + } + + NetworkConfig MakeNetworkConfig() const override; + +private: + float divisor; + NetworkConfig MakeForwardNetworkConfig() const; +}; + +} // namespace avgpool + +} // namespace miopen diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp new file mode 100644 index 0000000000..34adc12b4c --- /dev/null +++ b/src/include/miopen/avgpool/solvers.hpp @@ -0,0 +1,281 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#pragma once
+
+#include "miopen/conv_solution.hpp"
+#include "miopen/execution_context.hpp"
+#include <miopen/avgpool/problem_description.hpp>
+#include <miopen/solver.hpp>
+#include "miopen/kernel_build_params.hpp"
+#include "miopen/kernel_info.hpp"
+
+#include <utility>
+
+namespace miopen {
+
+namespace solver {
+
+const auto make_hip_kernel = [](std::vector<size_t> localsize,
+                                std::vector<size_t> gridsize,
+                                std::string kernel_file,
+                                std::string kernel_name,
+                                KernelBuildParameters build_params) {
+    while(localsize.size() < 3)
+        localsize.push_back(1);
+    while(gridsize.size() < 3)
+        gridsize.push_back(1);
+    for(int i = 0; i < localsize.size(); ++i)
+        gridsize[i] = AlignUp(gridsize[i], localsize[i]);
+    return KernelInfo{
+        build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name};
+};
+
+namespace avgpool {
+
+using NLLLossUnreduce =
+    NonTunableSolverBase<ExecutionContext, miopen::avgpool::UnreduceProblemDescription>;
+
+using NLLLossReduce =
+    NonTunableSolverBase<ExecutionContext, miopen::avgpool::ReduceProblemDescription>;
+
+struct NLLLossUnreduceSolver : NLLLossUnreduce
+{
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossReduceSolver : NLLLossReduce
+{
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::ReduceProblemDescription& problem) const override;
+};
+
+// FORWARD UNREDUCE
+struct NLLLossUnreduceForwardContiguous4d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceForwardContiguous4d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossUnreduceForwardContiguous2d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceForwardContiguous2d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossUnreduceForward4d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceForward4d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossUnreduceForward2d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceForward2d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossUnreduceForward5d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceForward5d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const
 override;
+};
+
+// FORWARD REDUCE
+struct NLLLossReduceForward5d final : NLLLossReduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossReduceForward5d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::ReduceProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::ReduceProblemDescription& problem) const override;
+    std::size_t
+    GetWorkspaceSize(const ExecutionContext& context,
+                     const miopen::avgpool::ReduceProblemDescription& problem) const override;
+    bool MayNeedWorkspace() const override { return true; }
+};
+
+// BACKWARD UNREDUCE
+struct NLLLossUnreduceBackwardContiguous2d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceBackwardContiguous2d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossUnreduceBackwardContiguous4d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceBackwardContiguous4d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossUnreduceBackward4d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceBackward4d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct
 NLLLossUnreduceBackward2d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceBackward2d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+struct NLLLossUnreduceBackward5d final : NLLLossUnreduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossUnreduceBackward5d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::UnreduceProblemDescription& problem) const override;
+};
+
+// BACKWARD REDUCE
+struct NLLLossReduceBackward2d final : NLLLossReduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossReduceBackward2d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::ReduceProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::ReduceProblemDescription& problem) const override;
+};
+
+struct NLLLossReduceBackward5d final : NLLLossReduceSolver
+{
+    const std::string& SolverDbId() const override
+    {
+        return GetSolverDbId<NLLLossReduceBackward5d>();
+    }
+
+    bool IsApplicable(const ExecutionContext& context,
+                      const miopen::avgpool::ReduceProblemDescription& problem) const override;
+    ConvSolution
+    GetSolution(const ExecutionContext& context,
+                const miopen::avgpool::ReduceProblemDescription& problem) const override;
+};
+
+} // namespace avgpool
+
+} // namespace solver
+
+} // namespace miopen
diff --git a/src/include/miopen/solver_id.hpp b/src/include/miopen/solver_id.hpp
index 81c15f6bea..194afd79ac 100644
--- a/src/include/miopen/solver_id.hpp
+++ b/src/include/miopen/solver_id.hpp
@@ -59,7 +59,8
@@ enum class Primitive Mha, Softmax, Adam, - Item + Item, + AvgPool }; struct MIOPEN_INTERNALS_EXPORT Id diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp new file mode 100644 index 0000000000..e69de29bb2 From 86a50733653b8cce2fcfccb4d79869385e149181 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 6 Aug 2024 19:12:43 +0700 Subject: [PATCH 03/29] add solver, kernel --- src/avgpool/problem_description.cpp | 66 +++--- src/include/miopen/avgpool.hpp | 2 +- src/include/miopen/avgpool/invoke_params.hpp | 16 +- .../miopen/avgpool/problem_description.hpp | 218 +++++++---------- src/include/miopen/avgpool/solvers.hpp | 220 +++--------------- src/include/miopen/tensor_view_utils.hpp | 1 + src/kernels/tensor_view.hpp | 40 ++++ src/solver/avgpool/backward_avgpool_2d.cpp | 116 +++++++++ src/solver/avgpool/backward_avgpool_3d.cpp | 120 ++++++++++ src/solver/avgpool/forward_avgpool_2d.cpp | 116 +++++++++ src/solver/avgpool/forward_avgpool_3d.cpp | 120 ++++++++++ 11 files changed, 668 insertions(+), 367 deletions(-) diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp index dd2144f429..96ecb4bb72 100644 --- a/src/avgpool/problem_description.cpp +++ b/src/avgpool/problem_description.cpp @@ -24,58 +24,68 @@ * *******************************************************************************/ -#include #include #include -#include - 
namespace miopen { namespace avgpool { +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { - size_t numel = GetNtotal(); - size_t num_batches = inputDesc.GetLengths()[0]; - size_t num_classes = GetC(); - size_t num_dims = inputDesc.GetNumDims(); + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); + auto input_stride = inputDesc.GetStrides(); + auto output_stride = outputDesc.GetStrides(); auto input_dtype = inputDesc.GetType(); std::ostringstream ss; - ss << "avgpool_unreduce"; - ss << "is_fwd" << is_fwd; - ss << "contiguous" << contiguous; - ss << "input_dtype" << input_dtype; - ss << "numel" << numel; - ss << "num_dims" << num_dims; - ss << "num_batches" << num_batches; - ss << "num_classes" << num_classes; + ss << "avgpool_fwd"; + ss << "-input_dtype" << input_dtype; + ss << "-Is" << input_size; + ss << "-Os" << output_size; + ss << "-Si" << input_stride; + ss << "-So" << output_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } NetworkConfig BwdProblemDescription::MakeNetworkConfig() const { - size_t numel = GetNtotal(); - size_t num_batches = inputDesc.GetLengths()[0]; - size_t num_classes = GetC(); - size_t num_dims = inputDesc.GetNumDims(); + auto input_grad_size = inputGradDesc.GetLengths(); + auto output_grad_size = outputGradDesc.GetLengths(); + auto input_grad_stride = inputGradDesc.GetStrides(); + auto output_grad_stride = outputGradDesc.GetStrides(); - auto input_dtype = inputDesc.GetType(); + auto input_dtype = inputGradDesc.GetType(); std::ostringstream ss; - ss << "avgpool_reduce"; - ss << "is_fwd" << is_fwd; - ss << "input_dtype" << input_dtype; - ss << "divisor" << divisor; - ss << "numel" << numel; - ss << "num_dims" << num_dims; - 
ss << "num_batches" << num_batches; - ss << "num_classes" << num_classes; + ss << "avgpool_bwd"; + ss << "-input_dtype" << input_dtype; + ss << "-dIs" << input_grad_size; + ss << "-dOs" << output_grad_size; + ss << "-dSi" << input_grad_stride; + ss << "-dSo" << output_grad_stride; + ss << "-Cp " << count_include_pad; + ss << "-Do " << divisor_override; return NetworkConfig{ss.str()}; } diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 1a46b974b2..617ed56782 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -50,7 +50,7 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, const TensorDescriptor& outputGradDesc, - Data_t output_grad, + ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, const TensorDescriptor& strideDesc, diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index de2e87ea1b..b57f8e0edc 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -42,16 +42,16 @@ struct FwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinfor = nullptr; + const TensorDescriptor* kinforDesc = nullptr; ConstData_t input = nullptr; Data_t output = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfo = nullptr; + ConstData_t kinfor = nullptr; - const bool count_include_pad = false; - const int32_t divisor_override = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } @@ -66,16 +66,16 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* inputGradDesc = 
nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinfor = nullptr; + const TensorDescriptor* kinforDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfo = nullptr; + ConstData_t kinfor = nullptr; - const bool count_include_pad = false; - const int32_t divisor_override = 0; + bool count_include_pad = false; + int32_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 2b3ec555db..9400bd67a0 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -29,8 +29,6 @@ #include #include #include -#include -#include namespace miopen { @@ -40,174 +38,122 @@ namespace avgpool { struct ProblemDescription : ProblemDescriptionBase { - ProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - bool is_fwd_) - : inputDesc(inputDesc_), - targetDesc(targetDesc_), - weightDesc(weightDesc_), - outputDesc(outputDesc_), - ignore_index(ignore_index_), - is_fwd(is_fwd_) + ProblemDescription(const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) + : strideDesc(strideDesc_), + paddingDesc(paddingDesc_), + kinforDesc(kinforDesc_), + count_include_pad(count_include_pad_), + divisor_override(divisor_override_) { - } - - const TensorDescriptor& GetInputDesc() const { return inputDesc; } - const TensorDescriptor& GetTargetDesc() const { return targetDesc; } - const TensorDescriptor& 
GetWeightDesc() const { return weightDesc; } - const TensorDescriptor& GetOutputDesc() const { return outputDesc; } - int32_t GetIgnoreIndex() const { return ignore_index; } - - bool IsValidLength() const - { - if(targetDesc.GetLengths()[0] != inputDesc.GetLengths()[0]) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - - for(int32_t i = 1; i < targetDesc.GetNumDims(); ++i) + if(divisor_override < 0) { - if(targetDesc.GetLengths()[i] != inputDesc.GetLengths()[i + 1]) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - } + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); } - if(weightDesc.GetLengths()[0] != inputDesc.GetLengths()[1]) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor sizes do not match."); - } - if(inputDesc.GetLengths().size() > 5) - { - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Do not support Input Tensor dims > 5."); - } - return true; - } - - bool IsValidStride() const - { - auto isRightStride = [](TensorDescriptor td) { - auto strides = td.GetStrides(); - auto lengths = td.GetLengths(); - std::vector> p; - p.reserve(td.GetNumDims()); - std::transform(strides.begin(), - strides.end(), - lengths.begin(), - std::back_inserter(p), - [](size_t a, size_t b) { return std::make_pair(a, b); }); - std::sort(p.begin(), p.end()); - for(int i = 1; i < p.size(); ++i) - { - if(p[i].first != p[i - 1].first * p[i - 1].second) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Tensor strides do not valid."); - } - return true; - }; - return isRightStride(inputDesc) && isRightStride(targetDesc) && isRightStride(outputDesc) && - isRightStride(weightDesc); - } - - bool IsSameType() const - { - if(inputDesc.GetType() != weightDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm, - "NLLLoss: Input and Weight tensors types do not match."); - } - return true; - } - - bool IsAllContiguous() const - { - auto isContiguous = [](TensorDescriptor td) { - size_t s = 1; - for(int 
i = td.GetNumDims() - 1; i >= 0; --i) - { - if(s != td.GetStrides()[i]) - { - return false; - } - s *= td.GetLengths()[i]; - } - return true; - }; - return isContiguous(inputDesc) && isContiguous(targetDesc) && isContiguous(weightDesc) && - isContiguous(outputDesc); } protected: - TensorDescriptor inputDesc; - TensorDescriptor targetDesc; - TensorDescriptor weightDesc; - TensorDescriptor outputDesc; - - int32_t ignore_index; - bool is_fwd; + TensorDescriptor strideDesc; + TensorDescriptor paddingDesc; + TensorDescriptor kinforDesc; - NetworkConfig MakeForwardNetworkConfig() const; + bool count_include_pad; + int32_t divisor_override; }; -struct UnreduceProblemDescription : ProblemDescription +struct FwdProblemDescription : ProblemDescription { - UnreduceProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - bool is_fwd_) + FwdProblemDescription(const TensorDescriptor& inputDesc_, + const TensorDescriptor& outputDesc_, + const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) : ProblemDescription( - inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + inputDesc(inputDesc_), + outputDesc(outputDesc_) { - IsSameType(); IsValidLength(); - IsValidStride(); } - size_t GetNtotal() const { return outputDesc.GetElementSize(); } - size_t GetC() const { return weightDesc.GetElementSize(); } + auto GetInputDesc() const { return inputDesc; } + auto GetOutputDesc() const { return outputDesc; } + auto GetNtotal() const { return outputDesc.GetElementSize(); } + + bool IsValidLength() const + { + auto input_dims = inputDesc.GetLengths().size(); + if(outputDesc.GetLengths()[0] != inputDesc.GetLengths()[0] || + 
outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || + outputDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + if(input_dims != strideDesc.GetElementSize() || + input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + + return true; + } NetworkConfig MakeNetworkConfig() const override; -private: - NetworkConfig MakeForwardNetworkConfig() const; +protected: + TensorDescriptor inputDesc; + TensorDescriptor outputDesc; }; -struct ReduceProblemDescription : ProblemDescription +struct BwdProblemDescription : ProblemDescription { - ReduceProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& targetDesc_, - const TensorDescriptor& weightDesc_, - const TensorDescriptor& outputDesc_, - int32_t ignore_index_, - float divisor_, - bool is_fwd_) + BwdProblemDescription(const TensorDescriptor& outputGradDesc_, + const TensorDescriptor& inputGradDesc_, + const TensorDescriptor& strideDesc_, + const TensorDescriptor& paddingDesc_, + const TensorDescriptor& kinforDesc_, + const bool count_include_pad_, + const int32_t divisor_override_) : ProblemDescription( - inputDesc_, targetDesc_, weightDesc_, outputDesc_, ignore_index_, is_fwd_) + strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + outputGradDesc(outputGradDesc_), + inputGradDesc(inputGradDesc_) { - divisor = divisor_; - IsSameType(); IsValidLength(); - IsValidStride(); } - size_t GetNtotal() const { return targetDesc.GetElementSize(); } - size_t GetC() const { return weightDesc.GetElementSize(); } + auto GetOutputGradDesc() const { return outputGradDesc; } + auto GetInputGradDesc() const { return inputGradDesc; } + auto GetNtotal() const { return inputGradDesc.GetElementSize(); } bool IsValidLength() const { - if(outputDesc.GetNumDims() != 1 || outputDesc.GetLengths()[0] != 
1) - MIOPEN_THROW(miopenStatusBadParm, "NLLLoss: Output Tensor size must be (1)."); - if(!ProblemDescription::IsValidLength()) - return false; + auto input_dims = inputGradDesc.GetLengths().size(); + if(outputGradDesc.GetLengths()[0] != inputGradDesc.GetLengths()[0] || + outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || + outputGradDesc.GetLengths().size() != input_dims) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + if(input_dims != strideDesc.GetElementSize() || + input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + } + return true; } NetworkConfig MakeNetworkConfig() const override; -private: - float divisor; - NetworkConfig MakeForwardNetworkConfig() const; +protected: + TensorDescriptor outputGradDesc; + TensorDescriptor inputGradDesc; }; } // namespace avgpool diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp index 34adc12b4c..5577b9fad6 100644 --- a/src/include/miopen/avgpool/solvers.hpp +++ b/src/include/miopen/avgpool/solvers.hpp @@ -33,8 +33,6 @@ #include "miopen/kernel_build_params.hpp" #include "miopen/kernel_info.hpp" -#include - namespace miopen { namespace solver { @@ -56,222 +54,56 @@ const auto make_hip_kernel = [](std::vector localsize, namespace avgpool { -using NLLLossUnreduce = - NonTunableSolverBase; - -using NLLLossReduce = - NonTunableSolverBase; - -struct NLLLossUnreduceSolver : NLLLossUnreduce -{ - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossReduceSolver : NLLLossReduce -{ - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; -}; - -// FORWARD UNREDUCE -struct NLLLossUnreduceForwardContiguous4d final : NLLLossUnreduceSolver -{ - const std::string& 
SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForwardContiguous2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward4d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -struct NLLLossUnreduceForward5d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - ConvSolution - GetSolution(const ExecutionContext& context, - const 
miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; - -// FORWARD REDUCE -struct NLLLossReduceForward5d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - std::size_t - GetWorkspaceSize(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - bool MayNeedWorkspace() const override { return true; } -}; - -// BACKWARD UNREDUCE -struct NLLLossUnreduceBackwardContiguous2d final : NLLLossUnreduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; +using AvgPoolForward = + NonTunableSolverBase; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; +using AvgPoolBackward = + NonTunableSolverBase; -struct NLLLossUnreduceBackwardContiguous4d final : NLLLossUnreduceSolver +// FORWARD +struct AvgPoolForward2d final : AvgPoolForward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::FwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const 
miopen::avgpool::FwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward4d final : NLLLossUnreduceSolver +struct AvgPoolForward3d final : AvgPoolForward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::FwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward2d final : NLLLossUnreduceSolver +// BACKWARD +struct AvgPoolBackward2d final : AvgPoolBackward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + const miopen::avgpool::BwdProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; }; -struct NLLLossUnreduceBackward5d final : NLLLossUnreduceSolver +struct AvgPoolBackward3d final : AvgPoolBackward { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; - - 
ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::UnreduceProblemDescription& problem) const override; -}; + const miopen::avgpool::BwdProblemDescription& problem) const override; -// BACKWARD REDUCE -struct NLLLossReduceBackward2d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; -}; - -struct NLLLossReduceBackward5d final : NLLLossReduceSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - bool IsApplicable(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; - ConvSolution - GetSolution(const ExecutionContext& context, - const miopen::avgpool::ReduceProblemDescription& problem) const override; + ConvSolution GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const override; }; } // namespace avgpool diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index 9f7430ba8a..050d431844 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -29,6 +29,7 @@ #include #include "../../kernels/tensor_view.hpp" +#include "miopen/tensor.hpp" namespace miopen { diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d35bfd93fc..d64dbf21f9 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -72,6 +72,46 @@ struct tensor_layout_t } } + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) + { + static_assert(N == 5); + layout[0] = n; + layout[1] = c; + layout[2] = d; + layout[3] = h; + layout[4] = w; + 
} + + constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) + { + static_assert(N == 4); + layout[0] = n; + layout[1] = c; + layout[2] = h; + layout[3] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) + { + static_assert(N == 3); + layout[0] = n; + layout[1] = h; + layout[2] = w; + } + + constexpr tensor_layout_t(uint64_t n, uint64_t w) + { + static_assert(N == 2); + layout[0] = n; + layout[1] = w; + } + + constexpr tensor_layout_t(uint64_t n) + { + static_assert(N == 1); + layout[0] = n; + } + uint64_t layout[N]; }; diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index e69de29bb2..10c9479b0c 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_2D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolBackward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto H = deref(params.inputGradDesc).GetLengths()[2]; + auto W = deref(params.inputGradDesc).GetLengths()[3]; + auto OH = deref(params.outputGradDesc).GetLengths()[2]; + auto OW = deref(params.outputGradDesc).GetLengths()[3]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + H, + W, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index e69de29bb2..b960554348 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_BWD_3D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolBackward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::BwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); + auto dtype = problem.GetInputGradDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_BWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolBackward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); + auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); + + auto N = deref(params.inputGradDesc).GetLengths()[0]; + auto C = deref(params.inputGradDesc).GetLengths()[1]; + auto D = deref(params.inputGradDesc).GetLengths()[2]; + auto H = deref(params.inputGradDesc).GetLengths()[3]; + auto W = deref(params.inputGradDesc).GetLengths()[4]; + auto OD = deref(params.outputGradDesc).GetLengths()[2]; + auto OH = deref(params.outputGradDesc).GetLengths()[3]; + auto OW = deref(params.outputGradDesc).GetLengths()[4]; + + kernel(params.output_grad, + params.input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + output_grad_tv, + input_grad_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index e69de29bb2..d0e37b5464 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -0,0 +1,116 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_2D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolForward2d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_2D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward2d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto H = deref(params.inputDesc).GetLengths()[2]; + auto W = deref(params.inputDesc).GetLengths()[3]; + auto OH = deref(params.outputDesc).GetLengths()[2]; + auto OW = deref(params.outputDesc).GetLengths()[3]; + + kernel(params.input, + params.output, + N, + C, + H, + W, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index e69de29bb2..9dd8c03cba 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -0,0 +1,120 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "miopen/conv_solution.hpp" +#include "miopen/execution_context.hpp" +#include "miopen/invoke_params.hpp" +#include "miopen/tensor_view_utils.hpp" +#include + +#include +#include +#include +#include + +#define LOCAL_SIZE_FWD_3D 1024 + +namespace miopen { + +namespace solver { + +namespace avgpool { + +bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + return true; +} + +ConvSolution +AvgPoolForward3d::GetSolution(const ExecutionContext& context, + const miopen::avgpool::FwdProblemDescription& problem) const +{ + std::ignore = context; + + auto result = ConvSolution{miopenStatusSuccess}; + auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); + auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); + auto dtype = problem.GetOutputDesc().GetType(); + size_t N_total = problem.GetNtotal(); + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, + {"MIOPEN_USE_FP32", static_cast(dtype == miopenFloat)}, + {"MIOPEN_USE_FP64", static_cast(dtype == miopenDouble)}, + {"MIOPEN_USE_BFP16", static_cast(dtype == miopenBFloat16)}, + {"INPUT_TYPE", input_dtype == "bfloat16" ? "ushort" : input_dtype}, + {"OUTPUT_TYPE", output_dtype == "bfloat16" ? 
"ushort" : output_dtype}}; + + result.construction_params.push_back(make_hip_kernel( + {LOCAL_SIZE_FWD_3D}, {N_total}, "MIOpenAvgPool.cpp", "AvgPoolForward3d", build_params)); + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + decltype(auto) kernel = handle_.Run(kernels.front()); + + auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); + auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); + + auto N = deref(params.inputDesc).GetLengths()[0]; + auto C = deref(params.inputDesc).GetLengths()[1]; + auto D = deref(params.inputDesc).GetLengths()[2]; + auto H = deref(params.inputDesc).GetLengths()[3]; + auto W = deref(params.inputDesc).GetLengths()[4]; + auto OD = deref(params.outputDesc).GetLengths()[2]; + auto OH = deref(params.outputDesc).GetLengths()[3]; + auto OW = deref(params.outputDesc).GetLengths()[4]; + + kernel(params.input, + params.output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + params.kinfor, + params.stride, + params.padding, + params.count_include_pad, + params.divisor_override, + input_tv, + output_tv); + }; + }; + + return result; +} + +} // namespace avgpool + +} // namespace solver + +} // namespace miopen From ca4ad974e8392c209814afa2478af48a3bb2bf1c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 7 Aug 2024 18:17:48 +0700 Subject: [PATCH 04/29] add gtest --- .../miopen/avgpool/problem_description.hpp | 24 +- src/kernels/MIOpenAvgPool.cpp | 550 ++++++++++++++++++ src/solver/avgpool/forward_avgpool_2d.cpp | 13 +- test/cpu_avgpool.hpp | 426 ++++++++++++++ test/gtest/avgpool.cpp | 163 ++++++ test/gtest/avgpool.hpp | 426 ++++++++++++++ 6 files changed, 1588 insertions(+), 14 deletions(-) create mode 100644 test/cpu_avgpool.hpp create mode 100644 test/gtest/avgpool.cpp create mode 100644 test/gtest/avgpool.hpp diff --git a/src/include/miopen/avgpool/problem_description.hpp 
b/src/include/miopen/avgpool/problem_description.hpp index 9400bd67a0..9166762235 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -92,12 +92,16 @@ struct FwdProblemDescription : ProblemDescription outputDesc.GetLengths()[1] != inputDesc.GetLengths()[1] || outputDesc.GetLengths().size() != input_dims) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input and output tensor sizes do not match."); } - if(input_dims != strideDesc.GetElementSize() || - input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + if(input_dims - 2 != strideDesc.GetElementSize() || + input_dims - 2 != paddingDesc.GetElementSize() || + input_dims - 2 != kinforDesc.GetElementSize()) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input tensor sizes and Kernel size or stride " + "or padding do not match."); } return true; @@ -138,12 +142,16 @@ struct BwdProblemDescription : ProblemDescription outputGradDesc.GetLengths()[1] != inputGradDesc.GetLengths()[1] || outputGradDesc.GetLengths().size() != input_dims) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad and output grad tensor sizes do not match."); } - if(input_dims != strideDesc.GetElementSize() || - input_dims != paddingDesc.GetElementSize() || input_dims != kinforDesc.GetElementSize()) + if(input_dims - 2 != strideDesc.GetElementSize() || + input_dims - 2 != paddingDesc.GetElementSize() || + input_dims - 2 != kinforDesc.GetElementSize()) { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Tensor sizes do not match."); + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad tensor sizes and Kernel size or stride or padding do " + "not match."); } return true; diff --git 
a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index e69de29bb2..bcbf4f6c60 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -0,0 +1,550 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include +#ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS +#include +#include +#endif + +#include "float_types.h" +#include "tensor_view.hpp" + +#ifndef INPUT_TYPE +#define INPUT_TYPE float +#endif + +#ifndef OUTPUT_TYPE +#define OUTPUT_TYPE float +#endif + +template +__device__ void avgPoolForward2d(const TI* __restrict__ input, + TO* __restrict__ output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + FLOAT_ACCUM m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - 
hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> input_tv, + tensor_view_t<4> output_tv) +{ + avgPoolForward2d(input, + output, + N, + C, + H, + W, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolForward3d(const TI* __restrict__ input, + TO* __restrict__ output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + FLOAT_ACCUM sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H 
+ h) * W + w; + sum += CVT_FLOAT2ACCUM( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + FLOAT_ACCUM val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + CVT_ACCUM2FLOAT(val); +} + +extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, + OUTPUT_TYPE* __restrict__ output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> input_tv, + tensor_view_t<5> output_tv) +{ + avgPoolForward3d(input, + output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + input_tv, + output_tv); +} + +template +__device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * 
blockDim.x; + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<4> output_grad_tv, + tensor_view_t<4> input_grad_tv) +{ + avgPoolBackward2d(output_grad, + input_grad, + N, + C, + H, + W, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + 
divisor_override, + output_grad_tv, + input_grad_tv); +} + +template +__device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, + TO* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + FLOAT_ACCUM grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = 
divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + CVT_ACCUM2FLOAT(grad); +} + +extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, + OUTPUT_TYPE* __restrict__ input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + int32_t* kinfor, + int32_t* stride, + int32_t* padding, + bool count_include_pad, + int32_t divisor_override, + tensor_view_t<5> output_grad_tv, + tensor_view_t<5> input_grad_tv) +{ + avgPoolBackward3d(output_grad, + input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + kinfor, + stride, + padding, + count_include_pad, + divisor_override, + output_grad_tv, + input_grad_tv); +} diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index d0e37b5464..8b444370a0 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -28,6 +28,7 @@ #include "miopen/execution_context.hpp" #include "miopen/invoke_params.hpp" #include "miopen/tensor_view_utils.hpp" +#include #include #include @@ -81,12 +82,12 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - auto N = deref(params.inputDesc).GetLengths()[0]; - auto C = deref(params.inputDesc).GetLengths()[1]; - auto H = deref(params.inputDesc).GetLengths()[2]; - auto W = deref(params.inputDesc).GetLengths()[3]; - auto OH = deref(params.outputDesc).GetLengths()[2]; - auto OW = deref(params.outputDesc).GetLengths()[3]; + 
size_t N = deref(params.inputDesc).GetLengths()[0]; + size_t C = deref(params.inputDesc).GetLengths()[1]; + size_t H = deref(params.inputDesc).GetLengths()[2]; + size_t W = deref(params.inputDesc).GetLengths()[3]; + size_t OH = deref(params.outputDesc).GetLengths()[2]; + size_t OW = deref(params.outputDesc).GetLengths()[3]; kernel(params.input, params.output, diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp new file mode 100644 index 0000000000..40a67a8d7d --- /dev/null +++ b/test/cpu_avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_CPU_AVGPOOL_HPP +#define GUARD_CPU_AVGPOOL_HPP + +#include "tensor_holder.hpp" +#include + +template +void cpu_avgpool_forward_2d(tensor input, + tensor& output, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend 
- wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_forward_3d(tensor input, + tensor& output, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input.desc.GetLengths(); + auto numel = output.desc.GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); + auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = std::min(dstart + KD, D + pd); + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + 
dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, D); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } +} + +template +void cpu_avgpool_backward_2d(tensor output_grad, + tensor& input_grad, + int32_t N, + int32_t C, + int32_t H, + int32_t W, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - 
wstart); + + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + hend = std::min(hend, H); + wend = std::min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } +} + +template +void cpu_avgpool_backward_3d(tensor output_grad, + tensor& input_grad, + int32_t N, + int32_t C, + int32_t D, + int32_t H, + int32_t W, + int32_t OD, + int32_t OH, + int32_t OW, + tensor kinfor, + tensor stride, + tensor padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = input_grad.desc.GetLengths(); + auto numel = input_grad.desc.GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) 
+ continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = std::min(dstart + KD, D + pd); + int32_t hend = std::min(hstart + R, H + ph); + int32_t wend = std::min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, D); + hend = std::min(hend, H); + wend = std::min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } +} + +#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp new file mode 100644 index 0000000000..1dd5502339 --- /dev/null +++ b/test/gtest/avgpool.cpp @@ -0,0 +1,163 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#include "avgpool.hpp" +#include + +MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) + +namespace avgpool { + +std::string GetFloatArg() +{ + const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); + if(tmp.empty()) + { + return ""; + } + return tmp; +} + +struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd +{ +}; + +struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd +{ +}; + +struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd +{ +}; + +} // namespace avgpool +using namespace avgpool; + +// FORWARD TEST +TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); + +// // BACKWARD TEST +// TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) +// { +// RunTest(); +// Verify(); 
+// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) +// { +// RunTest(); +// Verify(); +// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +// { +// if(!MIOPEN_TEST_ALL || +// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) +// { +// RunTest(); +// Verify(); +// } +// else +// { +// GTEST_SKIP(); +// } +// }; + +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp new file mode 100644 index 0000000000..23ec4c1726 --- /dev/null +++ b/test/gtest/avgpool.hpp @@ -0,0 +1,426 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "../driver/tensor_driver.hpp" +#include "cpu_avgpool.hpp" +#include "get_handle.hpp" +#include "random.hpp" +#include "tensor_holder.hpp" +#include "verify.hpp" +#include +#include +#include +#include +#include + +template +inline std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + os << '{'; + for(int i = 0; i < v.size(); ++i) + { + if(i != 0) + os << ','; + os << v[i]; + } + os << '}'; + return os; +} + +struct AvgPoolTestCase +{ + std::vector input_dims; + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + + friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) + { + return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size + << " stride:" << tc.stride << " padding:" << tc.padding + << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad + << " divisor_override:" << tc.divisor_override; + } + + std::vector GetInput() const { return input_dims; } +}; + +inline std::vector AvgPoolTestConfigs() +{ + return { + {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 0}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 1}, 
+ // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 1}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 1}, + // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 1}, + {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 0}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 1}, + // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 1}, + }; +} + +// FORWARD TEST +template +struct AvgPoolTestFwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_dim = avgpool_config.GetInput(); + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + ksize = tensor{in_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + auto gen_input_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + input = tensor{in_dim}.generate(gen_input_value); + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = {N, C, OH, OW}; + } + + output = tensor{out_dim}; + std::fill(output.begin(), output.end(), std::numeric_limits::quiet_NaN()); + + ref_output = tensor{out_dim}; + std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); + + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + miopenStatus_t status; + + auto dims = input.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_forward_2d(input, + ref_output, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_forward_3d(input, + ref_output, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + 
padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + stride.desc, + stride_dev.get(), + padding.desc, + padding_dev.get(), + ksize.desc, + ksize_dev.get(), + count_include_pad, + divisor_override); + fflush(stdout); + + ASSERT_EQ(status, miopenStatusSuccess); + + output.data = handle.Read(output_dev, output.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + + auto error = miopen::rms_range(ref_output, output); + + ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); + for(int i = 0; i < 10; ++i) + { + std::cout << "output cpu: " << ref_output[i] << " output gpu: " << output[i] + << std::endl; + } + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor input; + tensor output; + tensor ref_output; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr input_dev; + miopen::Allocator::ManageDataPtr output_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; + +// BACKWARD TEST +template +struct AvgPoolTestBwd : public ::testing::TestWithParam +{ +protected: + void SetUp() override + { + auto&& handle = get_handle(); + avgpool_config = GetParam(); + auto in_grad_dim = avgpool_config.GetInput(); + N = in_grad_dim[0]; + C = in_grad_dim[1]; + D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; + H = in_grad_dim.size() == 5 ? in_grad_dim[3] : in_grad_dim[2]; + W = in_grad_dim.size() == 5 ? 
in_grad_dim[4] : in_grad_dim[3]; + ksize = tensor{in_grad_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_grad_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_grad_dim.size() - 2}; + padding.data = avgpool_config.padding; + ceil_mode = avgpool_config.ceil_mode; + count_include_pad = avgpool_config.count_include_pad; + divisor_override = avgpool_config.divisor_override; + + std::vector out_grad_dim; + if(in_grad_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_grad_dim = {N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_grad_dim = {N, C, OH, OW}; + } + auto gen_output_grad_value = [](auto...) 
{ + return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + }; + output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); + + input_grad = tensor{in_grad_dim}; + std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); + + ref_input_grad = tensor{in_grad_dim}; + std::fill( + ref_input_grad.begin(), ref_input_grad.end(), std::numeric_limits::quiet_NaN()); + + output_grad_dev = handle.Write(output_grad.data); + input_grad_dev = handle.Write(input_grad.data); + ksize_dev = handle.Write(ksize.data); + stride_dev = handle.Write(stride.data); + padding_dev = handle.Write(padding.data); + } + + void RunTest() + { + auto&& handle = get_handle(); + + miopenStatus_t status; + + auto dims = input_grad.desc.GetNumDims(); + if(dims == 4) + { + cpu_avgpool_backward_2d(output_grad, + ref_input_grad, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + else if(dims == 5) + { + cpu_avgpool_backward_3d(output_grad, + ref_input_grad, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + padding, + count_include_pad, + divisor_override); + } + status = miopen::AvgPoolBackward(handle, + output_grad.desc, + output_grad_dev.get(), + input_grad.desc, + input_grad_dev.get(), + stride.desc, + stride_dev.get(), + padding.desc, + padding_dev.get(), + ksize.desc, + ksize_dev.get(), + count_include_pad, + divisor_override); + + ASSERT_EQ(status, miopenStatusSuccess); + + input_grad.data = handle.Read(input_grad_dev, input_grad.data.size()); + } + + void Verify() + { + double threshold = std::numeric_limits::epsilon(); + auto error = miopen::rms_range(ref_input_grad, input_grad); + ASSERT_EQ(miopen::range_distance(ref_input_grad), miopen::range_distance(input_grad)); + EXPECT_LT(error, threshold * 10); + } + AvgPoolTestCase avgpool_config; + + tensor output_grad; + tensor input_grad; + tensor ref_input_grad; + tensor ksize; + tensor stride; + tensor padding; + + bool ceil_mode; + 
bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + miopen::Allocator::ManageDataPtr output_grad_dev; + miopen::Allocator::ManageDataPtr input_grad_dev; + miopen::Allocator::ManageDataPtr ksize_dev; + miopen::Allocator::ManageDataPtr stride_dev; + miopen::Allocator::ManageDataPtr padding_dev; +}; From 0492fc71c714c320b7d0d53f67030ba8e3fe2a90 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 12 Aug 2024 10:21:50 +0700 Subject: [PATCH 05/29] add driver test --- driver/CMakeLists.txt | 1 + driver/avgpool_driver.hpp | 596 +++++++++++++++++++++ driver/dm_avgpool.cpp | 40 ++ driver/driver.hpp | 5 +- driver/mloAvgPoolHost.hpp | 438 +++++++++++++++ src/kernels/MIOpenAvgPool.cpp | 118 ++-- src/solver/avgpool/backward_avgpool_2d.cpp | 5 + src/solver/avgpool/backward_avgpool_3d.cpp | 5 + src/solver/avgpool/forward_avgpool_2d.cpp | 4 + src/solver/avgpool/forward_avgpool_3d.cpp | 4 + test/cpu_avgpool.hpp | 116 ++-- test/gtest/avgpool.cpp | 92 ++-- test/gtest/avgpool.hpp | 6 - 13 files changed, 1259 insertions(+), 171 deletions(-) create mode 100644 driver/avgpool_driver.hpp create mode 100644 driver/dm_avgpool.cpp create mode 100644 driver/mloAvgPoolHost.hpp diff --git a/driver/CMakeLists.txt b/driver/CMakeLists.txt index cd663eb8b4..385580e2e1 100644 --- a/driver/CMakeLists.txt +++ b/driver/CMakeLists.txt @@ -32,6 +32,7 @@ add_executable(MIOpenDriver dm_activ.cpp dm_adam.cpp dm_addlayernorm.cpp + dm_avgpool.cpp dm_bnorm.cpp dm_cat.cpp dm_conv.cpp diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp new file mode 100644 index 0000000000..38beba92f1 --- /dev/null +++ b/driver/avgpool_driver.hpp @@ -0,0 +1,596 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP +#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP + +#include "InputFlags.hpp" +#include "driver.hpp" +#include "mloAvgPoolHost.hpp" +#include "random.hpp" +#include "tensor_driver.hpp" +#include "timer.hpp" + +#include <../test/tensor_holder.hpp> +#include <../test/verify.hpp> + +#include +#include +#include +#include +#include + +template +class AvgPoolDriver : public Driver +{ +public: + AvgPoolDriver() : Driver() + { + miopenCreateTensorDescriptor(&inputDesc); + miopenCreateTensorDescriptor(&outputDesc); + miopenCreateTensorDescriptor(&inputGradDesc); + miopenCreateTensorDescriptor(&outputGradDesc); + miopenCreateTensorDescriptor(&ksizeDesc); + miopenCreateTensorDescriptor(&strideDesc); + miopenCreateTensorDescriptor(&paddingDesc); + + data_type = miopen_type{}; + } + + int AddCmdLineArgs() override; + int ParseCmdLineArgs(int argc, char* argv[]) override; + InputFlags& GetInputFlags() override { return inflags; } + + std::vector GetInputTensorDimsFromCmd(const char* param); + int GetandSetData() override; + + int AllocateBuffersAndCopy() override; + + int RunForwardGPU() override; + int RunForwardCPU(); + + int RunBackwardGPU() override; + int RunBackwardCPU(); + + Tref GetTolerance(); + int VerifyBackward() override; + int VerifyForward() override; + ~AvgPoolDriver() override + { + miopenDestroyTensorDescriptor(inputDesc); + miopenDestroyTensorDescriptor(outputDesc); + miopenDestroyTensorDescriptor(inputGradDesc); + miopenDestroyTensorDescriptor(outputGradDesc); + miopenDestroyTensorDescriptor(ksizeDesc); + miopenDestroyTensorDescriptor(strideDesc); + miopenDestroyTensorDescriptor(paddingDesc); + } + +private: + InputFlags inflags; + + int forw; + + miopenTensorDescriptor_t inputDesc; + miopenTensorDescriptor_t outputDesc; + miopenTensorDescriptor_t inputGradDesc; + miopenTensorDescriptor_t outputGradDesc; + miopenTensorDescriptor_t 
ksizeDesc; + miopenTensorDescriptor_t strideDesc; + miopenTensorDescriptor_t paddingDesc; + + std::unique_ptr input_dev; + std::unique_ptr output_dev; + std::unique_ptr input_grad_dev; + std::unique_ptr output_grad_dev; + std::unique_ptr ksize_dev; + std::unique_ptr stride_dev; + std::unique_ptr padding_dev; + + std::vector input; + std::vector output; + std::vector output_host; + std::vector input_grad; + std::vector input_grad_host; + std::vector output_grad; + std::vector ksize; + std::vector stride; + std::vector padding; + + bool ceil_mode; + bool count_include_pad; + int32_t divisor_override; + int32_t N, C, D, H, W, OD, OH, OW; + + std::vector in_dim; +}; + +template +int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) +{ + inflags.Parse(argc, argv); + + if(inflags.GetValueInt("time") == 1) + { + miopenEnableProfiling(GetHandle(), true); + } + return miopenStatusSuccess; +} + +template +std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +{ + std::string lengthsStr = inflags.GetValueStr(param); + + std::vector lengths; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = lengthsStr.find(',', pos); + while(new_pos != std::string::npos) + { + std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); + + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + pos = new_pos + 1; + new_pos = lengthsStr.find(',', pos); + }; + + std::string sliceStr = lengthsStr.substr(pos); + int len = std::stoi(sliceStr); + + lengths.push_back(len); + + return (lengths); +} + +template +int AvgPoolDriver::GetandSetData() +{ + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector ksp_dim = {in_dim.size() - 2}; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); + + if(ksize.size() != ksp_dim[0]) + { + int ref = ksp_dim[0] - ksize.size(); + while(ref--) + ksize.push_back(1); + } + if(stride.size() != ksp_dim[0]) + { + int ref = 
ksp_dim[0] - ksize.size(); + while(ref--) + stride.push_back(1); + } + if(padding.size() != ksp_dim[0]) + { + int ref = ksp_dim[0] - ksize.size(); + while(ref--) + padding.push_back(0); + } + + ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); + count_include_pad = static_cast(inflags.GetValueInt("count_include_pad")); + divisor_override = inflags.GetValueInt("divisor_override"); + + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; + + std::vector out_dim; + if(in_dim.size() == 5) + { + if(ceil_mode) + { + OD = std::ceil(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::ceil(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::ceil(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + else + { + OD = std::floor(static_cast(D - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; + } + out_dim = std::vector{N, C, OD, OH, OW}; + } + else + { + if(ceil_mode) + { + OH = std::ceil(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::ceil(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + else + { + OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; + OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; + } + out_dim = std::vector{N, C, OH, OW}; + } + SetTensorNd(inputDesc, in_dim, data_type); + SetTensorNd(outputDesc, out_dim, data_type); + SetTensorNd(outputGradDesc, out_dim, data_type); + SetTensorNd(inputGradDesc, in_dim, data_type); + SetTensorNd(ksizeDesc, ksp_dim, miopen_type{}); + SetTensorNd(strideDesc, ksp_dim, miopen_type{}); + SetTensorNd(paddingDesc, ksp_dim, miopen_type{}); + + return miopenStatusSuccess; +} + +template +int 
AvgPoolDriver::AddCmdLineArgs() +{ + inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); + inflags.AddInputFlag( + "input_dims", + 'D', + "2,3,7,9", + "The dimensional lengths of the input tensor: N,C,D1,D2,... Example: 2,3,7,9.", + "string"); + inflags.AddInputFlag( + "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); + inflags.AddInputFlag( + "stride", + 's', + "1,1", + "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", + "string"); + inflags.AddInputFlag("padding", + 'p', + "0,0", + "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", + "string"); + inflags.AddInputFlag("ceil_mode", + 'c', + "1", + "When 1, will use ceil instead of floor to compute the output shape.", + "int"); + inflags.AddInputFlag("count_include_pad", + 'P', + "0", + "When 1, will include the zero-padding in the averaging calculation.", + "int"); + inflags.AddInputFlag("divisor_override", + 'd', + "0", + "If specified, it will be used as divisor, otherwise size of the pooling " + "region will be used.", + "int"); + + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); + inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); + inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); + inflags.AddInputFlag( + "wall", 'w', "0", "Wall-clock Time, Requires time == 1 (Default=0)", "int"); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::AllocateBuffersAndCopy() +{ + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); + size_t ksize_sz = GetTensorSize(ksizeDesc); + size_t stride_sz = GetTensorSize(strideDesc); + size_t padding_sz = GetTensorSize(paddingDesc); + + uint32_t ctx = 0; + + input_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + input_grad_dev = 
std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); + output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); + ksize_dev = std::unique_ptr(new GPUMem(ctx, ksize_sz, sizeof(int32_t))); + stride_dev = std::unique_ptr(new GPUMem(ctx, stride_sz, sizeof(int32_t))); + padding_dev = std::unique_ptr(new GPUMem(ctx, padding_sz, sizeof(int32_t))); + + input = std::vector(input_sz, static_cast(0)); + output = std::vector(output_sz, static_cast(0)); + output_host = std::vector(output_sz, static_cast(0)); + + input_grad = std::vector(input_sz, static_cast(0)); + input_grad_host = std::vector(input_sz, static_cast(0)); + output_grad = std::vector(output_sz, static_cast(0)); + + int status; + + for(int i = 0; i < input_sz; i++) + { + input[i] = prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); + } + status = input_dev->ToGPU(q, input.data()); + + status |= output_dev->ToGPU(q, output.data()); + + status |= input_grad_dev->ToGPU(q, input_grad.data()); + + for(int i = 0; i < output_sz; i++) + { + output_grad[i] = prng::gen_A_to_B(static_cast(-1.0), static_cast(1.0)); + } + status |= output_grad_dev->ToGPU(q, output_grad.data()); + + status |= ksize_dev->ToGPU(q, ksize.data()); + + status |= stride_dev->ToGPU(q, stride.data()); + + status |= padding_dev->ToGPU(q, padding.data()); + + if(status != 0) + std::cout << "Error copying data to GPU\n" << std::endl; + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + strideDesc, + stride_dev->GetMem(), + paddingDesc, + padding_dev->GetMem(), + ksizeDesc, + ksize_dev->GetMem(), + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + 
if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time = + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + output_dev->FromGPU(GetStream(), output.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunForwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolForward2dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolForward3dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardGPU() +{ + float kernel_total_time = 0.0; + float kernel_first_time = 0.0; + + Timer t; + START_TIME + + for(int i = 0; i < inflags.GetValueInt("iter"); i++) + { + miopenAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem(), + strideDesc, + stride_dev->GetMem(), + paddingDesc, + padding_dev->GetMem(), + ksizeDesc, + ksize_dev->GetMem(), + count_include_pad, + divisor_override); + + float time = 0.0; + miopenGetKernelTime(GetHandle(), &time); + kernel_total_time += time; + if(i == 0) + kernel_first_time = time; + } + + if(inflags.GetValueInt("time") == 1) + { + STOP_TIME + int iter = inflags.GetValueInt("iter"); + if(WALL_CLOCK) + printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + + float kernel_average_time 
= + iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; + printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); + } + + input_grad_dev->FromGPU(GetStream(), input_grad.data()); + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::RunBackwardCPU() +{ + if(in_dim.size() == 4) + { + mloAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + else if(in_dim.size() == 5) + { + mloAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + } + return miopenStatusSuccess; +} + +template +Tref AvgPoolDriver::GetTolerance() +{ + // Computation error of fp16 is ~2^13 (=8192) bigger than + // the one of fp32 because mantissa is shorter by 13 bits. + auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; + + // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. 
+ if(std::is_same::value) + tolerance *= 8.0; + return tolerance; +} + +template +int AvgPoolDriver::VerifyForward() +{ + RunForwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(output_host, output); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Forward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + + return miopenStatusSuccess; +} + +template +int AvgPoolDriver::VerifyBackward() +{ + RunBackwardCPU(); + const Tref tolerance = GetTolerance(); + auto error = miopen::rms_range(input_grad_host, input_grad); + + if(!std::isfinite(error) || error > tolerance) + { + std::cout << "Backward AvgPool FAILED: " << error << std::endl; + return EC_VerifyFwd; + } + else + { + printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + } + return miopenStatusSuccess; +} + +#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/dm_avgpool.cpp b/driver/dm_avgpool.cpp new file mode 100644 index 0000000000..ec0e457056 --- /dev/null +++ b/driver/dm_avgpool.cpp @@ -0,0 +1,40 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include "registry_driver_maker.hpp" +#include "avgpool_driver.hpp" + +static Driver* makeDriver(const std::string& base_arg) +{ + if(base_arg == "avgpool") + return new AvgPoolDriver(); + if(base_arg == "avgpoolfp16") + return new AvgPoolDriver(); + if(base_arg == "avgpoolbfp16") + return new AvgPoolDriver(); + return nullptr; +} + +REGISTER_DRIVER_MAKER(makeDriver); diff --git a/driver/driver.hpp b/driver/driver.hpp index b23df690d1..bd42f6ee13 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,7 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,7 +206,8 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "--version") + arg != 
"reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && + arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp new file mode 100644 index 0000000000..ad55c53c66 --- /dev/null +++ b/driver/mloAvgPoolHost.hpp @@ -0,0 +1,438 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ +#ifndef MLO_AVGPOOLHOST_H_ +#define MLO_AVGPOOLHOST_H_ + +#include +#include + +template +int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncoh = gid / OW, ow = gid % OW; + int32_t nc = ncoh / OH, oh = ncoh % OH; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float m = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, h, w) + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + } + } + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { 
+ if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + float val = m / divide_factor; + + output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, + const miopenTensorDescriptor_t outputDesc, + Tgpu* input, + Tcheck* output, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputDesc).GetLengths(); + auto numel = miopen::deref(outputDesc).GetElementSize(); + + auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); + auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); + + for(int32_t gid = 0; gid < numel; gid++) + { + int32_t ncodoh = gid / OW, ow = gid % OW; + int32_t ncod = ncodoh / OH, oh = ncodoh % OH; + int32_t nc = ncod / OD, od = ncod % OD; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + float sum = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + // input idx : (n, c, d, h, w) + int32_t d = od * sd - pd + kd; + if(d < 0 || d >= D) + continue; + int32_t h = oh * sh - ph + r; + if(h < 0 || h >= H) + continue; + int32_t w = ow * sw - pw + s; + if(w < 0 || w >= W) + continue; + // int32_t input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast( + input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + } + } + } + int32_t dstart = od * sd - pd; + 
int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = + static_cast(val); + } + return 0; +} + +template +int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t nch = gid / W, w = gid % W; + int32_t nc = nch / H, h = nch % H; + int32_t n = nc / C, c = nc % C; + int32_t R = kinfor[0]; + int32_t S = kinfor[1]; + int32_t sh = stride[0]; + int32_t sw = stride[1]; + int32_t ph = padding[0]; + int32_t pw = padding[1]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t ohsh = h + ph - r; + 
if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (hend - hstart) * (wend - wstart); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); + + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (hend - hstart) * (wend - wstart); + } + } + + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<4>(n, c, oh, ow))]) / + divide_factor; + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = + static_cast(grad); + } + return 0; +} + +template +int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, + const miopenTensorDescriptor_t inputGradDesc, + Tgpu* output_grad, + Tcheck* input_grad, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, + const int32_t* kinfor, + const int32_t* stride, + const int32_t* padding, + bool count_include_pad, + int32_t divisor_override) +{ + auto dims = miopen::deref(inputGradDesc).GetLengths(); + auto numel = miopen::deref(inputGradDesc).GetElementSize(); + + auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); + auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); + + for(size_t gid = 0; gid < numel; gid++) + { + int32_t ncdh = gid / W, w = gid % W; + int32_t ncd = ncdh / H, h = ncdh % H; + int32_t nc = ncd / D, d = ncd % D; + int32_t n = nc / C, c = nc % C; + int32_t KD = kinfor[0]; + int32_t R = 
kinfor[1]; + int32_t S = kinfor[2]; + int32_t sd = stride[0]; + int32_t sh = stride[1]; + int32_t sw = stride[2]; + int32_t pd = padding[0]; + int32_t ph = padding[1]; + int32_t pw = padding[2]; + + if(n >= N) + return 0; + + float grad = 0; + for(int32_t kd = 0; kd < KD; ++kd) + { + for(int32_t r = 0; r < R; ++r) + { + for(int32_t s = 0; s < S; ++s) + { + int32_t odsd = d + pd - kd; + if(odsd % sd != 0) + continue; + int32_t od = odsd / sd; + if(od < 0 || od >= OD) + continue; + + int32_t ohsh = h + ph - r; + if(ohsh % sh != 0) + continue; + int32_t oh = ohsh / sh; + if(oh < 0 || oh >= OH) + continue; + + int32_t owsw = w + pw - s; + if(owsw % sw != 0) + continue; + int32_t ow = owsw / sw; + if(ow < 0 || ow >= OW) + continue; + + int32_t dstart = od * sd - pd; + int32_t hstart = oh * sh - ph; + int32_t wstart = ow * sw - pw; + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); + + const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int32_t divide_factor; + if(divisor_override != 0) + { + divide_factor = divisor_override; + } + else + { + if(count_include_pad) + { + divide_factor = pool_size; + } + else + { + divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); + } + } + grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( + tensor_layout_t<5>(n, c, od, oh, ow))]) / + divide_factor; + } + } + } + input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = + static_cast(grad); + } + return 0; +} + +#endif // MLO_AVGPOOLHOST_H_ diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index bcbf4f6c60..f4a9e95ce1 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -43,15 +43,15 @@ template __device__ void 
avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, - int32_t* kinfor, - int32_t* stride, - int32_t* padding, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, + const int32_t* __restrict__ kinfor, + const int32_t* __restrict__ stride, + const int32_t* __restrict__ padding, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -124,12 +124,12 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -158,14 +158,14 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolForward3d(const TI* __restrict__ input, TO* __restrict__ output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -252,14 +252,14 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -290,12 +290,12 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, TO* __restrict__ 
input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -376,12 +376,12 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -410,14 +410,14 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp template __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, @@ -514,14 +514,14 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, int32_t* kinfor, int32_t* stride, int32_t* padding, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index 10c9479b0c..b677192b36 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -46,6 +46,11 @@ namespace avgpool { bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { + 
if(problem.GetInputGradDesc().GetNumDims() != 4 || + problem.GetOutputGradDesc().GetNumDims() != 4) + { + return false; + } return true; } diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index b960554348..829511d8cb 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -46,6 +46,11 @@ namespace avgpool { bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { + if(problem.GetInputGradDesc().GetNumDims() != 5 || + problem.GetOutputGradDesc().GetNumDims() != 5) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 8b444370a0..6ddef062da 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -47,6 +47,10 @@ namespace avgpool { bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { + if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 9dd8c03cba..c1ee497b27 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -46,6 +46,10 @@ namespace avgpool { bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { + if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) + { + return false; + } return true; } diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index 40a67a8d7d..ef26e17d74 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -32,12 +32,12 @@ template void cpu_avgpool_forward_2d(tensor input, tensor& output, - int32_t N, 
- int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -85,15 +85,15 @@ void cpu_avgpool_forward_2d(tensor input, int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, H); - wend = std::min(wend, W); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -121,14 +121,14 @@ void cpu_avgpool_forward_2d(tensor input, template void cpu_avgpool_forward_3d(tensor input, tensor& output, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -185,17 +185,17 @@ void cpu_avgpool_forward_3d(tensor input, int32_t dstart = od * sd - pd; int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t dend = std::min(dstart + KD, D + pd); - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, D); - hend = std::min(hend, H); - wend = std::min(wend, W); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = 
min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -222,12 +222,12 @@ void cpu_avgpool_forward_3d(tensor input, template void cpu_avgpool_backward_2d(tensor output_grad, tensor& input_grad, - int32_t N, - int32_t C, - int32_t H, - int32_t W, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t H, + size_t W, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -275,15 +275,15 @@ void cpu_avgpool_backward_2d(tensor output_grad, int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - hend = std::min(hend, H); - wend = std::min(wend, W); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) @@ -315,14 +315,14 @@ void cpu_avgpool_backward_2d(tensor output_grad, template void cpu_avgpool_backward_3d(tensor output_grad, tensor& input_grad, - int32_t N, - int32_t C, - int32_t D, - int32_t H, - int32_t W, - int32_t OD, - int32_t OH, - int32_t OW, + size_t N, + size_t C, + size_t D, + size_t H, + size_t W, + size_t OD, + size_t OH, + size_t OW, tensor kinfor, tensor stride, tensor padding, @@ -385,17 +385,17 @@ void cpu_avgpool_backward_3d(tensor output_grad, int32_t dstart = od * sd - pd; int32_t hstart = oh * sh - ph; int32_t wstart = ow * sw - pw; - int32_t dend = std::min(dstart + KD, D + pd); - int32_t hend = std::min(hstart + R, H + ph); - int32_t wend = std::min(wstart + S, W + pw); + int32_t dend = min(dstart + KD, D + pd); + int32_t hend = min(hstart + R, H + ph); + int32_t wend = min(wstart + S, W + pw); const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = 
std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - dend = std::min(dend, D); - hend = std::min(hend, H); - wend = std::min(wend, W); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); int32_t divide_factor; if(divisor_override != 0) { diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index 1dd5502339..fa002e5610 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -115,49 +115,49 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolT INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); -// // BACKWARD TEST -// TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) -// { -// if(!MIOPEN_TEST_ALL || -// (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) -// { -// RunTest(); -// Verify(); -// } -// else -// { -// GTEST_SKIP(); -// } -// }; - -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); -// INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +// BACKWARD TEST +TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) +{ + 
if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) +{ + if(!MIOPEN_TEST_ALL || + (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) + { + RunTest(); + Verify(); + } + else + { + GTEST_SKIP(); + } +}; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 23ec4c1726..26548e0a12 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -221,7 +221,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam count_include_pad, divisor_override); fflush(stdout); - ASSERT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -234,11 +233,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam auto error = miopen::rms_range(ref_output, output); ASSERT_EQ(miopen::range_distance(ref_output), miopen::range_distance(output)); - for(int i = 0; i < 10; ++i) - { - std::cout << "output cpu: " << ref_output[i] << " output gpu: " << output[i] - << std::endl; - } EXPECT_LT(error, threshold * 10); } AvgPoolTestCase avgpool_config; From 881e79671935b7cbc6a05ba2cf61ad8749927305 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 16 Aug 2024 11:49:15 +0700 Subject: [PATCH 06/29] change kinfor to ksize --- driver/mloAvgPoolHost.hpp | 28 ++++----- include/miopen/miopen.h | 16 ++--- src/avgpool.cpp | 20 
+++--- src/avgpool_api.cpp | 24 ++++---- src/include/miopen/avgpool.hpp | 8 +-- src/include/miopen/avgpool/invoke_params.hpp | 8 +-- src/kernels/MIOpenAvgPool.cpp | 65 +++++++++++++------- src/solver/avgpool/backward_avgpool_2d.cpp | 38 +++++++++++- src/solver/avgpool/backward_avgpool_3d.cpp | 4 +- src/solver/avgpool/forward_avgpool_2d.cpp | 40 +++++++++++- src/solver/avgpool/forward_avgpool_3d.cpp | 4 +- test/cpu_avgpool.hpp | 28 ++++----- 12 files changed, 187 insertions(+), 96 deletions(-) diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp index ad55c53c66..6980ce968e 100644 --- a/driver/mloAvgPoolHost.hpp +++ b/driver/mloAvgPoolHost.hpp @@ -40,7 +40,7 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, size_t W, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -57,8 +57,8 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -134,7 +134,7 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, size_t OD, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -152,9 +152,9 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -236,7 +236,7 @@ 
int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes size_t W, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -253,8 +253,8 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -334,7 +334,7 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes size_t OD, size_t OH, size_t OW, - const int32_t* kinfor, + const int32_t* ksize, const int32_t* stride, const int32_t* padding, bool count_include_pad, @@ -352,9 +352,9 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index fda8817e3a..18b0bcafdf 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7640,8 +7640,8 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param stride Data tensor stride (output) * @param paddingDesc Tensor descriptor for padding tensor (input) * @param padding Data tensor padding (output) - * @param kinforDesc Tensor descriptor for kinfor tensor (input) - * @param kinfor Data tensor kinfor (output) + * @param ksizeDesc Tensor descriptor for ksize tensor (input) + * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the 
averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7657,8 +7657,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override); @@ -7673,8 +7673,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param stride Data tensor stride (output) * @param paddingDesc Tensor descriptor for padding tensor (input) * @param padding Data tensor padding (output) - * @param kinforDesc Tensor descriptor for kinfor tensor (input) - * @param kinfor Data tensor kinfor (output) + * @param ksizeDesc Tensor descriptor for ksize tensor (input) + * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7690,8 +7690,8 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override); /** @} */ diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 15bea1f9d8..87ff481c6a 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -42,8 +42,8 @@ miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, const bool count_include_pad, const int32_t divisor_override) 
{ @@ -51,7 +51,7 @@ miopenStatus_t AvgPoolForward(Handle& handle, outputDesc, strideDesc, paddingDesc, - kinforDesc, + ksizeDesc, count_include_pad, divisor_override}; @@ -61,13 +61,13 @@ miopenStatus_t AvgPoolForward(Handle& handle, tmp.outputDesc = &outputDesc; tmp.strideDesc = &strideDesc; tmp.paddingDesc = &paddingDesc; - tmp.kinforDesc = &kinforDesc; + tmp.ksizeDesc = &ksizeDesc; tmp.input = input; tmp.output = output; tmp.stride = stride; tmp.padding = padding; - tmp.kinfor = kinfor; + tmp.ksize = ksize; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; @@ -91,8 +91,8 @@ miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -100,7 +100,7 @@ miopenStatus_t AvgPoolBackward(Handle& handle, inputGradDesc, strideDesc, paddingDesc, - kinforDesc, + ksizeDesc, count_include_pad, divisor_override}; @@ -110,13 +110,13 @@ miopenStatus_t AvgPoolBackward(Handle& handle, tmp.inputGradDesc = &inputGradDesc; tmp.strideDesc = &strideDesc; tmp.paddingDesc = &paddingDesc; - tmp.kinforDesc = &kinforDesc; + tmp.ksizeDesc = &ksizeDesc; tmp.output_grad = output_grad; tmp.input_grad = input_grad; tmp.stride = stride; tmp.padding = padding; - tmp.kinfor = kinfor; + tmp.ksize = ksize; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 4e62bd5e7b..fa2e8a957c 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -88,8 +88,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* 
ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -102,8 +102,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, stride, paddingDesc, padding, - kinforDesc, - kinfor, + ksizeDesc, + ksize, count_include_pad, divisor_override); @@ -118,8 +118,8 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, DataCast(stride), miopen::deref(paddingDesc), DataCast(padding), - miopen::deref(kinforDesc), - DataCast(kinfor), + miopen::deref(ksizeDesc), + DataCast(ksize), count_include_pad, divisor_override); }); @@ -134,8 +134,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* stride, const miopenTensorDescriptor_t paddingDesc, const void* padding, - const miopenTensorDescriptor_t kinforDesc, - const void* kinfor, + const miopenTensorDescriptor_t ksizeDesc, + const void* ksize, const bool count_include_pad, const int32_t divisor_override) { @@ -148,8 +148,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, stride, paddingDesc, padding, - kinforDesc, - kinfor, + ksizeDesc, + ksize, count_include_pad, divisor_override); @@ -164,8 +164,8 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, DataCast(stride), miopen::deref(paddingDesc), DataCast(padding), - miopen::deref(kinforDesc), - DataCast(kinfor), + miopen::deref(ksizeDesc), + DataCast(ksize), count_include_pad, divisor_override); }); diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 617ed56782..9210e45e3a 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -43,8 +43,8 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, bool count_include_pad, int32_t divisor_override); @@ -57,8 +57,8 @@ 
MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t stride, const TensorDescriptor& paddingDesc, ConstData_t padding, - const TensorDescriptor& kinforDesc, - ConstData_t kinfor, + const TensorDescriptor& ksizeDesc, + ConstData_t ksize, bool count_include_pad, int32_t divisor_override); } // namespace miopen diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index b57f8e0edc..91a70725ee 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -42,13 +42,13 @@ struct FwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinforDesc = nullptr; + const TensorDescriptor* ksizeDesc = nullptr; ConstData_t input = nullptr; Data_t output = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfor = nullptr; + ConstData_t ksize = nullptr; bool count_include_pad = false; int32_t divisor_override = 0; @@ -66,13 +66,13 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* inputGradDesc = nullptr; const TensorDescriptor* strideDesc = nullptr; const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* kinforDesc = nullptr; + const TensorDescriptor* ksizeDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; ConstData_t stride = nullptr; ConstData_t padding = nullptr; - ConstData_t kinfor = nullptr; + ConstData_t ksize = nullptr; bool count_include_pad = false; int32_t divisor_override = 0; diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index f4a9e95ce1..6d94bffac1 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -40,6 +40,27 @@ #define OUTPUT_TYPE float #endif +// template +// struct blockNd +// { +// T val[Nd]; +// }; + +// template 
+// __device__ void avgPoolForwardNdNew(const TI* __restrict__ input, +// TO* __restrict__ output, +// size_t N, +// size_t C, +// const blockNd sizeIn, +// const blockNd sizeOut, +// const blockNd ksize, +// const blockNd stride, +// const blockNd padding, +// bool count_include_pad, +// int32_t divisor_override, +// tensor_view_t input_tv, +// tensor_view_t output_tv); + template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, @@ -49,7 +70,7 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, size_t W, size_t OH, size_t OW, - const int32_t* __restrict__ kinfor, + const int32_t* __restrict__ ksize, const int32_t* __restrict__ stride, const int32_t* __restrict__ padding, bool count_include_pad, @@ -61,8 +82,8 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -130,7 +151,7 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -146,7 +167,7 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input W, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -166,7 +187,7 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -179,9 +200,9 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = 
kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -260,7 +281,7 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -278,7 +299,7 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input OD, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -296,7 +317,7 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -308,8 +329,8 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -382,7 +403,7 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp size_t W, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -398,7 +419,7 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp W, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, @@ -418,7 +439,7 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -431,9 +452,9 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - 
int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -522,7 +543,7 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp size_t OD, size_t OH, size_t OW, - int32_t* kinfor, + int32_t* ksize, int32_t* stride, int32_t* padding, bool count_include_pad, @@ -540,7 +561,7 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp OD, OH, OW, - kinfor, + ksize, stride, padding, count_include_pad, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index b677192b36..4fe9d5bc76 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_2D 1024 +#define LOCAL_SIZE_BWD_2D 256 namespace miopen { @@ -43,6 +43,36 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + return false; + } + else if(dtype == miopenHalf) + { + if(in_over_out < 2 && in_nelems >= 11075584) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + { + return true; + } + } + return false; +} + bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { @@ -51,6 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, { return false; } + if(!IsOverRocm(problem)) + { + return 
false; + } return true; } @@ -101,7 +135,7 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 829511d8cb..6897097955 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_BWD_3D 1024 +#define LOCAL_SIZE_BWD_3D 256 namespace miopen { @@ -105,7 +105,7 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 6ddef062da..3e70264097 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -36,7 +36,7 @@ #include #include -#define LOCAL_SIZE_FWD_2D 1024 +#define LOCAL_SIZE_FWD_2D 256 namespace miopen { @@ -44,6 +44,38 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 6000000) + { + return true; + } + } + return false; +} + bool 
AvgPoolForward2d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { @@ -51,6 +83,10 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, { return false; } + if(!IsOverRocm(problem)) + { + return false; + } return true; } @@ -101,7 +137,7 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index c1ee497b27..088aac6dca 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -35,7 +35,7 @@ #include #include -#define LOCAL_SIZE_FWD_3D 1024 +#define LOCAL_SIZE_FWD_3D 256 namespace miopen { @@ -104,7 +104,7 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.kinfor, + params.ksize, params.stride, params.padding, params.count_include_pad, diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index ef26e17d74..5b91033633 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -38,7 +38,7 @@ void cpu_avgpool_forward_2d(tensor input, size_t W, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -55,8 +55,8 @@ void cpu_avgpool_forward_2d(tensor input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -129,7 +129,7 @@ void cpu_avgpool_forward_3d(tensor input, size_t OD, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -147,9 +147,9 @@ void cpu_avgpool_forward_3d(tensor input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / 
OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; @@ -228,7 +228,7 @@ void cpu_avgpool_backward_2d(tensor output_grad, size_t W, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -245,8 +245,8 @@ void cpu_avgpool_backward_2d(tensor output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = kinfor[0]; - int32_t S = kinfor[1]; + int32_t R = ksize[0]; + int32_t S = ksize[1]; int32_t sh = stride[0]; int32_t sw = stride[1]; int32_t ph = padding[0]; @@ -323,7 +323,7 @@ void cpu_avgpool_backward_3d(tensor output_grad, size_t OD, size_t OH, size_t OW, - tensor kinfor, + tensor ksize, tensor stride, tensor padding, bool count_include_pad, @@ -341,9 +341,9 @@ void cpu_avgpool_backward_3d(tensor output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = kinfor[0]; - int32_t R = kinfor[1]; - int32_t S = kinfor[2]; + int32_t KD = ksize[0]; + int32_t R = ksize[1]; + int32_t S = ksize[2]; int32_t sd = stride[0]; int32_t sh = stride[1]; int32_t sw = stride[2]; From 36128975121554bdd9336656f7781ddee410605f Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 19 Aug 2024 16:57:51 +0700 Subject: [PATCH 07/29] change params --- driver/avgpool_driver.hpp | 95 ++++----- include/miopen/miopen.h | 42 ++-- src/avgpool.cpp | 84 ++++---- src/avgpool_api.cpp | 90 +++++---- src/include/miopen/avgpool.hpp | 30 +-- src/include/miopen/avgpool/invoke_params.hpp | 42 ++-- .../miopen/avgpool/problem_description.hpp | 44 +---- src/kernels/MIOpenAvgPool.cpp | 183 +++++++++--------- src/solver/avgpool/backward_avgpool_2d.cpp | 17 +- src/solver/avgpool/backward_avgpool_3d.cpp | 52 
++++- src/solver/avgpool/forward_avgpool_2d.cpp | 17 +- src/solver/avgpool/forward_avgpool_3d.cpp | 48 ++++- 12 files changed, 409 insertions(+), 335 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 38beba92f1..ff7d04edd5 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -52,9 +52,6 @@ class AvgPoolDriver : public Driver miopenCreateTensorDescriptor(&outputDesc); miopenCreateTensorDescriptor(&inputGradDesc); miopenCreateTensorDescriptor(&outputGradDesc); - miopenCreateTensorDescriptor(&ksizeDesc); - miopenCreateTensorDescriptor(&strideDesc); - miopenCreateTensorDescriptor(&paddingDesc); data_type = miopen_type{}; } @@ -83,9 +80,6 @@ class AvgPoolDriver : public Driver miopenDestroyTensorDescriptor(outputDesc); miopenDestroyTensorDescriptor(inputGradDesc); miopenDestroyTensorDescriptor(outputGradDesc); - miopenDestroyTensorDescriptor(ksizeDesc); - miopenDestroyTensorDescriptor(strideDesc); - miopenDestroyTensorDescriptor(paddingDesc); } private: @@ -97,17 +91,11 @@ class AvgPoolDriver : public Driver miopenTensorDescriptor_t outputDesc; miopenTensorDescriptor_t inputGradDesc; miopenTensorDescriptor_t outputGradDesc; - miopenTensorDescriptor_t ksizeDesc; - miopenTensorDescriptor_t strideDesc; - miopenTensorDescriptor_t paddingDesc; std::unique_ptr input_dev; std::unique_ptr output_dev; std::unique_ptr input_grad_dev; std::unique_ptr output_grad_dev; - std::unique_ptr ksize_dev; - std::unique_ptr stride_dev; - std::unique_ptr padding_dev; std::vector input; std::vector output; @@ -172,29 +160,29 @@ std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char template int AvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - std::vector ksp_dim = {in_dim.size() - 2}; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); + in_dim = GetInputTensorDimsFromCmd("input_dims"); + int 
ksp_dim = in_dim.size() - 2; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); - if(ksize.size() != ksp_dim[0]) + if(ksize.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - ksize.push_back(1); + int ref = ksp_dim - ksize.size(); + while((ref--) != 0) + ksize.push_back(ksize[0]); } - if(stride.size() != ksp_dim[0]) + if(stride.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - stride.push_back(1); + int ref = ksp_dim - stride.size(); + while((ref--) != 0) + stride.push_back(stride[0]); } - if(padding.size() != ksp_dim[0]) + if(padding.size() != ksp_dim) { - int ref = ksp_dim[0] - ksize.size(); - while(ref--) - padding.push_back(0); + int ref = ksp_dim - padding.size(); + while((ref--) != 0) + padding.push_back(padding[0]); } ceil_mode = static_cast(inflags.GetValueInt("ceil_mode")); @@ -242,9 +230,6 @@ int AvgPoolDriver::GetandSetData() SetTensorNd(outputDesc, out_dim, data_type); SetTensorNd(outputGradDesc, out_dim, data_type); SetTensorNd(inputGradDesc, in_dim, data_type); - SetTensorNd(ksizeDesc, ksp_dim, miopen_type{}); - SetTensorNd(strideDesc, ksp_dim, miopen_type{}); - SetTensorNd(paddingDesc, ksp_dim, miopen_type{}); return miopenStatusSuccess; } @@ -301,11 +286,8 @@ int AvgPoolDriver::AddCmdLineArgs() template int AvgPoolDriver::AllocateBuffersAndCopy() { - size_t input_sz = GetTensorSize(inputDesc); - size_t output_sz = GetTensorSize(outputDesc); - size_t ksize_sz = GetTensorSize(ksizeDesc); - size_t stride_sz = GetTensorSize(strideDesc); - size_t padding_sz = GetTensorSize(paddingDesc); + size_t input_sz = GetTensorSize(inputDesc); + size_t output_sz = GetTensorSize(outputDesc); uint32_t ctx = 0; @@ -313,9 +295,6 @@ int AvgPoolDriver::AllocateBuffersAndCopy() output_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); input_grad_dev = std::unique_ptr(new GPUMem(ctx, input_sz, sizeof(Tgpu))); 
output_grad_dev = std::unique_ptr(new GPUMem(ctx, output_sz, sizeof(Tgpu))); - ksize_dev = std::unique_ptr(new GPUMem(ctx, ksize_sz, sizeof(int32_t))); - stride_dev = std::unique_ptr(new GPUMem(ctx, stride_sz, sizeof(int32_t))); - padding_dev = std::unique_ptr(new GPUMem(ctx, padding_sz, sizeof(int32_t))); input = std::vector(input_sz, static_cast(0)); output = std::vector(output_sz, static_cast(0)); @@ -343,12 +322,6 @@ int AvgPoolDriver::AllocateBuffersAndCopy() } status |= output_grad_dev->ToGPU(q, output_grad.data()); - status |= ksize_dev->ToGPU(q, ksize.data()); - - status |= stride_dev->ToGPU(q, stride.data()); - - status |= padding_dev->ToGPU(q, padding.data()); - if(status != 0) std::cout << "Error copying data to GPU\n" << std::endl; @@ -371,12 +344,15 @@ int AvgPoolDriver::RunForwardGPU() input_dev->GetMem(), outputDesc, output_dev->GetMem(), - strideDesc, - stride_dev->GetMem(), - paddingDesc, - padding_dev->GetMem(), - ksizeDesc, - ksize_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); @@ -464,12 +440,15 @@ int AvgPoolDriver::RunBackwardGPU() output_grad_dev->GetMem(), inputGradDesc, input_grad_dev->GetMem(), - strideDesc, - stride_dev->GetMem(), - paddingDesc, - padding_dev->GetMem(), - ksizeDesc, - ksize_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? 
padding[1] : padding[0], + padding.size() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 18b0bcafdf..ea44de92d5 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7636,12 +7636,6 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param input Data tensor input (input) * @param outputDesc Tensor descriptor for output tensor (input) * @param output Data tensor output (output) - * @param strideDesc Tensor descriptor for stride tensor (input) - * @param stride Data tensor stride (output) - * @param paddingDesc Tensor descriptor for padding tensor (input) - * @param padding Data tensor padding (output) - * @param ksizeDesc Tensor descriptor for ksize tensor (input) - * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7653,12 +7647,15 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override); @@ -7669,12 +7666,6 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param output_grad Data tensor output grad (input) * @param inputGradDesc Tensor descriptor for input grad tensor (input) * @param input_grad Data tensor input grad (output) - * @param strideDesc Tensor descriptor for stride tensor 
(input) - * @param stride Data tensor stride (output) - * @param paddingDesc Tensor descriptor for padding tensor (input) - * @param padding Data tensor padding (output) - * @param ksizeDesc Tensor descriptor for ksize tensor (input) - * @param ksize Data tensor ksize (output) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7686,12 +7677,15 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override); /** @} */ diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 87ff481c6a..323f01c90e 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -38,36 +38,37 @@ miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { - const auto problem = avgpool::FwdProblemDescription{inputDesc, - outputDesc, - strideDesc, - paddingDesc, - ksizeDesc, - count_include_pad, - divisor_override}; + const auto problem = + 
avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; const auto invoke_params = [&]() { - auto tmp = avgpool::FwdInvokeParams{}; - tmp.inputDesc = &inputDesc; - tmp.outputDesc = &outputDesc; - tmp.strideDesc = &strideDesc; - tmp.paddingDesc = &paddingDesc; - tmp.ksizeDesc = &ksizeDesc; + auto tmp = avgpool::FwdInvokeParams{}; + tmp.inputDesc = &inputDesc; + tmp.outputDesc = &outputDesc; tmp.input = input; tmp.output = output; - tmp.stride = stride; - tmp.padding = padding; - tmp.ksize = ksize; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; @@ -87,36 +88,37 @@ miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { - const auto problem = avgpool::BwdProblemDescription{outputGradDesc, - inputGradDesc, - strideDesc, - paddingDesc, - ksizeDesc, - count_include_pad, - divisor_override}; + const auto problem = avgpool::BwdProblemDescription{ + outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; const auto invoke_params = [&]() { auto tmp = avgpool::BwdInvokeParams{}; tmp.outputGradDesc = &outputGradDesc; tmp.inputGradDesc = &inputGradDesc; - tmp.strideDesc = &strideDesc; - tmp.paddingDesc = &paddingDesc; - tmp.ksizeDesc = &ksizeDesc; tmp.output_grad = output_grad; tmp.input_grad = input_grad; - tmp.stride = stride; - tmp.padding = padding; - tmp.ksize = 
ksize; + tmp.KD = KD; + tmp.KH = KH; + tmp.KW = KW; + tmp.SD = SD; + tmp.SH = SH; + tmp.SW = SW; + tmp.PD = PD; + tmp.PH = PH; + tmp.PW = PW; tmp.count_include_pad = count_include_pad; tmp.divisor_override = divisor_override; diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index fa2e8a957c..32e1f12f92 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -84,12 +84,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { @@ -98,12 +101,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, input, outputDesc, output, - strideDesc, - stride, - paddingDesc, - padding, - ksizeDesc, - ksize, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); @@ -114,12 +120,15 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, DataCast(input), miopen::deref(outputDesc), DataCast(output), - miopen::deref(strideDesc), - DataCast(stride), - miopen::deref(paddingDesc), - DataCast(padding), - miopen::deref(ksizeDesc), - DataCast(ksize), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); }); @@ -130,12 +139,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const miopenTensorDescriptor_t strideDesc, - const void* stride, - const miopenTensorDescriptor_t paddingDesc, - const void* padding, - const 
miopenTensorDescriptor_t ksizeDesc, - const void* ksize, + const int32_t KD, + const int32_t KH, + const int32_t KW, + const int32_t SD, + const int32_t SH, + const int32_t SW, + const int32_t PD, + const int32_t PH, + const int32_t PW, const bool count_include_pad, const int32_t divisor_override) { @@ -144,12 +156,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, output_grad, inputGradDesc, input_grad, - strideDesc, - stride, - paddingDesc, - padding, - ksizeDesc, - ksize, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); @@ -160,12 +175,15 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, DataCast(output_grad), miopen::deref(inputGradDesc), DataCast(input_grad), - miopen::deref(strideDesc), - DataCast(stride), - miopen::deref(paddingDesc), - DataCast(padding), - miopen::deref(ksizeDesc), - DataCast(ksize), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, count_include_pad, divisor_override); }); diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 9210e45e3a..00a2717ff6 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -39,12 +39,15 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, bool count_include_pad, int32_t divisor_override); @@ -53,12 +56,15 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - const TensorDescriptor& strideDesc, - ConstData_t stride, - const TensorDescriptor& paddingDesc, - ConstData_t 
padding, - const TensorDescriptor& ksizeDesc, - ConstData_t ksize, + int32_t KD, + int32_t KH, + int32_t KW, + int32_t SD, + int32_t SH, + int32_t SW, + int32_t PD, + int32_t PH, + int32_t PW, bool count_include_pad, int32_t divisor_override); } // namespace miopen diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index 91a70725ee..e8bd9256ac 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -38,18 +38,22 @@ struct FwdInvokeParams : public miopen::InvokeParams FwdInvokeParams() = default; - const TensorDescriptor* inputDesc = nullptr; - const TensorDescriptor* outputDesc = nullptr; - const TensorDescriptor* strideDesc = nullptr; - const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* ksizeDesc = nullptr; - - ConstData_t input = nullptr; - Data_t output = nullptr; - ConstData_t stride = nullptr; - ConstData_t padding = nullptr; - ConstData_t ksize = nullptr; - + const TensorDescriptor* inputDesc = nullptr; + const TensorDescriptor* outputDesc = nullptr; + + ConstData_t input = nullptr; + Data_t output = nullptr; + ConstData_t ksize = nullptr; + + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; bool count_include_pad = false; int32_t divisor_override = 0; @@ -64,16 +68,20 @@ struct BwdInvokeParams : public miopen::InvokeParams const TensorDescriptor* outputGradDesc = nullptr; const TensorDescriptor* inputGradDesc = nullptr; - const TensorDescriptor* strideDesc = nullptr; - const TensorDescriptor* paddingDesc = nullptr; - const TensorDescriptor* ksizeDesc = nullptr; ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; - ConstData_t stride = nullptr; - ConstData_t padding = nullptr; ConstData_t ksize = nullptr; + int32_t KD = 0; + int32_t KH = 0; + int32_t KW = 0; + int32_t SD = 0; + int32_t SH = 0; + int32_t SW = 
0; + int32_t PD = 0; + int32_t PH = 0; + int32_t PW = 0; bool count_include_pad = false; int32_t divisor_override = 0; diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 9166762235..2dee6a30ea 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -38,16 +38,8 @@ namespace avgpool { struct ProblemDescription : ProblemDescriptionBase { - ProblemDescription(const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, - const bool count_include_pad_, - const int32_t divisor_override_) - : strideDesc(strideDesc_), - paddingDesc(paddingDesc_), - kinforDesc(kinforDesc_), - count_include_pad(count_include_pad_), - divisor_override(divisor_override_) + ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) + : count_include_pad(count_include_pad_), divisor_override(divisor_override_) { if(divisor_override < 0) { @@ -56,10 +48,6 @@ struct ProblemDescription : ProblemDescriptionBase } protected: - TensorDescriptor strideDesc; - TensorDescriptor paddingDesc; - TensorDescriptor kinforDesc; - bool count_include_pad; int32_t divisor_override; }; @@ -68,13 +56,9 @@ struct FwdProblemDescription : ProblemDescription { FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, - const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, const bool count_include_pad_, const int32_t divisor_override_) - : ProblemDescription( - strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + : ProblemDescription(count_include_pad_, divisor_override_), inputDesc(inputDesc_), outputDesc(outputDesc_) { @@ -95,14 +79,6 @@ struct FwdProblemDescription : ProblemDescription MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Input and output tensor sizes do not match."); } 
- if(input_dims - 2 != strideDesc.GetElementSize() || - input_dims - 2 != paddingDesc.GetElementSize() || - input_dims - 2 != kinforDesc.GetElementSize()) - { - MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input tensor sizes and Kernel size or stride " - "or padding do not match."); - } return true; } @@ -118,13 +94,9 @@ struct BwdProblemDescription : ProblemDescription { BwdProblemDescription(const TensorDescriptor& outputGradDesc_, const TensorDescriptor& inputGradDesc_, - const TensorDescriptor& strideDesc_, - const TensorDescriptor& paddingDesc_, - const TensorDescriptor& kinforDesc_, const bool count_include_pad_, const int32_t divisor_override_) - : ProblemDescription( - strideDesc_, paddingDesc_, kinforDesc_, count_include_pad_, divisor_override_), + : ProblemDescription(count_include_pad_, divisor_override_), outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { @@ -145,14 +117,6 @@ struct BwdProblemDescription : ProblemDescription MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Input grad and output grad tensor sizes do not match."); } - if(input_dims - 2 != strideDesc.GetElementSize() || - input_dims - 2 != paddingDesc.GetElementSize() || - input_dims - 2 != kinforDesc.GetElementSize()) - { - MIOPEN_THROW(miopenStatusBadParm, - "AvgPool: Input grad tensor sizes and Kernel size or stride or padding do " - "not match."); - } return true; } diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 6d94bffac1..32ac270b37 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -40,27 +40,6 @@ #define OUTPUT_TYPE float #endif -// template -// struct blockNd -// { -// T val[Nd]; -// }; - -// template -// __device__ void avgPoolForwardNdNew(const TI* __restrict__ input, -// TO* __restrict__ output, -// size_t N, -// size_t C, -// const blockNd sizeIn, -// const blockNd sizeOut, -// const blockNd ksize, -// const blockNd stride, -// const blockNd padding, -// bool count_include_pad, -// int32_t 
divisor_override, -// tensor_view_t input_tv, -// tensor_view_t output_tv); - template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, @@ -70,9 +49,12 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, size_t W, size_t OH, size_t OW, - const int32_t* __restrict__ ksize, - const int32_t* __restrict__ stride, - const int32_t* __restrict__ padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -82,19 +64,15 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, int32_t ncoh = gid / OW, ow = gid % OW; int32_t nc = ncoh / OH, oh = ncoh % OH; int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; if(n >= N) return; FLOAT_ACCUM m = 0; +#pragma unroll for(int32_t r = 0; r < R; ++r) { +#pragma unroll for(int32_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) @@ -151,9 +129,12 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> input_tv, @@ -167,9 +148,12 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input W, OH, OW, - ksize, - stride, - padding, + R, + S, + sh, + sw, + ph, + pw, count_include_pad, divisor_override, input_tv, @@ -187,9 +171,15 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, 
tensor_view_t<5> input_tv, @@ -200,19 +190,11 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, int32_t ncod = ncodoh / OH, oh = ncodoh % OH; int32_t nc = ncod / OD, od = ncod % OD; int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; if(n >= N) return; FLOAT_ACCUM sum = 0; +#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -281,9 +263,15 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> input_tv, @@ -299,9 +287,15 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input OD, OH, OW, - ksize, - stride, - padding, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, count_include_pad, divisor_override, input_tv, @@ -317,9 +311,12 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> output_grad_tv, @@ -329,19 +326,15 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, int32_t nch = gid / W, w = gid % W; int32_t nc = nch / H, h = nch % H; int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; if(n >= N) return; FLOAT_ACCUM grad = 0; +#pragma unroll for(int32_t r = 0; r < R; 
++r) { +#pragma unroll for(int32_t s = 0; s < S; ++s) { int32_t ohsh = h + ph - r; @@ -403,9 +396,12 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp size_t W, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t R, + int32_t S, + int32_t sh, + int32_t sw, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<4> output_grad_tv, @@ -419,9 +415,12 @@ extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp W, OH, OW, - ksize, - stride, - padding, + R, + S, + sh, + sw, + ph, + pw, count_include_pad, divisor_override, output_grad_tv, @@ -439,9 +438,15 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> output_grad_tv, @@ -452,20 +457,12 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, int32_t ncd = ncdh / H, h = ncdh % H; int32_t nc = ncd / D, d = ncd % D; int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; if(n >= N) return; FLOAT_ACCUM grad = 0; +#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -543,9 +540,15 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp size_t OD, size_t OH, size_t OW, - int32_t* ksize, - int32_t* stride, - int32_t* padding, + int32_t KD, + int32_t R, + int32_t S, + int32_t sd, + int32_t sh, + int32_t sw, + int32_t pd, + int32_t ph, + int32_t pw, bool count_include_pad, int32_t divisor_override, tensor_view_t<5> 
output_grad_tv, @@ -561,9 +564,15 @@ extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ outp OD, OH, OW, - ksize, - stride, - padding, + KD, + R, + S, + sd, + sh, + sw, + pd, + ph, + pw, count_include_pad, divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index 4fe9d5bc76..c5ed51dc27 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -81,10 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, { return false; } - if(!IsOverRocm(problem)) - { - return false; - } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -135,9 +135,12 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 6897097955..96adbb2e46 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -43,6 +43,42 @@ namespace solver { namespace avgpool { +bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +{ + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if((in_over_out < 8 && in_over_out > 1) || (in_over_out < 2 && in_nelems <= 5971968)) + { + return true; + } + return false; + } + else if(dtype == miopenHalf) + { + if((in_over_out < 2 && mul_nc < 8192) || + (8 > 
in_over_out && in_over_out > 7 && out_nelems >= 32401152)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((7 < in_over_out && in_over_out < 8 && in_nelems >= 944111616) || + (in_over_out < 2 && in_nelems >= 4194304)) + { + return true; + } + } + return false; +} + bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::BwdProblemDescription& problem) const { @@ -51,6 +87,10 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, { return false; } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -105,9 +145,15 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, output_grad_tv, diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 3e70264097..ebc5c4b956 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -83,10 +83,10 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, { return false; } - if(!IsOverRocm(problem)) - { - return false; - } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -137,9 +137,12 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, W, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KH, + params.KW, + params.SH, + params.SW, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, input_tv, diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 088aac6dca..32a24d47bb 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -43,6 +43,38 @@ namespace solver { namespace avgpool { +bool 
IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +{ + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(dtype == miopenFloat) + { + if(in_over_out < 8 || in_over_out >= 262144) + { + return true; + } + } + else if(dtype == miopenHalf) + { + if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192)) + { + return true; + } + } + else if(dtype == miopenBFloat16) + { + if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000) + { + return true; + } + } + return false; +} + bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, const miopen::avgpool::FwdProblemDescription& problem) const { @@ -50,6 +82,10 @@ bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, { return false; } + // if(!IsOverRocm(problem)) + // { + // return false; + // } return true; } @@ -104,9 +140,15 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, OD, OH, OW, - params.ksize, - params.stride, - params.padding, + params.KD, + params.KH, + params.KW, + params.SD, + params.SH, + params.SW, + params.PD, + params.PH, + params.PW, params.count_include_pad, params.divisor_override, input_tv, From 930d47e02a4573ac52713238704794d4228b7fb8 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 20 Aug 2024 18:42:22 +0700 Subject: [PATCH 08/29] fix gtest --- src/kernels/MIOpenAvgPool.cpp | 4 - src/solver/avgpool/backward_avgpool_2d.cpp | 12 +-- src/solver/avgpool/backward_avgpool_3d.cpp | 22 +++--- src/solver/avgpool/forward_avgpool_2d.cpp | 14 ++-- src/solver/avgpool/forward_avgpool_3d.cpp | 19 +++-- test/gtest/avgpool.cpp | 24 ++++-- test/gtest/avgpool.hpp | 89 +++++++++++++++------- 7 files changed, 114 insertions(+), 70 deletions(-) diff --git 
a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 32ac270b37..d17dcc38ff 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -72,7 +72,6 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, #pragma unroll for(int32_t r = 0; r < R; ++r) { -#pragma unroll for(int32_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) @@ -194,7 +193,6 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, if(n >= N) return; FLOAT_ACCUM sum = 0; -#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) @@ -334,7 +332,6 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, #pragma unroll for(int32_t r = 0; r < R; ++r) { -#pragma unroll for(int32_t s = 0; s < S; ++s) { int32_t ohsh = h + ph - r; @@ -462,7 +459,6 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, return; FLOAT_ACCUM grad = 0; -#pragma unroll for(int32_t kd = 0; kd < KD; ++kd) { for(int32_t r = 0; r < R; ++r) diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index c5ed51dc27..73adabb8e7 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -43,7 +43,7 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); @@ -73,7 +73,7 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& context, +bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, const miopen::avgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 4 || @@ -81,10 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext& 
context, { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 96adbb2e46..4815803ad3 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -43,18 +43,19 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) +bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) { auto dtype = problem.GetInputGradDesc().GetType(); auto in_nelems = problem.GetInputGradDesc().GetElementSize(); auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); auto mul_nc = problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; + auto N = problem.GetOutputGradDesc().GetLengths()[0]; auto in_over_out = static_cast(in_nelems) / out_nelems; if(dtype == miopenFloat) { - if((in_over_out < 8 && in_over_out > 1) || (in_over_out < 2 && in_nelems <= 5971968)) + if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 8 && N >= 6)) { return true; } @@ -62,16 +63,15 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) } else if(dtype == miopenHalf) { - if((in_over_out < 2 && mul_nc < 8192) || - (8 > in_over_out && in_over_out > 7 && out_nelems >= 32401152)) + if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) { return true; } } else if(dtype == miopenBFloat16) { - if((7 < in_over_out && in_over_out < 8 && in_nelems >= 944111616) || - (in_over_out < 2 && in_nelems >= 4194304)) + if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || + (in_over_out <= 8 && in_nelems >= 944111616)) { return true; } @@ -79,7 +79,7 @@ bool IsOverRocm(const miopen::avgpool::BwdProblemDescription& problem) return false; } -bool AvgPoolBackward3d::IsApplicable(const 
ExecutionContext& context, +bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, const miopen::avgpool::BwdProblemDescription& problem) const { if(problem.GetInputGradDesc().GetNumDims() != 5 || @@ -87,10 +87,10 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext& context, { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmBwd3d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index ebc5c4b956..1c51feb54b 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -44,7 +44,7 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); @@ -68,7 +68,7 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) } else if(dtype == miopenBFloat16) { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 6000000) + if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) { return true; } @@ -76,17 +76,17 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward2d::IsApplicable(const ExecutionContext& context, +bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, const miopen::avgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 4 || problem.GetOutputDesc().GetNumDims() != 4) { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 32a24d47bb..6f70a07419 100644 --- 
a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -43,17 +43,22 @@ namespace solver { namespace avgpool { -bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) +bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) { auto dtype = problem.GetOutputDesc().GetType(); auto in_nelems = problem.GetInputDesc().GetElementSize(); auto out_nelems = problem.GetOutputDesc().GetElementSize(); auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto N = problem.GetOutputDesc().GetLengths()[0]; auto in_over_out = static_cast(in_nelems) / out_nelems; + std::cout << "in_over_out: " << in_over_out << std::endl; + std::cout << "in_nelems: " << in_nelems << std::endl; + std::cout << "out_nelems: " << out_nelems << std::endl; + if(dtype == miopenFloat) { - if(in_over_out < 8 || in_over_out >= 262144) + if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4)) { return true; } @@ -75,17 +80,17 @@ bool IsOverRocm(const miopen::avgpool::FwdProblemDescription& problem) return false; } -bool AvgPoolForward3d::IsApplicable(const ExecutionContext& context, +bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, const miopen::avgpool::FwdProblemDescription& problem) const { if(problem.GetInputDesc().GetNumDims() != 5 || problem.GetOutputDesc().GetNumDims() != 5) { return false; } - // if(!IsOverRocm(problem)) - // { - // return false; - // } + if(!IsOverRocmFwd3d(problem)) + { + return false; + } return true; } diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index fa002e5610..3ab32be510 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -111,9 +111,15 @@ TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) } }; -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); 
-INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_fwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) @@ -158,6 +164,12 @@ TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) } }; -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP32, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, testing::ValuesIn(AvgPoolTestConfigs())); -INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, testing::ValuesIn(AvgPoolTestConfigs())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP32, + testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_FP16, + testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_Avgpool_bwd_BFP16, + testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 26548e0a12..fca812357d 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -70,25 +70,50 @@ struct AvgPoolTestCase std::vector GetInput() const { return input_dims; } }; -inline std::vector AvgPoolTestConfigs() +inline std::vector AvgPoolTestConfigsFwdFp32() { return { - {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, true, 0}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, false, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, true, false, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, false, true, 1}, - // {{2, 3, 7, 9}, {3, 3}, {2, 2}, {1, 1}, 
true, true, 1}, - {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 0}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, false, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, false, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 1}, - // {{2, 3, 7, 9, 11}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, true, true, 1}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdFp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsFwdBfp16() +{ + return { + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp32() +{ + return { + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdFp16() +{ + return { + {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + }; +} + +inline std::vector AvgPoolTestConfigsBwdBfp16() +{ + return { + {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, }; } @@ -212,12 +237,15 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam input_dev.get(), output.desc, output_dev.get(), - stride.desc, - stride_dev.get(), - padding.desc, - padding_dev.get(), - ksize.desc, - ksize_dev.get(), + 
ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); fflush(stdout); @@ -377,12 +405,15 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam output_grad_dev.get(), input_grad.desc, input_grad_dev.get(), - stride.desc, - stride_dev.get(), - padding.desc, - padding_dev.get(), - ksize.desc, - ksize_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? 
padding[2] : padding[1], count_include_pad, divisor_override); From 5a357389c31287ebfdf57893d9a5046e08cce8a0 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 21 Aug 2024 17:48:10 +0700 Subject: [PATCH 09/29] passed gtest --- src/kernels/MIOpenAvgPool.cpp | 2 -- test/gtest/avgpool.hpp | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index d17dcc38ff..76355d5729 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -69,7 +69,6 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, return; FLOAT_ACCUM m = 0; -#pragma unroll for(int32_t r = 0; r < R; ++r) { for(int32_t s = 0; s < S; ++s) @@ -329,7 +328,6 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, return; FLOAT_ACCUM grad = 0; -#pragma unroll for(int32_t r = 0; r < R; ++r) { for(int32_t s = 0; s < S; ++s) diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index fca812357d..94898d32b6 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -105,7 +105,7 @@ inline std::vector AvgPoolTestConfigsBwdFp16() { return { {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, }; } @@ -113,7 +113,7 @@ inline std::vector AvgPoolTestConfigsBwdBfp16() { return { {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, }; } From daa077f6cc8b22af985a89bcc93ce6964084c623 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 1 Oct 2024 16:04:16 +0700 Subject: [PATCH 10/29] fix int32 --- include/miopen/miopen.h | 58 +++-- src/avgpool.cpp | 40 ++-- src/avgpool_api.cpp | 42 ++-- src/include/miopen/avgpool.hpp | 40 ++-- src/kernels/MIOpenAvgPool.cpp | 396 
++++++++++++++++----------------- test/cpu_avgpool.hpp | 296 ++++++++++++------------ test/gtest/avgpool.hpp | 48 ++-- 7 files changed, 469 insertions(+), 451 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index ea44de92d5..2e8b988741 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7636,6 +7636,15 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, * @param input Data tensor input (input) * @param outputDesc Tensor descriptor for output tensor (input) * @param output Data tensor output (output) + * @param KD Kernel size in dimension D (input) + * @param KH Kernel size in dimension H (input) + * @param KW Kernel size in dimension K (input) + * @param SD Stride size in dimension D (input) + * @param SH Stride size in dimension H (input) + * @param SW Stride size in dimension K (input) + * @param PD Padding size in dimension D (input) + * @param PH Padding size in dimension H (input) + * @param PW Padding size in dimension K (input) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7647,17 +7656,17 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, + const int64_t KD, + const int64_t KH, + const int64_t KW, + const int64_t SD, + const int64_t SH, + const int64_t SW, + const int64_t PD, + const int64_t PH, + const int64_t PW, const bool count_include_pad, - const int32_t divisor_override); + const int64_t divisor_override); /*! 
@brief Execute an avgpool backward layer * @@ -7666,6 +7675,15 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param output_grad Data tensor output grad (input) * @param inputGradDesc Tensor descriptor for input grad tensor (input) * @param input_grad Data tensor input grad (output) + * @param KD Kernel size in dimension D (input) + * @param KH Kernel size in dimension H (input) + * @param KW Kernel size in dimension K (input) + * @param SD Stride size in dimension D (input) + * @param SH Stride size in dimension H (input) + * @param SW Stride size in dimension K (input) + * @param PD Padding size in dimension D (input) + * @param PH Padding size in dimension H (input) + * @param PW Padding size in dimension K (input) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will @@ -7677,17 +7695,17 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, + const int64_t KD, + const int64_t KH, + const int64_t KW, + const int64_t SD, + const int64_t SH, + const int64_t SW, + const int64_t PD, + const int64_t PH, + const int64_t PW, const bool count_include_pad, - const int32_t divisor_override); + const int64_t divisor_override); /** @} */ // CLOSEOUT avgpool DOXYGEN GROUP #endif // MIOPEN_BETA_API diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 323f01c90e..ed71f9ef8b 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -38,17 +38,17 @@ miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - 
const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, + const int64_t KD, + const int64_t KH, + const int64_t KW, + const int64_t SD, + const int64_t SH, + const int64_t SW, + const int64_t PD, + const int64_t PH, + const int64_t PW, const bool count_include_pad, - const int32_t divisor_override) + const int64_t divisor_override) { const auto problem = avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; @@ -88,17 +88,17 @@ miopenStatus_t AvgPoolBackward(Handle& handle, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, + const int64_t KD, + const int64_t KH, + const int64_t KW, + const int64_t SD, + const int64_t SH, + const int64_t SW, + const int64_t PD, + const int64_t PH, + const int64_t PW, const bool count_include_pad, - const int32_t divisor_override) + const int64_t divisor_override) { const auto problem = avgpool::BwdProblemDescription{ outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 32e1f12f92..286fe97456 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -46,7 +46,7 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, const miopenTensorDescriptor_t oDesc, const bool count_include_pad, - const int32_t divisor_override, + const int64_t divisor_override, const bool is_fwd) { if(miopen::IsLoggingCmd()) @@ -84,17 +84,17 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, const void* input, const miopenTensorDescriptor_t outputDesc, void* output, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const 
int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, + const int64_t KD, + const int64_t KH, + const int64_t KW, + const int64_t SD, + const int64_t SH, + const int64_t SW, + const int64_t PD, + const int64_t PH, + const int64_t PW, const bool count_include_pad, - const int32_t divisor_override) + const int64_t divisor_override) { MIOPEN_LOG_FUNCTION(handle, inputDesc, @@ -139,17 +139,17 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, const void* output_grad, const miopenTensorDescriptor_t inputGradDesc, void* input_grad, - const int32_t KD, - const int32_t KH, - const int32_t KW, - const int32_t SD, - const int32_t SH, - const int32_t SW, - const int32_t PD, - const int32_t PH, - const int32_t PW, + const int64_t KD, + const int64_t KH, + const int64_t KW, + const int64_t SD, + const int64_t SH, + const int64_t SW, + const int64_t PD, + const int64_t PH, + const int64_t PW, const bool count_include_pad, - const int32_t divisor_override) + const int64_t divisor_override) { MIOPEN_LOG_FUNCTION(handle, outputGradDesc, diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index 00a2717ff6..c11fe6cadf 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -39,33 +39,33 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, ConstData_t input, const TensorDescriptor& outputDesc, Data_t output, - int32_t KD, - int32_t KH, - int32_t KW, - int32_t SD, - int32_t SH, - int32_t SW, - int32_t PD, - int32_t PH, - int32_t PW, + int64_t KD, + int64_t KH, + int64_t KW, + int64_t SD, + int64_t SH, + int64_t SW, + int64_t PD, + int64_t PH, + int64_t PW, bool count_include_pad, - int32_t divisor_override); + int64_t divisor_override); MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, const TensorDescriptor& outputGradDesc, ConstData_t output_grad, const TensorDescriptor& inputGradDesc, Data_t input_grad, - int32_t KD, - int32_t KH, - int32_t KW, - 
int32_t SD, - int32_t SH, - int32_t SW, - int32_t PD, - int32_t PH, - int32_t PW, + int64_t KD, + int64_t KH, + int64_t KW, + int64_t SD, + int64_t SH, + int64_t SW, + int64_t PD, + int64_t PH, + int64_t PW, bool count_include_pad, - int32_t divisor_override); + int64_t divisor_override); } // namespace miopen #endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 76355d5729..5fe015edc1 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -43,62 +43,62 @@ template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + int64_t R, + int64_t S, + int64_t sh, + int64_t sw, + int64_t ph, + int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<4> input_tv, tensor_view_t<4> output_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; + int64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int64_t ncoh = gid / OW, ow = gid % OW; + int64_t nc = ncoh / OH, oh = ncoh % OH; + int64_t n = nc / C, c = nc % C; if(n >= N) return; FLOAT_ACCUM m = 0; - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int32_t w = ow * sw - pw + s; + int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; + // int64_t input_idx = ((n * C + c) * H + h) * W + w; m += CVT_FLOAT2ACCUM( 
input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); } } - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -121,20 +121,20 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + int64_t R, + int64_t S, + int64_t sh, + int64_t sw, + int64_t ph, + int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<4> input_tv, tensor_view_t<4> output_tv) { @@ -161,67 +161,67 @@ extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolForward3d(const TI* __restrict__ input, TO* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + int64_t KD, + int64_t R, + int64_t S, + int64_t sd, + int64_t sh, + int64_t sw, + int64_t pd, + int64_t ph, 
+ int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<5> input_tv, tensor_view_t<5> output_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; + int64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int64_t ncodoh = gid / OW, ow = gid % OW; + int64_t ncod = ncodoh / OH, oh = ncodoh % OH; + int64_t nc = ncod / OD, od = ncod % OD; + int64_t n = nc / C, c = nc % C; if(n >= N) return; FLOAT_ACCUM sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; + int64_t d = od * sd - pd + kd; if(d < 0 || d >= D) continue; - int32_t h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int32_t w = ow * sw - pw + s; + int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; + // int64_t input_idx = ((n * C + c) * H + h) * W + w; sum += CVT_FLOAT2ACCUM( input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); } } } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); + + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); 
dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); @@ -229,7 +229,7 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -252,25 +252,25 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, OUTPUT_TYPE* __restrict__ output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + int64_t KD, + int64_t R, + int64_t S, + int64_t sd, + int64_t sh, + int64_t sw, + int64_t pd, + int64_t ph, + int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<5> input_tv, tensor_view_t<5> output_tv) { @@ -302,62 +302,62 @@ extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input template __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + int64_t R, + int64_t S, + int64_t sh, + int64_t sw, + int64_t ph, + int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<4> output_grad_tv, tensor_view_t<4> input_grad_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; + int64_t gid = threadIdx.x + blockIdx.x * blockDim.x; 
+ int64_t nch = gid / W, w = gid % W; + int64_t nc = nch / H, h = nch % H; + int64_t n = nc / C, c = nc % C; if(n >= N) return; FLOAT_ACCUM grad = 0; - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - int32_t ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int32_t oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int32_t owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - int32_t ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -385,20 +385,20 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - int32_t R, - int32_t S, - int32_t sh, - int32_t sw, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + int64_t R, + int64_t S, + int64_t sh, + int64_t sw, + int64_t ph, + int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<4> output_grad_tv, tensor_view_t<4> input_grad_tv) { @@ -425,80 +425,80 @@ extern "C" __global__ void 
AvgPoolBackward2d(const INPUT_TYPE* __restrict__ outp template __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, TO* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + int64_t KD, + int64_t R, + int64_t S, + int64_t sd, + int64_t sh, + int64_t sw, + int64_t pd, + int64_t ph, + int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<5> output_grad_tv, tensor_view_t<5> input_grad_tv) { - int32_t gid = threadIdx.x + blockIdx.x * blockDim.x; - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; + int64_t gid = threadIdx.x + blockIdx.x * blockDim.x; + int64_t ncdh = gid / W, w = gid % W; + int64_t ncd = ncdh / H, h = ncdh % H; + int64_t nc = ncd / D, d = ncd % D; + int64_t n = nc / C, c = nc % C; if(n >= N) return; FLOAT_ACCUM grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - int32_t odsd = d + pd - kd; + int64_t odsd = d + pd - kd; if(odsd % sd != 0) continue; - int32_t od = odsd / sd; + int64_t od = odsd / sd; if(od < 0 || od >= OD) continue; - int32_t ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int32_t oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int32_t owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - int32_t ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int32_t dstart = od * sd - pd; - int32_t hstart = oh 
* sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); dend = min(dend, D); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -526,25 +526,25 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, OUTPUT_TYPE* __restrict__ input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - int32_t KD, - int32_t R, - int32_t S, - int32_t sd, - int32_t sh, - int32_t sw, - int32_t pd, - int32_t ph, - int32_t pw, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + int64_t KD, + int64_t R, + int64_t S, + int64_t sd, + int64_t sh, + int64_t sw, + int64_t pd, + int64_t ph, + int64_t pw, bool count_include_pad, - int32_t divisor_override, + int64_t divisor_override, tensor_view_t<5> output_grad_tv, tensor_view_t<5> input_grad_tv) { diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index 5b91033633..069130ec3a 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -32,17 +32,17 @@ template void cpu_avgpool_forward_2d(tensor input, tensor& output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t 
N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int32_t divisor_override) + int64_t divisor_override) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -50,52 +50,52 @@ void cpu_avgpool_forward_2d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - for(int32_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; + int64_t ncoh = gid / OW, ow = gid % OW; + int64_t nc = ncoh / OH, oh = ncoh % OH; + int64_t n = nc / C, c = nc % C; + int64_t R = ksize[0]; + int64_t S = ksize[1]; + int64_t sh = stride[0]; + int64_t sw = stride[1]; + int64_t ph = padding[0]; + int64_t pw = padding[1]; if(n >= N) return; float m = 0; - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int32_t w = ow * sw - pw + s; + int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; + // int64_t input_idx = ((n * C + c) * H + h) * W + w; m += static_cast( input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); } } - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const 
int32_t pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -121,19 +121,19 @@ void cpu_avgpool_forward_2d(tensor input, template void cpu_avgpool_forward_3d(tensor input, tensor& output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int32_t divisor_override) + int64_t divisor_override) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -141,55 +141,55 @@ void cpu_avgpool_forward_3d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - for(int32_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; + int64_t ncodoh = gid / OW, ow = gid % OW; + int64_t ncod = ncodoh / OH, oh = ncodoh % OH; + int64_t nc = ncod / OD, od = ncod % OD; + int64_t n = nc / C, c = nc % C; + int64_t KD = ksize[0]; + int64_t R = ksize[1]; + int64_t S = ksize[2]; + int64_t sd = stride[0]; + int64_t sh = stride[1]; + int64_t sw = stride[2]; + int64_t pd = padding[0]; + int64_t ph = padding[1]; + int64_t pw = 
padding[2]; if(n >= N) return; float sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; + int64_t d = od * sd - pd + kd; if(d < 0 || d >= D) continue; - int32_t h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int32_t w = ow * sw - pw + s; + int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; + // int64_t input_idx = ((n * C + c) * H + h) * W + w; sum += static_cast( input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); } } } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); + + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); @@ -197,7 +197,7 @@ void cpu_avgpool_forward_3d(tensor input, hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -222,17 +222,17 @@ void cpu_avgpool_forward_3d(tensor input, template void cpu_avgpool_backward_2d(tensor output_grad, tensor& input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t N, + int64_t C, + int64_t H, 
+ int64_t W, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int32_t divisor_override) + int64_t divisor_override) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -240,52 +240,52 @@ void cpu_avgpool_backward_2d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; + int64_t nch = gid / W, w = gid % W; + int64_t nc = nch / H, h = nch % H; + int64_t n = nc / C, c = nc % C; + int64_t R = ksize[0]; + int64_t S = ksize[1]; + int64_t sh = stride[0]; + int64_t sw = stride[1]; + int64_t ph = padding[0]; + int64_t pw = padding[1]; if(n >= N) return; float grad = 0; - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - int32_t ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int32_t oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int32_t owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - int32_t ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = 
(hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -315,19 +315,19 @@ void cpu_avgpool_backward_2d(tensor output_grad, template void cpu_avgpool_backward_3d(tensor output_grad, tensor& input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int32_t divisor_override) + int64_t divisor_override) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -335,68 +335,68 @@ void cpu_avgpool_backward_3d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - for(size_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; + int64_t ncdh = gid / W, w = gid % W; + int64_t ncd = ncdh / H, h = ncdh % H; + int64_t nc = ncd / D, d = ncd % D; + int64_t n = nc / C, c = nc % C; + int64_t KD = ksize[0]; + int64_t R = ksize[1]; + int64_t S = ksize[2]; + int64_t sd = stride[0]; + int64_t sh = stride[1]; + int64_t sw = stride[2]; + int64_t pd = padding[0]; + int64_t ph = padding[1]; + int64_t pw = padding[2]; if(n >= N) return; float grad = 0; - for(int32_t kd = 0; kd < 
KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - int32_t odsd = d + pd - kd; + int64_t odsd = d + pd - kd; if(odsd % sd != 0) continue; - int32_t od = odsd / sd; + int64_t od = odsd / sd; if(od < 0 || od >= OD) continue; - int32_t ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int32_t oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int32_t owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - int32_t ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); dend = min(dend, D); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 94898d32b6..bb6019354a 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -51,13 +51,13 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) struct AvgPoolTestCase { - std::vector input_dims; - std::vector kernel_size; - std::vector stride; - std::vector padding; + std::vector input_dims; + std::vector kernel_size; + 
std::vector stride; + std::vector padding; bool ceil_mode; bool count_include_pad; - int32_t divisor_override; + int64_t divisor_override; friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) { @@ -67,7 +67,7 @@ struct AvgPoolTestCase << " divisor_override:" << tc.divisor_override; } - std::vector GetInput() const { return input_dims; } + std::vector GetInput() const { return input_dims; } }; inline std::vector AvgPoolTestConfigsFwdFp32() @@ -132,11 +132,11 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam D = in_dim.size() == 5 ? in_dim[2] : 1; H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; - ksize = tensor{in_dim.size() - 2}; + ksize = tensor{in_dim.size() - 2}; ksize.data = avgpool_config.kernel_size; - stride = tensor{in_dim.size() - 2}; + stride = tensor{in_dim.size() - 2}; stride.data = avgpool_config.stride; - padding = tensor{in_dim.size() - 2}; + padding = tensor{in_dim.size() - 2}; padding.data = avgpool_config.padding; ceil_mode = avgpool_config.ceil_mode; count_include_pad = avgpool_config.count_include_pad; @@ -147,7 +147,7 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam }; input = tensor{in_dim}.generate(gen_input_value); - std::vector out_dim; + std::vector out_dim; if(in_dim.size() == 5) { if(ceil_mode) @@ -268,14 +268,14 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam tensor input; tensor output; tensor ref_output; - tensor ksize; - tensor stride; - tensor padding; + tensor ksize; + tensor stride; + tensor padding; bool ceil_mode; bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; + int64_t divisor_override; + int64_t N, C, D, H, W, OD, OH, OW; miopen::Allocator::ManageDataPtr input_dev; miopen::Allocator::ManageDataPtr output_dev; @@ -299,17 +299,17 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam D = in_grad_dim.size() == 5 ? in_grad_dim[2] : 1; H = in_grad_dim.size() == 5 ? 
in_grad_dim[3] : in_grad_dim[2]; W = in_grad_dim.size() == 5 ? in_grad_dim[4] : in_grad_dim[3]; - ksize = tensor{in_grad_dim.size() - 2}; + ksize = tensor{in_grad_dim.size() - 2}; ksize.data = avgpool_config.kernel_size; - stride = tensor{in_grad_dim.size() - 2}; + stride = tensor{in_grad_dim.size() - 2}; stride.data = avgpool_config.stride; - padding = tensor{in_grad_dim.size() - 2}; + padding = tensor{in_grad_dim.size() - 2}; padding.data = avgpool_config.padding; ceil_mode = avgpool_config.ceil_mode; count_include_pad = avgpool_config.count_include_pad; divisor_override = avgpool_config.divisor_override; - std::vector out_grad_dim; + std::vector out_grad_dim; if(in_grad_dim.size() == 5) { if(ceil_mode) @@ -434,14 +434,14 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam tensor output_grad; tensor input_grad; tensor ref_input_grad; - tensor ksize; - tensor stride; - tensor padding; + tensor ksize; + tensor stride; + tensor padding; bool ceil_mode; bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; + int64_t divisor_override; + int64_t N, C, D, H, W, OD, OH, OW; miopen::Allocator::ManageDataPtr output_grad_dev; miopen::Allocator::ManageDataPtr input_grad_dev; From beba4df66ca396cee31ed12f36ab12bf278a9226 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 15:45:26 +0700 Subject: [PATCH 11/29] fix tensor_view and add noncont test --- driver/avgpool_driver.hpp | 50 ++- src/include/miopen/avgpool/invoke_params.hpp | 40 +-- src/kernels/MIOpenAvgPool.cpp | 29 +- src/kernels/tensor_view.hpp | 46 +-- src/solver/avgpool/forward_avgpool_3d.cpp | 4 - test/cpu_avgpool.hpp | 355 +++++++++---------- test/gtest/avgpool.hpp | 212 ++++++----- 7 files changed, 370 insertions(+), 366 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index ff7d04edd5..9563be6718 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -56,6 +56,7 @@ class AvgPoolDriver : public Driver data_type = 
miopen_type{}; } + std::vector ComputeStrides(std::vector input); int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } @@ -103,22 +104,24 @@ class AvgPoolDriver : public Driver std::vector input_grad; std::vector input_grad_host; std::vector output_grad; - std::vector ksize; - std::vector stride; - std::vector padding; + std::vector ksize; + std::vector stride; + std::vector padding; bool ceil_mode; bool count_include_pad; - int32_t divisor_override; - int32_t N, C, D, H, W, OD, OH, OW; + int64_t divisor_override; + int64_t N, C, D, H, W, OD, OH, OW; std::vector in_dim; + bool isContiguous; }; template int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); + isContiguous = inflags.GetValueInt("is-contiguous") == 1 ? true : false; if(inflags.GetValueInt("time") == 1) { @@ -160,11 +163,12 @@ std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char template int AvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - int ksp_dim = in_dim.size() - 2; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector in_stride = ComputeStrides(in_dim); + int ksp_dim = in_dim.size() - 2; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); if(ksize.size() != ksp_dim) { @@ -195,7 +199,7 @@ int AvgPoolDriver::GetandSetData() H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; W = in_dim.size() == 5 ? 
 in_dim[4] : in_dim[3]; - std::vector out_dim; + std::vector out_dim; if(in_dim.size() == 5) { if(ceil_mode) @@ -210,7 +214,7 @@ int AvgPoolDriver::GetandSetData() OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; } - out_dim = std::vector{N, C, OD, OH, OW}; + out_dim = std::vector{N, C, OD, OH, OW}; } else { @@ -224,16 +228,32 @@ int AvgPoolDriver::GetandSetData() OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; } - out_dim = std::vector{N, C, OH, OW}; + out_dim = std::vector{N, C, OH, OW}; } - SetTensorNd(inputDesc, in_dim, data_type); + std::vector out_grad_stride = ComputeStrides(out_dim); + SetTensorNd(inputDesc, in_dim, in_stride, data_type); SetTensorNd(outputDesc, out_dim, data_type); - SetTensorNd(outputGradDesc, out_dim, data_type); + SetTensorNd(outputGradDesc, out_dim, out_grad_stride, data_type); SetTensorNd(inputGradDesc, in_dim, data_type); return miopenStatusSuccess; } +// Equivalent to: tensor.transpose(0, -1).contiguous().transpose(0, -1) in case contiguous = False +template +std::vector AvgPoolDriver::ComputeStrides(std::vector inputDim) +{ + if(!isContiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!isContiguous) + std::swap(strides.front(), strides.back()); + return strides; +} + template int AvgPoolDriver::AddCmdLineArgs() { diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index e8bd9256ac..d312339c5e 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -45,17 +45,17 @@ struct FwdInvokeParams : public miopen::InvokeParams Data_t output = nullptr; ConstData_t
ksize = nullptr; - int32_t KD = 0; - int32_t KH = 0; - int32_t KW = 0; - int32_t SD = 0; - int32_t SH = 0; - int32_t SW = 0; - int32_t PD = 0; - int32_t PH = 0; - int32_t PW = 0; + int64_t KD = 0; + int64_t KH = 0; + int64_t KW = 0; + int64_t SD = 0; + int64_t SH = 0; + int64_t SW = 0; + int64_t PD = 0; + int64_t PH = 0; + int64_t PW = 0; bool count_include_pad = false; - int32_t divisor_override = 0; + int64_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } @@ -73,17 +73,17 @@ struct BwdInvokeParams : public miopen::InvokeParams Data_t input_grad = nullptr; ConstData_t ksize = nullptr; - int32_t KD = 0; - int32_t KH = 0; - int32_t KW = 0; - int32_t SD = 0; - int32_t SH = 0; - int32_t SW = 0; - int32_t PD = 0; - int32_t PH = 0; - int32_t PW = 0; + int64_t KD = 0; + int64_t KH = 0; + int64_t KW = 0; + int64_t SD = 0; + int64_t SH = 0; + int64_t SW = 0; + int64_t PD = 0; + int64_t PH = 0; + int64_t PW = 0; bool count_include_pad = false; - int32_t divisor_override = 0; + int64_t divisor_override = 0; std::size_t GetWorkspaceSize() const { return 0; } Data_t GetWorkspace() const { return nullptr; } diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 5fe015edc1..98299d7cdb 100644 --- a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -81,8 +81,7 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, if(w < 0 || w >= W) continue; // int64_t input_idx = ((n * C + c) * H + h) * W + w; - m += CVT_FLOAT2ACCUM( - input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + m += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, h, w})]); } } @@ -116,7 +115,7 @@ __device__ void avgPoolForward2d(const TI* __restrict__ input, } FLOAT_ACCUM val = m / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = CVT_ACCUM2FLOAT(val); + output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = 
CVT_ACCUM2FLOAT(val); } extern "C" __global__ void AvgPoolForward2d(const INPUT_TYPE* __restrict__ input, @@ -209,8 +208,7 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, if(w < 0 || w >= W) continue; // int64_t input_idx = ((n * C + c) * H + h) * W + w; - sum += CVT_FLOAT2ACCUM( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + sum += CVT_FLOAT2ACCUM(input[input_tv.get_tensor_view_idx({n, c, d, h, w})]); } } } @@ -245,9 +243,8 @@ __device__ void avgPoolForward3d(const TI* __restrict__ input, divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); } } - FLOAT_ACCUM val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - CVT_ACCUM2FLOAT(val); + FLOAT_ACCUM val = sum / divide_factor; + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = CVT_ACCUM2FLOAT(val); } extern "C" __global__ void AvgPoolForward3d(const INPUT_TYPE* __restrict__ input, @@ -374,13 +371,12 @@ __device__ void avgPoolBackward2d(const TI* __restrict__ output_grad, } } - grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / - divide_factor; + grad += + CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx({n, c, oh, ow})]) / + divide_factor; } } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - CVT_ACCUM2FLOAT(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = CVT_ACCUM2FLOAT(grad); } extern "C" __global__ void AvgPoolBackward2d(const INPUT_TYPE* __restrict__ output_grad, @@ -514,14 +510,13 @@ __device__ void avgPoolBackward3d(const TI* __restrict__ output_grad, divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); } } - grad += CVT_FLOAT2ACCUM(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / + grad += CVT_FLOAT2ACCUM( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, od, oh, ow})]) / divide_factor; } } 
} - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - CVT_ACCUM2FLOAT(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = CVT_ACCUM2FLOAT(grad); } extern "C" __global__ void AvgPoolBackward3d(const INPUT_TYPE* __restrict__ output_grad, diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index d64dbf21f9..12394dbde6 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -27,6 +27,8 @@ #ifndef GUARD_TENSOR_VIEW_HPP #define GUARD_TENSOR_VIEW_HPP +#include + template struct tensor_layout_t; @@ -47,7 +49,6 @@ struct tensor_view_t uint64_t stride[N]; uint64_t size[N]; }; - template struct tensor_layout_t { @@ -72,44 +73,13 @@ struct tensor_layout_t } } - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t d, uint64_t h, uint64_t w) - { - static_assert(N == 5); - layout[0] = n; - layout[1] = c; - layout[2] = d; - layout[3] = h; - layout[4] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t c, uint64_t h, uint64_t w) - { - static_assert(N == 4); - layout[0] = n; - layout[1] = c; - layout[2] = h; - layout[3] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t h, uint64_t w) - { - static_assert(N == 3); - layout[0] = n; - layout[1] = h; - layout[2] = w; - } - - constexpr tensor_layout_t(uint64_t n, uint64_t w) - { - static_assert(N == 2); - layout[0] = n; - layout[1] = w; - } - - constexpr tensor_layout_t(uint64_t n) + constexpr tensor_layout_t(std::initializer_list layout_) { - static_assert(N == 1); - layout[0] = n; + static_assert(N > 0); + for(auto i = 0; i < N; ++i) + { + layout[i] = layout_.begin()[i]; + } } uint64_t layout[N]; diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 6f70a07419..ce2e05305c 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -52,10 +52,6 @@ bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) 
auto N = problem.GetOutputDesc().GetLengths()[0]; auto in_over_out = static_cast(in_nelems) / out_nelems; - std::cout << "in_over_out: " << in_over_out << std::endl; - std::cout << "in_nelems: " << in_nelems << std::endl; - std::cout << "out_nelems: " << out_nelems << std::endl; - if(dtype == miopenFloat) { if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4)) diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index 069130ec3a..26965df52c 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -32,17 +32,17 @@ template void cpu_avgpool_forward_2d(tensor input, tensor& output, - int64_t N, - int64_t C, - int64_t H, - int64_t W, - int64_t OH, - int64_t OW, - tensor ksize, - tensor stride, - tensor padding, + long N, + long C, + long H, + long W, + long OH, + long OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int64_t divisor_override) + long divisor_override) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -50,52 +50,51 @@ void cpu_avgpool_forward_2d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - for(int64_t gid = 0; gid < numel; gid++) + for(long gid = 0; gid < numel; gid++) { - int64_t ncoh = gid / OW, ow = gid % OW; - int64_t nc = ncoh / OH, oh = ncoh % OH; - int64_t n = nc / C, c = nc % C; - int64_t R = ksize[0]; - int64_t S = ksize[1]; - int64_t sh = stride[0]; - int64_t sw = stride[1]; - int64_t ph = padding[0]; - int64_t pw = padding[1]; + long ncoh = gid / OW, ow = gid % OW; + long nc = ncoh / OH, oh = ncoh % OH; + long n = nc / C, c = nc % C; + long R = ksize[0]; + long S = ksize[1]; + long sh = stride[0]; + long sw = stride[1]; + long ph = padding[0]; + long pw = padding[1]; if(n >= N) return; float m = 0; - for(int64_t r = 0; r < R; ++r) + for(long r = 0; r < R; ++r) { - for(int64_t s = 0; s < S; ++s) + for(long s = 0; s < S; ++s) { // input idx : (n, 
c, h, w) - int64_t h = oh * sh - ph + r; + long h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int64_t w = ow * sw - pw + s; + long w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int64_t input_idx = ((n * C + c) * H + h) * W + w; - m += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + // long input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast(input[input_tv.get_tensor_view_idx({n, c, h, w})]); } } - int64_t hstart = oh * sh - ph; - int64_t wstart = ow * sw - pw; - int64_t hend = min(hstart + R, H + ph); - int64_t wend = min(wstart + S, W + pw); + long hstart = oh * sh - ph; + long wstart = ow * sw - pw; + long hend = min(hstart + R, H + ph); + long wend = min(wstart + S, W + pw); - const int64_t pool_size = (hend - hstart) * (wend - wstart); + const long pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int64_t divide_factor; + long divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -113,27 +112,26 @@ void cpu_avgpool_forward_2d(tensor input, } float val = m / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = - static_cast(val); + output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(val); } } template void cpu_avgpool_forward_3d(tensor input, tensor& output, - int64_t N, - int64_t C, - int64_t D, - int64_t H, - int64_t W, - int64_t OD, - int64_t OH, - int64_t OW, - tensor ksize, - tensor stride, - tensor padding, + long N, + long C, + long D, + long H, + long W, + long OD, + long OH, + long OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int64_t divisor_override) + long divisor_override) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -141,63 +139,62 @@ void cpu_avgpool_forward_3d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); 
auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - for(int64_t gid = 0; gid < numel; gid++) + for(long gid = 0; gid < numel; gid++) { - int64_t ncodoh = gid / OW, ow = gid % OW; - int64_t ncod = ncodoh / OH, oh = ncodoh % OH; - int64_t nc = ncod / OD, od = ncod % OD; - int64_t n = nc / C, c = nc % C; - int64_t KD = ksize[0]; - int64_t R = ksize[1]; - int64_t S = ksize[2]; - int64_t sd = stride[0]; - int64_t sh = stride[1]; - int64_t sw = stride[2]; - int64_t pd = padding[0]; - int64_t ph = padding[1]; - int64_t pw = padding[2]; + long ncodoh = gid / OW, ow = gid % OW; + long ncod = ncodoh / OH, oh = ncodoh % OH; + long nc = ncod / OD, od = ncod % OD; + long n = nc / C, c = nc % C; + long KD = ksize[0]; + long R = ksize[1]; + long S = ksize[2]; + long sd = stride[0]; + long sh = stride[1]; + long sw = stride[2]; + long pd = padding[0]; + long ph = padding[1]; + long pw = padding[2]; if(n >= N) return; float sum = 0; - for(int64_t kd = 0; kd < KD; ++kd) + for(long kd = 0; kd < KD; ++kd) { - for(int64_t r = 0; r < R; ++r) + for(long r = 0; r < R; ++r) { - for(int64_t s = 0; s < S; ++s) + for(long s = 0; s < S; ++s) { // input idx : (n, c, d, h, w) - int64_t d = od * sd - pd + kd; + long d = od * sd - pd + kd; if(d < 0 || d >= D) continue; - int64_t h = oh * sh - ph + r; + long h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int64_t w = ow * sw - pw + s; + long w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int64_t input_idx = ((n * C + c) * H + h) * W + w; - sum += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + // long input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, d, h, w})]); } } } - int64_t dstart = od * sd - pd; - int64_t hstart = oh * sh - ph; - int64_t wstart = ow * sw - pw; - int64_t dend = min(dstart + KD, D + pd); - int64_t hend = min(hstart + R, H + ph); - int64_t wend = min(wstart + S, W + pw); - - const int64_t pool_size = 
(dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - int64_t divide_factor; + long dstart = od * sd - pd; + long hstart = oh * sh - ph; + long wstart = ow * sw - pw; + long dend = min(dstart + KD, D + pd); + long hend = min(hstart + R, H + ph); + long wend = min(wstart + S, W + pw); + + const long pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + long divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -213,26 +210,25 @@ void cpu_avgpool_forward_3d(tensor input, divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); } } - float val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - static_cast(val); + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(val); } } template void cpu_avgpool_backward_2d(tensor output_grad, tensor& input_grad, - int64_t N, - int64_t C, - int64_t H, - int64_t W, - int64_t OH, - int64_t OW, - tensor ksize, - tensor stride, - tensor padding, + long N, + long C, + long H, + long W, + long OH, + long OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int64_t divisor_override) + long divisor_override) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -240,52 +236,52 @@ void cpu_avgpool_backward_2d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - for(int64_t gid = 0; gid < numel; gid++) + for(long gid = 0; gid < numel; gid++) { - int64_t nch = gid / W, w = gid % W; - int64_t nc = nch / H, 
h = nch % H; - int64_t n = nc / C, c = nc % C; - int64_t R = ksize[0]; - int64_t S = ksize[1]; - int64_t sh = stride[0]; - int64_t sw = stride[1]; - int64_t ph = padding[0]; - int64_t pw = padding[1]; + long nch = gid / W, w = gid % W; + long nc = nch / H, h = nch % H; + long n = nc / C, c = nc % C; + long R = ksize[0]; + long S = ksize[1]; + long sh = stride[0]; + long sw = stride[1]; + long ph = padding[0]; + long pw = padding[1]; if(n >= N) return; float grad = 0; - for(int64_t r = 0; r < R; ++r) + for(long r = 0; r < R; ++r) { - for(int64_t s = 0; s < S; ++s) + for(long s = 0; s < S; ++s) { - int64_t ohsh = h + ph - r; + long ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int64_t oh = ohsh / sh; + long oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int64_t owsw = w + pw - s; + long owsw = w + pw - s; if(owsw % sw != 0) continue; - int64_t ow = owsw / sw; + long ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int64_t hstart = oh * sh - ph; - int64_t wstart = ow * sw - pw; - int64_t hend = min(hstart + R, H + ph); - int64_t wend = min(wstart + S, W + pw); + long hstart = oh * sh - ph; + long wstart = ow * sw - pw; + long hend = min(hstart + R, H + ph); + long wend = min(wstart + S, W + pw); - const int64_t pool_size = (hend - hstart) * (wend - wstart); + const long pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int64_t divide_factor; + long divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -302,32 +298,31 @@ void cpu_avgpool_backward_2d(tensor output_grad, } } - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, oh, ow})]) / divide_factor; } } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - static_cast(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, 
h, w})] = static_cast(grad); } } template void cpu_avgpool_backward_3d(tensor output_grad, tensor& input_grad, - int64_t N, - int64_t C, - int64_t D, - int64_t H, - int64_t W, - int64_t OD, - int64_t OH, - int64_t OW, - tensor ksize, - tensor stride, - tensor padding, + long N, + long C, + long D, + long H, + long W, + long OD, + long OH, + long OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - int64_t divisor_override) + long divisor_override) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -335,68 +330,68 @@ void cpu_avgpool_backward_3d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - for(int64_t gid = 0; gid < numel; gid++) + for(long gid = 0; gid < numel; gid++) { - int64_t ncdh = gid / W, w = gid % W; - int64_t ncd = ncdh / H, h = ncdh % H; - int64_t nc = ncd / D, d = ncd % D; - int64_t n = nc / C, c = nc % C; - int64_t KD = ksize[0]; - int64_t R = ksize[1]; - int64_t S = ksize[2]; - int64_t sd = stride[0]; - int64_t sh = stride[1]; - int64_t sw = stride[2]; - int64_t pd = padding[0]; - int64_t ph = padding[1]; - int64_t pw = padding[2]; + long ncdh = gid / W, w = gid % W; + long ncd = ncdh / H, h = ncdh % H; + long nc = ncd / D, d = ncd % D; + long n = nc / C, c = nc % C; + long KD = ksize[0]; + long R = ksize[1]; + long S = ksize[2]; + long sd = stride[0]; + long sh = stride[1]; + long sw = stride[2]; + long pd = padding[0]; + long ph = padding[1]; + long pw = padding[2]; if(n >= N) return; float grad = 0; - for(int64_t kd = 0; kd < KD; ++kd) + for(long kd = 0; kd < KD; ++kd) { - for(int64_t r = 0; r < R; ++r) + for(long r = 0; r < R; ++r) { - for(int64_t s = 0; s < S; ++s) + for(long s = 0; s < S; ++s) { - int64_t odsd = d + pd - kd; + long odsd = d + pd - kd; if(odsd % sd != 0) continue; - int64_t od = odsd / sd; + long od = odsd / sd; if(od < 0 || od >= OD) 
continue; - int64_t ohsh = h + ph - r; + long ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int64_t oh = ohsh / sh; + long oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int64_t owsw = w + pw - s; + long owsw = w + pw - s; if(owsw % sw != 0) continue; - int64_t ow = owsw / sw; + long ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int64_t dstart = od * sd - pd; - int64_t hstart = oh * sh - ph; - int64_t wstart = ow * sw - pw; - int64_t dend = min(dstart + KD, D + pd); - int64_t hend = min(hstart + R, H + ph); - int64_t wend = min(wstart + S, W + pw); - - const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - int64_t divide_factor; + long dstart = od * sd - pd; + long hstart = oh * sh - ph; + long wstart = ow * sw - pw; + long dend = min(dstart + KD, D + pd); + long hend = min(hstart + R, H + ph); + long wend = min(wstart + S, W + pw); + + const long pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + long divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -412,14 +407,14 @@ void cpu_avgpool_backward_3d(tensor output_grad, divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); } } - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; + grad += + static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, od, oh, ow})]) / + divide_factor; } } } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - static_cast(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); } } diff --git a/test/gtest/avgpool.hpp 
b/test/gtest/avgpool.hpp index bb6019354a..4a22541b19 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -23,13 +23,10 @@ * SOFTWARE. * *******************************************************************************/ -#include "../driver/tensor_driver.hpp" #include "cpu_avgpool.hpp" #include "get_handle.hpp" -#include "random.hpp" #include "tensor_holder.hpp" #include "verify.hpp" -#include #include #include #include @@ -51,69 +48,94 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) struct AvgPoolTestCase { - std::vector input_dims; + std::vector input_dims; std::vector kernel_size; std::vector stride; std::vector padding; bool ceil_mode; bool count_include_pad; int64_t divisor_override; + bool is_contiguous = true; friend std::ostream& operator<<(std::ostream& os, const AvgPoolTestCase& tc) { return os << " input_dims:" << tc.input_dims << " kernel_size:" << tc.kernel_size << " stride:" << tc.stride << " padding:" << tc.padding << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad - << " divisor_override:" << tc.divisor_override; + << " divisor_override:" << tc.divisor_override + << "is_contiguous:" << tc.is_contiguous; } - std::vector GetInput() const { return input_dims; } + std::vector GetInput() const { return input_dims; } + std::vector ComputeStrides(std::vector inputDim) const + { + if(!is_contiguous) + std::swap(inputDim.front(), inputDim.back()); + std::vector strides(inputDim.size()); + strides.back() = 1; + for(int i = inputDim.size() - 2; i >= 0; --i) + strides[i] = strides[i + 1] * inputDim[i + 1]; + if(!is_contiguous) + std::swap(strides.front(), strides.back()); + return strides; + } }; inline std::vector AvgPoolTestConfigsFwdFp32() { return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, true}, + {{64, 768, 17, 17}, {5, 5}, 
{1, 1}, {1, 1}, false, false, 0, false}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, }; } inline std::vector AvgPoolTestConfigsFwdFp16() { return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, true}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, false}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, }; } inline std::vector AvgPoolTestConfigsFwdBfp16() { return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, true}, + {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, false}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, }; } inline std::vector AvgPoolTestConfigsBwdFp32() { return { - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, + {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, }; } inline std::vector AvgPoolTestConfigsBwdFp16() { return { - {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0}, + {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, true}, + {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, false}, + {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0, true}, + {{6, 288, 35, 35, 35}, 
{3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0, false}, }; } inline std::vector AvgPoolTestConfigsBwdBfp16() { return { - {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0}, - {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0}, + {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, true}, + {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, false}, + {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, + {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, }; } @@ -124,20 +146,33 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam protected: void SetUp() override { - auto&& handle = get_handle(); - avgpool_config = GetParam(); - auto in_dim = avgpool_config.GetInput(); - N = in_dim[0]; - C = in_dim[1]; - D = in_dim.size() == 5 ? in_dim[2] : 1; - H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; - W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; - ksize = tensor{in_dim.size() - 2}; - ksize.data = avgpool_config.kernel_size; - stride = tensor{in_dim.size() - 2}; - stride.data = avgpool_config.stride; - padding = tensor{in_dim.size() - 2}; - padding.data = avgpool_config.padding; + auto&& handle = get_handle(); + avgpool_config = GetParam(); + std::vector in_dim = avgpool_config.GetInput(); + std::vector in_strides = avgpool_config.ComputeStrides(in_dim); + + N = in_dim[0]; + C = in_dim[1]; + D = in_dim.size() == 5 ? in_dim[2] : 1; + H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; + W = in_dim.size() == 5 ? 
in_dim[4] : in_dim[3]; + ksize = tensor{in_dim.size() - 2}; + ksize.data = avgpool_config.kernel_size; + stride = tensor{in_dim.size() - 2}; + stride.data = avgpool_config.stride; + padding = tensor{in_dim.size() - 2}; + padding.data = avgpool_config.padding; + + ksize_long = tensor{in_dim.size() - 2}; + stride_long = tensor{in_dim.size() - 2}; + padding_long = tensor{in_dim.size() - 2}; + for(int i = 0; i < in_dim.size() - 2; i++) + { + ksize_long.data[i] = static_cast(ksize.data[i]); + stride_long.data[i] = static_cast(stride.data[i]); + padding_long.data[i] = static_cast(padding.data[i]); + } + ceil_mode = avgpool_config.ceil_mode; count_include_pad = avgpool_config.count_include_pad; divisor_override = avgpool_config.divisor_override; @@ -145,9 +180,9 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam auto gen_input_value = [](auto...) { return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); }; - input = tensor{in_dim}.generate(gen_input_value); + input = tensor{in_dim, in_strides}.generate(gen_input_value); - std::vector out_dim; + std::vector out_dim; if(in_dim.size() == 5) { if(ceil_mode) @@ -185,11 +220,8 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam ref_output = tensor{out_dim}; std::fill(ref_output.begin(), ref_output.end(), std::numeric_limits::quiet_NaN()); - input_dev = handle.Write(input.data); - output_dev = handle.Write(output.data); - ksize_dev = handle.Write(ksize.data); - stride_dev = handle.Write(stride.data); - padding_dev = handle.Write(padding.data); + input_dev = handle.Write(input.data); + output_dev = handle.Write(output.data); } void RunTest() @@ -202,15 +234,15 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam { cpu_avgpool_forward_2d(input, ref_output, - N, - C, - H, - W, - OH, - OW, - ksize, - stride, - padding, + static_cast(N), + static_cast(C), + static_cast(H), + static_cast(W), + static_cast(OH), + static_cast(OW), + ksize_long, + stride_long, + padding_long, count_include_pad, 
divisor_override); } @@ -218,17 +250,17 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam { cpu_avgpool_forward_3d(input, ref_output, - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize, - stride, - padding, + static_cast(N), + static_cast(C), + static_cast(D), + static_cast(H), + static_cast(W), + static_cast(OD), + static_cast(OH), + static_cast(OW), + ksize_long, + stride_long, + padding_long, count_include_pad, divisor_override); } @@ -237,13 +269,13 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam input_dev.get(), output.desc, output_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[0] : 1, ksize.GetSize() == 3 ? ksize[1] : ksize[0], ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[0] : 1, stride.GetSize() == 3 ? stride[1] : stride[0], stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? padding[0] : 0, + padding.GetSize() == 3 ? padding[0] : 1, padding.GetSize() == 3 ? padding[1] : padding[0], padding.GetSize() == 3 ? 
padding[2] : padding[1], count_include_pad, @@ -269,19 +301,19 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam tensor output; tensor ref_output; tensor ksize; + tensor ksize_long; tensor stride; + tensor stride_long; tensor padding; + tensor padding_long; bool ceil_mode; bool count_include_pad; int64_t divisor_override; - int64_t N, C, D, H, W, OD, OH, OW; + int64_t N = 1, C = 1, D = 1, H = 1, W = 1, OD = 1, OH = 1, OW = 1; miopen::Allocator::ManageDataPtr input_dev; miopen::Allocator::ManageDataPtr output_dev; - miopen::Allocator::ManageDataPtr ksize_dev; - miopen::Allocator::ManageDataPtr stride_dev; - miopen::Allocator::ManageDataPtr padding_dev; }; // BACKWARD TEST @@ -309,7 +341,7 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam count_include_pad = avgpool_config.count_include_pad; divisor_override = avgpool_config.divisor_override; - std::vector out_grad_dim; + std::vector out_grad_dim; if(in_grad_dim.size() == 5) { if(ceil_mode) @@ -340,10 +372,12 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam } out_grad_dim = {N, C, OH, OW}; } + auto gen_output_grad_value = [](auto...) 
{ return prng::gen_A_to_B(static_cast(-10.0f), static_cast(10.0f)); }; - output_grad = tensor{out_grad_dim}.generate(gen_output_grad_value); + auto out_grad_strides = avgpool_config.ComputeStrides(out_grad_dim); + output_grad = tensor{out_grad_dim, out_grad_strides}.generate(gen_output_grad_value); input_grad = tensor{in_grad_dim}; std::fill(input_grad.begin(), input_grad.end(), std::numeric_limits::quiet_NaN()); @@ -354,9 +388,6 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam output_grad_dev = handle.Write(output_grad.data); input_grad_dev = handle.Write(input_grad.data); - ksize_dev = handle.Write(ksize.data); - stride_dev = handle.Write(stride.data); - padding_dev = handle.Write(padding.data); } void RunTest() @@ -370,48 +401,48 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam { cpu_avgpool_backward_2d(output_grad, ref_input_grad, - N, - C, - H, - W, - OH, - OW, + static_cast(N), + static_cast(C), + static_cast(H), + static_cast(W), + static_cast(OH), + static_cast(OW), ksize, stride, padding, count_include_pad, - divisor_override); + static_cast(divisor_override)); } else if(dims == 5) { cpu_avgpool_backward_3d(output_grad, ref_input_grad, - N, - C, - D, - H, - W, - OD, - OH, - OW, + static_cast(N), + static_cast(C), + static_cast(D), + static_cast(H), + static_cast(W), + static_cast(OD), + static_cast(OH), + static_cast(OW), ksize, stride, padding, count_include_pad, - divisor_override); + static_cast(divisor_override)); } status = miopen::AvgPoolBackward(handle, output_grad.desc, output_grad_dev.get(), input_grad.desc, input_grad_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 0, + ksize.GetSize() == 3 ? ksize[0] : 1, ksize.GetSize() == 3 ? ksize[1] : ksize[0], ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 0, + stride.GetSize() == 3 ? stride[0] : 1, stride.GetSize() == 3 ? stride[1] : stride[0], stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? 
padding[0] : 0, + padding.GetSize() == 3 ? padding[0] : 1, padding.GetSize() == 3 ? padding[1] : padding[0], padding.GetSize() == 3 ? padding[2] : padding[1], count_include_pad, @@ -445,7 +476,4 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam miopen::Allocator::ManageDataPtr output_grad_dev; miopen::Allocator::ManageDataPtr input_grad_dev; - miopen::Allocator::ManageDataPtr ksize_dev; - miopen::Allocator::ManageDataPtr stride_dev; - miopen::Allocator::ManageDataPtr padding_dev; }; From ccd0b6fa0b65b301fd594e5a0d9e2e13978d3466 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 15:53:10 +0700 Subject: [PATCH 12/29] resolve conflict --- docs/reference/index.rst | 3 + driver/driver.hpp | 9 ++- include/miopen/miopen.h | 111 +++++++++++++++++++++++++++++++ src/include/miopen/solver_id.hpp | 2 + 4 files changed, 122 insertions(+), 3 deletions(-) diff --git a/docs/reference/index.rst b/docs/reference/index.rst index 9594e00ef0..8d57816fa0 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -35,4 +35,7 @@ The MIOpen API library is structured as follows: * :doc:`ReduceExtreme <../doxygen/html/group__ReduceExtreme>` (experimental) * :doc:`Getitem <../doxygen/html/group__getitem>` (experimental) * :doc:`ReduceCalculation <../doxygen/html/group__ReduceCalculation>` (experimental) + * :doc:`RotaryPositionalEmbeddings <../doxygen/html/group__RotaryPositionalEmbeddings>` (experimental) + * :doc:`ReLU <../doxygen/html/group___re_l_u>` (experimental) + * :doc:`GLU <../doxygen/html/group__glu>` (experimental) * :doc:`AvgPool <../doxygen/html/group__avgpool>` (experimental) diff --git a/driver/driver.hpp b/driver/driver.hpp index bd42f6ee13..5d2349523f 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -175,7 +175,8 @@ inline void PadBufferSize(size_t& sz, int datatype_sz) "groupnorm[bfp16|fp16], cat[bfp16|fp16], addlayernorm[bfp16|fp16], " "t5layernorm[bfp16|fp16], adam[fp16], ampadam, reduceextreme[bfp16|fp16], " "adamw[fp16], 
ampadamw, transformersadamw[fp16], transformersampadamw, " - "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], avgpool[bfp16|fp16]\n"); + "getitem[bfp16|fp16], reducecalculation[bfp16|fp16], rope[bfp16|fp16], " + "prelu[bfp16|fp16], glu[bfp16|fp16]\n, avgpool[bfp16|fp16]\n"); exit(0); // NOLINT (concurrency-mt-unsafe) } @@ -206,8 +207,10 @@ inline std::string ParseBaseArg(int argc, char* argv[]) arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && arg != "getitemfp16" && arg != "getitembfp16" && arg != "reducecalculation" && - arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "avgpool" && - arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") + arg != "reducecalculationfp16" && arg != "reducecalculationbfp16" && arg != "rope" && + arg != "ropefp16" && arg != "ropebfp16" && arg != "prelu" && arg != "prelufp16" && + arg != "prelubfp16" && arg != "glu" && arg != "glufp16" && arg != "glubfp16" && + arg != "avgpool" && arg != "avgpoolfp16" && arg != "avgpoolbfp16" && arg != "--version") { printf("FAILED: Invalid Base Input Argument\n"); Usage(); diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 2e8b988741..04156f33b0 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -70,6 +70,8 @@ * @defgroup SGD * @defgroup getitem * @defgroup ReduceCalculation + * @defgroup RotaryPositionalEmbeddings + * @defgroup ReLU * @defgroup avgpool * */ @@ -7622,6 +7624,115 @@ MIOPEN_EXPORT miopenStatus_t miopenGetitemBackward(miopenHandle_t handle, // CLOSEOUT GETITEM DOXYGEN GROUP #endif // MIOPEN_BETA_API +#ifdef MIOPEN_BETA_API +// RotaryPositionalEmbeddings APIs +/** @addtogroup RotaryPositionalEmbeddings + * + * @{ + */ +/*! 
@brief Execute a rope forward layer + * + * @param [in] handle MIOpen handle + * @param [in] xDesc Tensor descriptor for data input tensor x + * @param [in] x Data tensor x + * @param [in] cosDesc Tensor descriptor for data input tensor cos + * @param [in] cos Data tensor cos + * @param [in] sinDesc Tensor descriptor for data input tensor sin + * @param [in] sin Data tensor sin + * @param [in] yDesc Tensor descriptor for output data tensor y + * @param [out] y Data tensor y + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenRoPEForward(miopenHandle_t handle, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t cosDesc, + const void* cos, + const miopenTensorDescriptor_t sinDesc, + const void* sin, + const miopenTensorDescriptor_t yDesc, + void* y); + +/*! @brief Execute a rope backward layer + * + * @param [in] handle MIOpen handle + * @param [in] dyDesc Tensor descriptor for data input tensor dy + * @param [in] dy Data tensor dy + * @param [in] cosDesc Tensor descriptor for output data tensor cos + * @param [in] cos Data tensor cos + * @param [in] sinDesc Tensor descriptor for data input tensor sin + * @param [in] sin Data tensor sin + * @param [in] dxDesc Tensor descriptor for output data tensor dx + * @param [out] dx Data tensor dx + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t miopenRoPEBackward(miopenHandle_t handle, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t cosDesc, + const void* cos, + const miopenTensorDescriptor_t sinDesc, + const void* sin, + const miopenTensorDescriptor_t dxDesc, + void* dx); +/** @} */ +// CLOSEOUT ROPE DOXYGEN GROUP +#endif // MIOPEN_BETA_API + +#ifdef MIOPEN_BETA_API +/** @addtogroup ReLU + * + * @{ + */ + +/*! 
@brief Helper function to query the minimum workspace size required by the PReLU backward call + * + * @param handle MIOpen Handle (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param sizeInBytes Pointer to data to return the minimum workspace size + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenGetPReLUBackwardWorkspaceSize(miopenHandle_t handle, + miopenTensorDescriptor_t inputDesc, + miopenTensorDescriptor_t weightDesc, + size_t* sizeInBytes); + +/*! @brief Execute a PReLU backward layer + * + * @param handle MIOpen handle (input) + * @param workspace Address of the allocated workspace data (input) + * @param workspaceSizeInBytes Size in bytes of the allocated workspace data (input) + * @param inputDesc Tensor descriptor for input tensor (input) + * @param input Data tensor input (input) + * @param weightDesc Tensor descriptor for weight tensor (input) + * @param weight Data tensor weight (input) + * @param doutputDesc Tensor descriptor for output gradient (input) + * @param doutput Gradient of output (input) + * @param dinputDesc Tensor descriptor for input gradient (input) + * @param dinput Gradient of input (output) + * @param dweightDesc Tensor descriptor for weight gradient (input) + * @param dweight Gradient of weight (output) + */ +MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes, + miopenTensorDescriptor_t inputDesc, + const void* input, + miopenTensorDescriptor_t weightDesc, + const void* weight, + miopenTensorDescriptor_t doutputDesc, + const void* doutput, + miopenTensorDescriptor_t dinputDesc, + void* dinput, + miopenTensorDescriptor_t dweightDesc, + void* dweight); + +/** @} */ +// CLOSEOUT RELU DOXYGEN GROUP +#endif // MIOPEN_BETA_API + #ifdef MIOPEN_BETA_API // avgpool APIs /** @addtogroup avgpool diff --git a/src/include/miopen/solver_id.hpp 
b/src/include/miopen/solver_id.hpp index 194afd79ac..18c538a1db 100644 --- a/src/include/miopen/solver_id.hpp +++ b/src/include/miopen/solver_id.hpp @@ -60,6 +60,8 @@ enum class Primitive Softmax, Adam, Item, + RoPE, + ReLU, AvgPool }; From 09fbbd85d274939efa5117409b221c0f771e3440 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 3 Oct 2024 17:52:08 +0700 Subject: [PATCH 13/29] fix driver --- driver/avgpool_driver.hpp | 40 +-- driver/mloAvgPoolHost.hpp | 331 ++++++++++++------------- src/include/miopen/avgpool/solvers.hpp | 1 + test/gtest/avgpool.hpp | 2 +- 4 files changed, 187 insertions(+), 187 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 9563be6718..08f007611c 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -36,6 +36,7 @@ #include <../test/tensor_holder.hpp> #include <../test/verify.hpp> +#include #include #include #include @@ -56,12 +57,13 @@ class AvgPoolDriver : public Driver data_type = miopen_type{}; } - std::vector ComputeStrides(std::vector input); + std::vector ComputeStrides(std::vector input); int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } - std::vector GetInputTensorDimsFromCmd(const char* param); + template + std::vector GetInputTensorDimsFromCmd(const char* param); int GetandSetData() override; int AllocateBuffersAndCopy() override; @@ -113,7 +115,7 @@ class AvgPoolDriver : public Driver int64_t divisor_override; int64_t N, C, D, H, W, OD, OH, OW; - std::vector in_dim; + std::vector in_dim; bool isContiguous; }; @@ -131,11 +133,12 @@ int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) } template -std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) +template +std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) { std::string lengthsStr = inflags.GetValueStr(param); - std::vector lengths; + std::vector lengths; std::size_t pos = 0; std::size_t 
new_pos; @@ -155,7 +158,7 @@ std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char std::string sliceStr = lengthsStr.substr(pos); int len = std::stoi(sliceStr); - lengths.push_back(len); + lengths.push_back(static_cast(len)); return (lengths); } @@ -163,12 +166,12 @@ std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char template int AvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - std::vector in_stride = ComputeStrides(in_dim); - int ksp_dim = in_dim.size() - 2; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); + in_dim = GetInputTensorDimsFromCmd("input_dims"); + std::vector in_stride = ComputeStrides(in_dim); + int ksp_dim = in_dim.size() - 2; + ksize = GetInputTensorDimsFromCmd("kernel_size"); + stride = GetInputTensorDimsFromCmd("stride"); + padding = GetInputTensorDimsFromCmd("padding"); if(ksize.size() != ksp_dim) { @@ -199,7 +202,7 @@ int AvgPoolDriver::GetandSetData() H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; W = in_dim.size() == 5 ? 
in_dim[4] : in_dim[3]; - std::vector out_dim; + std::vector out_dim; if(in_dim.size() == 5) { if(ceil_mode) @@ -214,7 +217,7 @@ int AvgPoolDriver::GetandSetData() OH = std::floor(static_cast(H - ksize[1] + 2 * padding[1]) / stride[1]) + 1; OW = std::floor(static_cast(W - ksize[2] + 2 * padding[2]) / stride[2]) + 1; } - out_dim = std::vector{N, C, OD, OH, OW}; + out_dim = {N, C, OD, OH, OW}; } else { @@ -228,9 +231,9 @@ int AvgPoolDriver::GetandSetData() OH = std::floor(static_cast(H - ksize[0] + 2 * padding[0]) / stride[0]) + 1; OW = std::floor(static_cast(W - ksize[1] + 2 * padding[1]) / stride[1]) + 1; } - out_dim = std::vector{N, C, OH, OW}; + out_dim = {N, C, OH, OW}; } - std::vector out_grad_stride = ComputeStrides(out_dim); + std::vector out_grad_stride = ComputeStrides(out_dim); SetTensorNd(inputDesc, in_dim, in_stride, data_type); SetTensorNd(outputDesc, out_dim, data_type); SetTensorNd(outputGradDesc, out_dim, out_grad_stride, data_type); @@ -241,11 +244,11 @@ int AvgPoolDriver::GetandSetData() // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False template -std::vector AvgPoolDriver::ComputeStrides(std::vector inputDim) +std::vector AvgPoolDriver::ComputeStrides(std::vector inputDim) { if(!isContiguous) std::swap(inputDim.front(), inputDim.back()); - std::vector strides(inputDim.size()); + std::vector strides(inputDim.size()); strides.back() = 1; for(int i = inputDim.size() - 2; i >= 0; --i) strides[i] = strides[i + 1] * inputDim[i + 1]; @@ -294,6 +297,7 @@ int AvgPoolDriver::AddCmdLineArgs() "region will be used.", "int"); + inflags.AddInputFlag("is-contiguous", 'C', "1", "is-contiguous (Default=1)", "int"); inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify (Default=1)", "int"); inflags.AddInputFlag("time", 't', "1", "Time (Default=1)", "int"); diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp index 
6980ce968e..5249e8d195 100644 --- a/driver/mloAvgPoolHost.hpp +++ b/driver/mloAvgPoolHost.hpp @@ -32,19 +32,19 @@ template int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + const int64_t* ksize, + const int64_t* stride, + const int64_t* padding, bool count_include_pad, - int32_t divisor_override) + int64_t divisor_override) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -52,52 +52,51 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); - for(int32_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t ncoh = gid / OW, ow = gid % OW; - int32_t nc = ncoh / OH, oh = ncoh % OH; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; + int64_t ncoh = gid / OW, ow = gid % OW; + int64_t nc = ncoh / OH, oh = ncoh % OH; + int64_t n = nc / C, c = nc % C; + int64_t R = ksize[0]; + int64_t S = ksize[1]; + int64_t sh = stride[0]; + int64_t sw = stride[1]; + int64_t ph = padding[0]; + int64_t pw = padding[1]; if(n >= N) return 0; float m = 0; - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) - int32_t h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int32_t w = ow * sw - pw + s; + 
int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - m += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))]); + // int64_t input_idx = ((n * C + c) * H + h) * W + w; + m += static_cast(input[input_tv.get_tensor_view_idx({n, c, h, w})]); } } - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -115,8 +114,7 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, } float val = m / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, oh, ow))] = - static_cast(val); + output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(val); } return 0; } @@ -124,21 +122,21 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, template int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, const miopenTensorDescriptor_t outputDesc, - Tgpu* input, + const Tgpu* input, Tcheck* output, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + const int64_t* ksize, + const int64_t* stride, + const int64_t* padding, bool count_include_pad, - int32_t divisor_override) + int64_t 
divisor_override) { auto dims = miopen::deref(inputDesc).GetLengths(); auto numel = miopen::deref(outputDesc).GetElementSize(); @@ -146,55 +144,54 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); - for(int32_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t ncodoh = gid / OW, ow = gid % OW; - int32_t ncod = ncodoh / OH, oh = ncodoh % OH; - int32_t nc = ncod / OD, od = ncod % OD; - int32_t n = nc / C, c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; + int64_t ncodoh = gid / OW, ow = gid % OW; + int64_t ncod = ncodoh / OH, oh = ncodoh % OH; + int64_t nc = ncod / OD, od = ncod % OD; + int64_t n = nc / C, c = nc % C; + int64_t KD = ksize[0]; + int64_t R = ksize[1]; + int64_t S = ksize[2]; + int64_t sd = stride[0]; + int64_t sh = stride[1]; + int64_t sw = stride[2]; + int64_t pd = padding[0]; + int64_t ph = padding[1]; + int64_t pw = padding[2]; if(n >= N) return 0; float sum = 0; - for(int32_t kd = 0; kd < KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, d, h, w) - int32_t d = od * sd - pd + kd; + int64_t d = od * sd - pd + kd; if(d < 0 || d >= D) continue; - int32_t h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - int32_t w = ow * sw - pw + s; + int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // int32_t input_idx = ((n * C + c) * H + h) * W + w; - sum += static_cast( - input[input_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))]); + // int64_t 
input_idx = ((n * C + c) * H + h) * W + w; + sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, d, h, w})]); } } } - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); - - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); + + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); @@ -202,7 +199,7 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -218,9 +215,8 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); } } - float val = sum / divide_factor; - output[output_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, od, oh, ow))] = - static_cast(val); + float val = sum / divide_factor; + output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(val); } return 0; } @@ -230,17 +226,17 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes const miopenTensorDescriptor_t inputGradDesc, Tgpu* output_grad, Tcheck* input_grad, - size_t N, - size_t C, - size_t H, - size_t W, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + const int64_t* ksize, + const int64_t* stride, + const int64_t* padding, bool 
count_include_pad, - int32_t divisor_override) + int64_t divisor_override) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); @@ -248,52 +244,52 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t nch = gid / W, w = gid % W; - int32_t nc = nch / H, h = nch % H; - int32_t n = nc / C, c = nc % C; - int32_t R = ksize[0]; - int32_t S = ksize[1]; - int32_t sh = stride[0]; - int32_t sw = stride[1]; - int32_t ph = padding[0]; - int32_t pw = padding[1]; + int64_t nch = gid / W, w = gid % W; + int64_t nc = nch / H, h = nch % H; + int64_t n = nc / C, c = nc % C; + int64_t R = ksize[0]; + int64_t S = ksize[1]; + int64_t sh = stride[0]; + int64_t sw = stride[1]; + int64_t ph = padding[0]; + int64_t pw = padding[1]; if(n >= N) return 0; float grad = 0; - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - int32_t ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int32_t oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int32_t owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - int32_t ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = (hend - 
hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -310,13 +306,12 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes } } - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<4>(n, c, oh, ow))]) / + grad += static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, oh, ow})]) / divide_factor; } } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<4>(n, c, h, w))] = - static_cast(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); } return 0; } @@ -326,19 +321,19 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes const miopenTensorDescriptor_t inputGradDesc, Tgpu* output_grad, Tcheck* input_grad, - size_t N, - size_t C, - size_t D, - size_t H, - size_t W, - size_t OD, - size_t OH, - size_t OW, - const int32_t* ksize, - const int32_t* stride, - const int32_t* padding, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + const int64_t* ksize, + const int64_t* stride, + const int64_t* padding, bool count_include_pad, - int32_t divisor_override) + int64_t divisor_override) { auto dims = miopen::deref(inputGradDesc).GetLengths(); auto numel = miopen::deref(inputGradDesc).GetElementSize(); @@ -346,68 +341,68 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); - for(size_t gid = 0; gid < numel; gid++) + for(int64_t gid = 0; gid < numel; gid++) { - int32_t ncdh = gid / W, w = gid % W; - int32_t ncd = ncdh / H, h = ncdh % H; - int32_t nc = ncd / D, d = ncd % D; - int32_t n = nc / C, 
c = nc % C; - int32_t KD = ksize[0]; - int32_t R = ksize[1]; - int32_t S = ksize[2]; - int32_t sd = stride[0]; - int32_t sh = stride[1]; - int32_t sw = stride[2]; - int32_t pd = padding[0]; - int32_t ph = padding[1]; - int32_t pw = padding[2]; + int64_t ncdh = gid / W, w = gid % W; + int64_t ncd = ncdh / H, h = ncdh % H; + int64_t nc = ncd / D, d = ncd % D; + int64_t n = nc / C, c = nc % C; + int64_t KD = ksize[0]; + int64_t R = ksize[1]; + int64_t S = ksize[2]; + int64_t sd = stride[0]; + int64_t sh = stride[1]; + int64_t sw = stride[2]; + int64_t pd = padding[0]; + int64_t ph = padding[1]; + int64_t pw = padding[2]; if(n >= N) return 0; float grad = 0; - for(int32_t kd = 0; kd < KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(int32_t r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(int32_t s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - int32_t odsd = d + pd - kd; + int64_t odsd = d + pd - kd; if(odsd % sd != 0) continue; - int32_t od = odsd / sd; + int64_t od = odsd / sd; if(od < 0 || od >= OD) continue; - int32_t ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - int32_t oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - int32_t owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - int32_t ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - int32_t dstart = od * sd - pd; - int32_t hstart = oh * sh - ph; - int32_t wstart = ow * sw - pw; - int32_t dend = min(dstart + KD, D + pd); - int32_t hend = min(hstart + R, H + ph); - int32_t wend = min(wstart + S, W + pw); + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const int32_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - 
wstart); dstart = max(dstart, 0); hstart = max(hstart, 0); wstart = max(wstart, 0); dend = min(dend, D); hend = min(hend, H); wend = min(wend, W); - int32_t divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -423,14 +418,14 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes divide_factor = (dend - dstart) * (hend - hstart) * (wend - wstart); } } - grad += static_cast(output_grad[output_grad_tv.get_tensor_view_idx( - tensor_layout_t<5>(n, c, od, oh, ow))]) / - divide_factor; + grad += + static_cast( + output_grad[output_grad_tv.get_tensor_view_idx({n, c, od, oh, ow})]) / + divide_factor; } } } - input_grad[input_grad_tv.get_tensor_view_idx(tensor_layout_t<5>(n, c, d, h, w))] = - static_cast(grad); + input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); } return 0; } diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp index 5577b9fad6..f9d45bae28 100644 --- a/src/include/miopen/avgpool/solvers.hpp +++ b/src/include/miopen/avgpool/solvers.hpp @@ -32,6 +32,7 @@ #include #include "miopen/kernel_build_params.hpp" #include "miopen/kernel_info.hpp" +#include "miopen/mlo_internal.hpp" namespace miopen { diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 4a22541b19..1c022c8abe 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -63,7 +63,7 @@ struct AvgPoolTestCase << " stride:" << tc.stride << " padding:" << tc.padding << " ceil_mode:" << tc.ceil_mode << " count_include_pad:" << tc.count_include_pad << " divisor_override:" << tc.divisor_override - << "is_contiguous:" << tc.is_contiguous; + << " is_contiguous:" << tc.is_contiguous; } std::vector GetInput() const { return input_dims; } From aa033dd0047cabd0b791068ac93a1ea7989a35f3 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 4 Oct 2024 16:08:45 +0700 Subject: [PATCH 14/29] rm magic number --- driver/avgpool_driver.hpp | 8 +------- 1 
file changed, 1 insertion(+), 7 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 08f007611c..a847b620d7 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -547,13 +547,7 @@ int AvgPoolDriver::RunBackwardCPU() template Tref AvgPoolDriver::GetTolerance() { - // Computation error of fp16 is ~2^13 (=8192) bigger than - // the one of fp32 because mantissa is shorter by 13 bits. - auto tolerance = std::is_same::value ? 1.5e-6 : 8.2e-3; - - // bf16 mantissa has 7 bits, by 3 bits shorter than fp16. - if(std::is_same::value) - tolerance *= 8.0; + Tref tolerance = std::numeric_limits::epsilon() * 10; return tolerance; } From 71dd203d69aa2b5fa474a0fbdc631a2bbf95d47d Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 8 Oct 2024 14:37:29 +0700 Subject: [PATCH 15/29] fix as review in AdaptiveAvgpool --- driver/avgpool_driver.hpp | 348 +++++++++--------- driver/mloAvgPoolHost.hpp | 45 +-- src/avgpool.cpp | 4 + src/avgpool_api.cpp | 64 ++-- src/include/miopen/avgpool.hpp | 10 +- src/include/miopen/avgpool/invoke_params.hpp | 4 +- .../miopen/avgpool/problem_description.hpp | 53 +++ src/include/miopen/avgpool/solvers.hpp | 14 +- src/include/miopen/tensor_view_utils.hpp | 33 +- src/kernels/MIOpenAvgPool.cpp | 9 - src/kernels/tensor_view.hpp | 1 + src/solver/avgpool/backward_avgpool_2d.cpp | 30 +- src/solver/avgpool/backward_avgpool_3d.cpp | 34 +- src/solver/avgpool/forward_avgpool_2d.cpp | 31 +- src/solver/avgpool/forward_avgpool_3d.cpp | 34 +- test/cpu_avgpool.hpp | 26 +- test/gtest/avgpool.cpp | 125 ++----- test/gtest/avgpool.hpp | 64 ++-- 18 files changed, 448 insertions(+), 481 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index a847b620d7..acaed31f32 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -23,8 +23,7 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef GUARD_MIOPEN_AVGPOOL_DRIVER_HPP -#define GUARD_MIOPEN_AVGPOOL_DRIVER_HPP +#pragma once #include "InputFlags.hpp" #include "driver.hpp" @@ -36,7 +35,6 @@ #include <../test/tensor_holder.hpp> #include <../test/verify.hpp> -#include #include #include #include @@ -57,13 +55,11 @@ class AvgPoolDriver : public Driver data_type = miopen_type{}; } - std::vector ComputeStrides(std::vector input); + std::vector ComputeStrides(std::vector input); int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; InputFlags& GetInputFlags() override { return inflags; } - template - std::vector GetInputTensorDimsFromCmd(const char* param); int GetandSetData() override; int AllocateBuffersAndCopy() override; @@ -115,7 +111,7 @@ class AvgPoolDriver : public Driver int64_t divisor_override; int64_t N, C, D, H, W, OD, OH, OW; - std::vector in_dim; + std::vector in_dim; bool isContiguous; }; @@ -132,46 +128,18 @@ int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) return miopenStatusSuccess; } -template -template -std::vector AvgPoolDriver::GetInputTensorDimsFromCmd(const char* param) -{ - std::string lengthsStr = inflags.GetValueStr(param); - - std::vector lengths; - std::size_t pos = 0; - std::size_t new_pos; - - new_pos = lengthsStr.find(',', pos); - while(new_pos != std::string::npos) - { - std::string sliceStr = lengthsStr.substr(pos, new_pos - pos); - - int len = std::stoi(sliceStr); - - lengths.push_back(len); - - pos = new_pos + 1; - new_pos = lengthsStr.find(',', pos); - }; - - std::string sliceStr = lengthsStr.substr(pos); - int len = std::stoi(sliceStr); - - lengths.push_back(static_cast(len)); - - return (lengths); -} - template int AvgPoolDriver::GetandSetData() { - in_dim = GetInputTensorDimsFromCmd("input_dims"); - std::vector in_stride = ComputeStrides(in_dim); - int ksp_dim = in_dim.size() - 2; - ksize = GetInputTensorDimsFromCmd("kernel_size"); - 
stride = GetInputTensorDimsFromCmd("stride"); - padding = GetInputTensorDimsFromCmd("padding"); + in_dim = inflags.GetValueTensorUint64("input_dims").lengths; + std::vector in_stride = ComputeStrides(in_dim); + int ksp_dim = in_dim.size() - 2; + std::vector ksize_int = inflags.GetValueTensorUint64("kernel_size").lengths; + ksize = std::vector(ksize_int.begin(), ksize_int.end()); + std::vector stride_int = inflags.GetValueTensorUint64("stride").lengths; + stride = std::vector(stride_int.begin(), stride_int.end()); + std::vector padding_int = inflags.GetValueTensorUint64("padding").lengths; + padding = std::vector(padding_int.begin(), padding_int.end()); if(ksize.size() != ksp_dim) { @@ -202,7 +170,7 @@ int AvgPoolDriver::GetandSetData() H = in_dim.size() == 5 ? in_dim[3] : in_dim[2]; W = in_dim.size() == 5 ? in_dim[4] : in_dim[3]; - std::vector out_dim; + std::vector out_dim; if(in_dim.size() == 5) { if(ceil_mode) @@ -233,22 +201,27 @@ int AvgPoolDriver::GetandSetData() } out_dim = {N, C, OH, OW}; } - std::vector out_grad_stride = ComputeStrides(out_dim); - SetTensorNd(inputDesc, in_dim, in_stride, data_type); - SetTensorNd(outputDesc, out_dim, data_type); - SetTensorNd(outputGradDesc, out_dim, out_grad_stride, data_type); - SetTensorNd(inputGradDesc, in_dim, data_type); + std::vector out_grad_stride = ComputeStrides(out_dim); + if(SetTensorNd(inputDesc, in_dim, in_stride, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input tensor: " + inflags.GetValueStr("input_dims") + "."); + if(SetTensorNd(outputDesc, out_dim, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output tensor: " + inflags.GetValueStr("output_dims") + "."); + if(SetTensorNd(outputGradDesc, out_dim, out_grad_stride, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing output grad tensor: " + inflags.GetValueStr("output_dims") + + "."); + if(SetTensorNd(inputGradDesc, in_dim, data_type) != miopenStatusSuccess) + MIOPEN_THROW("Error parsing input grad 
tensor: " + inflags.GetValueStr("input_dims") + "."); return miopenStatusSuccess; } // Equivalent to: tensor.tranpose(0, -1).contiguous().tranpose(0, -1) incase contiguous = False template -std::vector AvgPoolDriver::ComputeStrides(std::vector inputDim) +std::vector AvgPoolDriver::ComputeStrides(std::vector inputDim) { if(!isContiguous) std::swap(inputDim.front(), inputDim.back()); - std::vector strides(inputDim.size()); + std::vector strides(inputDim.size()); strides.back() = 1; for(int i = inputDim.size() - 2; i >= 0; --i) strides[i] = strides[i + 1] * inputDim[i + 1]; @@ -261,25 +234,23 @@ template int AvgPoolDriver::AddCmdLineArgs() { inflags.AddInputFlag("forw", 'F', "1", "Run only Forward AvgPool (Default=1)", "int"); - inflags.AddInputFlag( + inflags.AddTensorFlag( "input_dims", 'D', - "2,3,7,9", - "The dimensional lengths of the input tensor: N,C,D1,D2,... Example: 2,3,7,9.", - "string"); - inflags.AddInputFlag( - "kernel_size", 'k', "1,1", "The size of the window D1,D2,... Example: 1,1.", "string"); - inflags.AddInputFlag( + "2x3x7x9", + "The dimensional lengths of the input tensor: N,C,D1,D2,... Example: 2x3x7x9."); + inflags.AddTensorFlag( + "kernel_size", 'k', "1x1", "The size of the window D1,D2,... Example: 1x1."); + inflags.AddTensorFlag( "stride", 's', - "1,1", - "The stride of the window. Default value is kernel_size D1,D2,... Example: 1,1.", - "string"); - inflags.AddInputFlag("padding", - 'p', - "0,0", - "Implicit zero padding to be added on both sides D1,D2,... Example: 0,0.", - "string"); + "1x1", + "The stride of the window. Default value is kernel_size D1,D2,... Example: 1x1."); + inflags.AddTensorFlag( + "padding", + 'p', + "0x0", + "Implicit zero padding to be added on both sides D1,D2,... 
Example: 0x0."); inflags.AddInputFlag("ceil_mode", 'c', "1", @@ -347,7 +318,10 @@ int AvgPoolDriver::AllocateBuffersAndCopy() status |= output_grad_dev->ToGPU(q, output_grad.data()); if(status != 0) + { std::cout << "Error copying data to GPU\n" << std::endl; + return miopenStatusAllocFailed; + } return miopenStatusSuccess; } @@ -363,22 +337,23 @@ int AvgPoolDriver::RunForwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenAvgPoolForward(GetHandle(), - inputDesc, - input_dev->GetMem(), - outputDesc, - output_dev->GetMem(), - ksize.size() == 3 ? ksize[0] : 0, - ksize.size() == 3 ? ksize[1] : ksize[0], - ksize.size() == 3 ? ksize[2] : ksize[1], - stride.size() == 3 ? stride[0] : 0, - stride.size() == 3 ? stride[1] : stride[0], - stride.size() == 3 ? stride[2] : stride[1], - padding.size() == 3 ? padding[0] : 0, - padding.size() == 3 ? padding[1] : padding[0], - padding.size() == 3 ? padding[2] : padding[1], - count_include_pad, - divisor_override); + auto status = miopenAvgPoolForward(GetHandle(), + inputDesc, + input_dev->GetMem(), + outputDesc, + output_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? 
padding[2] : padding[1], + count_include_pad, + divisor_override); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenAvgPoolForward"); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -392,14 +367,21 @@ int AvgPoolDriver::RunForwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - printf("Wall-clock Time Forward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + std::cout << "Wall-clock Time Forward AvgPool Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Forward AvgPool Elapsed: %f ms\n", kernel_average_time); + std::cout << "GPU Kernel Time Forward AvgPool Elapsed: " << kernel_average_time << " ms" + << std::endl; } - output_dev->FromGPU(GetStream(), output.data()); + if(output_dev->FromGPU(GetStream(), output.data()) != 0) + { + std::cerr << "Error copying (output_dev) from GPU, size: " << output_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } return miopenStatusSuccess; } @@ -407,45 +389,49 @@ int AvgPoolDriver::RunForwardGPU() template int AvgPoolDriver::RunForwardCPU() { + int status = miopenStatusSuccess; + if(in_dim.size() == 4) { - mloAvgPoolForward2dRunHost(inputDesc, - outputDesc, - input.data(), - output_host.data(), - N, - C, - H, - W, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); + status = mloAvgPoolForward2dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAvgPoolForward2dRunHost"); } else if(in_dim.size() == 5) { - mloAvgPoolForward3dRunHost(inputDesc, - outputDesc, - input.data(), - output_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize.data(), - 
stride.data(), - padding.data(), - count_include_pad, - divisor_override); + status = mloAvgPoolForward3dRunHost(inputDesc, + outputDesc, + input.data(), + output_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAvgPoolForward3dRunHost"); } - return miopenStatusSuccess; + return status; } template @@ -459,22 +445,23 @@ int AvgPoolDriver::RunBackwardGPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { - miopenAvgPoolBackward(GetHandle(), - outputGradDesc, - output_grad_dev->GetMem(), - inputGradDesc, - input_grad_dev->GetMem(), - ksize.size() == 3 ? ksize[0] : 0, - ksize.size() == 3 ? ksize[1] : ksize[0], - ksize.size() == 3 ? ksize[2] : ksize[1], - stride.size() == 3 ? stride[0] : 0, - stride.size() == 3 ? stride[1] : stride[0], - stride.size() == 3 ? stride[2] : stride[1], - padding.size() == 3 ? padding[0] : 0, - padding.size() == 3 ? padding[1] : padding[0], - padding.size() == 3 ? padding[2] : padding[1], - count_include_pad, - divisor_override); + auto status = miopenAvgPoolBackward(GetHandle(), + outputGradDesc, + output_grad_dev->GetMem(), + inputGradDesc, + input_grad_dev->GetMem(), + ksize.size() == 3 ? ksize[0] : 0, + ksize.size() == 3 ? ksize[1] : ksize[0], + ksize.size() == 3 ? ksize[2] : ksize[1], + stride.size() == 3 ? stride[0] : 0, + stride.size() == 3 ? stride[1] : stride[0], + stride.size() == 3 ? stride[2] : stride[1], + padding.size() == 3 ? padding[0] : 0, + padding.size() == 3 ? padding[1] : padding[0], + padding.size() == 3 ? 
padding[2] : padding[1], + count_include_pad, + divisor_override); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in miopenAvgPoolBackward"); float time = 0.0; miopenGetKernelTime(GetHandle(), &time); @@ -488,14 +475,21 @@ int AvgPoolDriver::RunBackwardGPU() STOP_TIME int iter = inflags.GetValueInt("iter"); if(WALL_CLOCK) - printf("Wall-clock Time Backward AvgPool Elapsed: %f ms\n", t.gettime_ms() / iter); + std::cout << "Wall-clock Time Backward AvgPool Elapsed: " << t.gettime_ms() / iter + << " ms" << std::endl; float kernel_average_time = iter > 1 ? (kernel_total_time - kernel_first_time) / (iter - 1) : kernel_first_time; - printf("GPU Kernel Time Backward AvgPool Elapsed: %f ms\n", kernel_average_time); + std::cout << "GPU Kernel Time Backward AvgPool Elapsed: " << kernel_average_time << " ms" + << std::endl; } - input_grad_dev->FromGPU(GetStream(), input_grad.data()); + if(input_grad_dev->FromGPU(GetStream(), input_grad.data()) != 0) + { + std::cerr << "Error copying (input_grad_dev) from GPU, size: " << input_grad_dev->GetSize() + << std::endl; + return miopenStatusInternalError; + } return miopenStatusSuccess; } @@ -503,45 +497,49 @@ int AvgPoolDriver::RunBackwardGPU() template int AvgPoolDriver::RunBackwardCPU() { + int status = miopenStatusSuccess; + if(in_dim.size() == 4) { - mloAvgPoolBackward2dRunHost(outputGradDesc, - inputGradDesc, - output_grad.data(), - input_grad_host.data(), - N, - C, - H, - W, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); + status = mloAvgPoolBackward2dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + H, + W, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAvgPoolBackward2dRunHost"); } else if(in_dim.size() == 5) { - mloAvgPoolBackward3dRunHost(outputGradDesc, - inputGradDesc, - 
output_grad.data(), - input_grad_host.data(), - N, - C, - D, - H, - W, - OD, - OH, - OW, - ksize.data(), - stride.data(), - padding.data(), - count_include_pad, - divisor_override); + status = mloAvgPoolBackward3dRunHost(outputGradDesc, + inputGradDesc, + output_grad.data(), + input_grad_host.data(), + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize.data(), + stride.data(), + padding.data(), + count_include_pad, + divisor_override); + MIOPEN_THROW_IF(status != miopenStatusSuccess, "Error in mloAvgPoolBackward3dRunHost"); } - return miopenStatusSuccess; + return status; } template @@ -565,7 +563,7 @@ int AvgPoolDriver::VerifyForward() } else { - printf("Forward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + std::cout << "Forward AvgPool Verifies on CPU and GPU (err=" << error << ")" << std::endl; } return miopenStatusSuccess; @@ -581,13 +579,11 @@ int AvgPoolDriver::VerifyBackward() if(!std::isfinite(error) || error > tolerance) { std::cout << "Backward AvgPool FAILED: " << error << std::endl; - return EC_VerifyFwd; + return EC_VerifyBwd; } else { - printf("Backward AvgPool Verifies on CPU and GPU (err=%f)\n", error); + std::cout << "Backward AvgPool Verifies on CPU and GPU (err=" << error << ")" << std::endl; } return miopenStatusSuccess; } - -#endif // GUARD_MIOPEN_AVGPOOL_DRIVER_HPP diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp index 5249e8d195..1405633e73 100644 --- a/driver/mloAvgPoolHost.hpp +++ b/driver/mloAvgPoolHost.hpp @@ -23,11 +23,11 @@ * SOFTWARE. 
* *******************************************************************************/ -#ifndef MLO_AVGPOOLHOST_H_ -#define MLO_AVGPOOLHOST_H_ +#pragma once #include #include +#include <../test/ford.hpp> template int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, @@ -52,8 +52,7 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, auto input_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputDesc)); - for(int64_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](int64_t gid) { int64_t ncoh = gid / OW, ow = gid % OW; int64_t nc = ncoh / OH, oh = ncoh % OH; int64_t n = nc / C, c = nc % C; @@ -64,9 +63,6 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, int64_t ph = padding[0]; int64_t pw = padding[1]; - if(n >= N) - return 0; - float m = 0; for(int64_t r = 0; r < R; ++r) { @@ -115,8 +111,8 @@ int32_t mloAvgPoolForward2dRunHost(const miopenTensorDescriptor_t inputDesc, float val = m / divide_factor; output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(val); - } - return 0; + }); + return miopenStatusSuccess; } template @@ -144,8 +140,7 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, auto input_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputDesc)); auto output_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputDesc)); - for(int64_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](int64_t gid) { int64_t ncodoh = gid / OW, ow = gid % OW; int64_t ncod = ncodoh / OH, oh = ncodoh % OH; int64_t nc = ncod / OD, od = ncod % OD; @@ -160,8 +155,6 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, int64_t ph = padding[1]; int64_t pw = padding[2]; - if(n >= N) - return 0; float sum = 0; for(int64_t kd = 0; kd < KD; ++kd) { @@ -217,8 +210,8 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, } 
float val = sum / divide_factor; output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(val); - } - return 0; + }); + return miopenStatusSuccess; } template @@ -244,8 +237,7 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes auto output_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(miopen::deref(inputGradDesc)); - for(int64_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](int64_t gid) { int64_t nch = gid / W, w = gid % W; int64_t nc = nch / H, h = nch % H; int64_t n = nc / C, c = nc % C; @@ -256,9 +248,6 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes int64_t ph = padding[0]; int64_t pw = padding[1]; - if(n >= N) - return 0; - float grad = 0; for(int64_t r = 0; r < R; ++r) { @@ -312,8 +301,8 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes } } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); - } - return 0; + }); + return miopenStatusSuccess; } template @@ -341,8 +330,7 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes auto output_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(outputGradDesc)); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(miopen::deref(inputGradDesc)); - for(int64_t gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](int64_t gid) { int64_t ncdh = gid / W, w = gid % W; int64_t ncd = ncdh / H, h = ncdh % H; int64_t nc = ncd / D, d = ncd % D; @@ -357,9 +345,6 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes int64_t ph = padding[1]; int64_t pw = padding[2]; - if(n >= N) - return 0; - float grad = 0; for(int64_t kd = 0; kd < KD; ++kd) { @@ -426,8 +411,6 @@ int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDes } } input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); - 
} - return 0; + }); + return miopenStatusSuccess; } - -#endif // MLO_AVGPOOLHOST_H_ diff --git a/src/avgpool.cpp b/src/avgpool.cpp index ed71f9ef8b..216241a643 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -33,6 +33,8 @@ namespace miopen { +namespace avgpool { + miopenStatus_t AvgPoolForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, @@ -133,4 +135,6 @@ miopenStatus_t AvgPoolBackward(Handle& handle, return miopenStatusSuccess; } +} // namespace avgpool + } // namespace miopen diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index 286fe97456..eaeb70dca8 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -115,22 +115,22 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); return miopen::try_([&] { - miopen::AvgPoolForward(miopen::deref(handle), - miopen::deref(inputDesc), - DataCast(input), - miopen::deref(outputDesc), - DataCast(output), - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); + miopen::avgpool::AvgPoolForward(miopen::deref(handle), + miopen::deref(inputDesc), + DataCast(input), + miopen::deref(outputDesc), + DataCast(output), + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); }); } @@ -170,21 +170,21 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, divisor_override, false); return miopen::try_([&] { - miopen::AvgPoolBackward(miopen::deref(handle), - miopen::deref(outputGradDesc), - DataCast(output_grad), - miopen::deref(inputGradDesc), - DataCast(input_grad), - KD, - KH, - KW, - SD, - SH, - SW, - PD, - PH, - PW, - count_include_pad, - divisor_override); + miopen::avgpool::AvgPoolBackward(miopen::deref(handle), + miopen::deref(outputGradDesc), + DataCast(output_grad), + miopen::deref(inputGradDesc), + DataCast(input_grad), + KD, 
+ KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override); }); } diff --git a/src/include/miopen/avgpool.hpp b/src/include/miopen/avgpool.hpp index c11fe6cadf..23646eb787 100644 --- a/src/include/miopen/avgpool.hpp +++ b/src/include/miopen/avgpool.hpp @@ -23,10 +23,7 @@ * SOFTWARE. * *******************************************************************************/ -#include -#ifndef MIOPEN_AVGPOOL_HPP_ -#define MIOPEN_AVGPOOL_HPP_ - +#pragma once #include namespace miopen { @@ -34,6 +31,8 @@ namespace miopen { struct Handle; struct TensorDescriptor; +namespace avgpool { + MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolForward(Handle& handle, const TensorDescriptor& inputDesc, ConstData_t input, @@ -67,5 +66,6 @@ MIOPEN_INTERNALS_EXPORT miopenStatus_t AvgPoolBackward(Handle& handle, int64_t PW, bool count_include_pad, int64_t divisor_override); +} // namespace avgpool + } // namespace miopen -#endif // _MIOPEN_AVGPOOL_HPP_ diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index d312339c5e..a905cdc4e6 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -26,11 +26,12 @@ #pragma once -#include "miopen/common.hpp" +#include #include #include namespace miopen { + namespace avgpool { struct FwdInvokeParams : public miopen::InvokeParams @@ -90,4 +91,5 @@ struct BwdInvokeParams : public miopen::InvokeParams }; } // namespace avgpool + } // namespace miopen diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 2dee6a30ea..502dcac71d 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -63,6 +63,8 @@ struct FwdProblemDescription : ProblemDescription outputDesc(outputDesc_) { IsValidLength(); + IsSameType(); + IsValidDims(); } auto GetInputDesc() const { return inputDesc; } @@ -83,6 +85,29 @@ 
struct FwdProblemDescription : ProblemDescription return true; } + bool IsValidDims() const + { + if(inputDesc.GetLengths().size() > 5 || inputDesc.GetLengths().size() < 4) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Only 4D and 5D tensors are supported."); + } + + return true; + } + + bool IsAllContiguous() const { return inputDesc.IsContiguous() && outputDesc.IsContiguous(); } + + bool IsSameType() const + { + if(inputDesc.GetType() != outputDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input and output tensor types do not match."); + } + + return true; + } + NetworkConfig MakeNetworkConfig() const override; protected: @@ -101,6 +126,8 @@ struct BwdProblemDescription : ProblemDescription inputGradDesc(inputGradDesc_) { IsValidLength(); + IsSameType(); + IsValidDims(); } auto GetOutputGradDesc() const { return outputGradDesc; } @@ -121,6 +148,32 @@ struct BwdProblemDescription : ProblemDescription return true; } + bool IsValidDims() const + { + if(inputGradDesc.GetLengths().size() > 5 || inputGradDesc.GetLengths().size() < 4) + { + MIOPEN_THROW(miopenStatusBadParm, "AvgPool: Only 4D and 5D tensors are supported."); + } + + return true; + } + + bool IsAllContiguous() const + { + return inputGradDesc.IsContiguous() && outputGradDesc.IsContiguous(); + } + + bool IsSameType() const + { + if(inputGradDesc.GetType() != outputGradDesc.GetType()) + { + MIOPEN_THROW(miopenStatusBadParm, + "AvgPool: Input grad and output grad tensor types do not match."); + } + + return true; + } + NetworkConfig MakeNetworkConfig() const override; protected: diff --git a/src/include/miopen/avgpool/solvers.hpp b/src/include/miopen/avgpool/solvers.hpp index f9d45bae28..854611dd07 100644 --- a/src/include/miopen/avgpool/solvers.hpp +++ b/src/include/miopen/avgpool/solvers.hpp @@ -26,18 +26,20 @@ #pragma once -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" +#include +#include #include #include -#include "miopen/kernel_build_params.hpp" 
-#include "miopen/kernel_info.hpp" -#include "miopen/mlo_internal.hpp" +#include +#include +#include namespace miopen { namespace solver { +namespace avgpool { + const auto make_hip_kernel = [](std::vector localsize, std::vector gridsize, std::string kernel_file, @@ -53,8 +55,6 @@ const auto make_hip_kernel = [](std::vector localsize, build_params.GenerateFor(kbp::HIP{}), localsize, gridsize, kernel_file, kernel_name}; }; -namespace avgpool { - using AvgPoolForward = NonTunableSolverBase; diff --git a/src/include/miopen/tensor_view_utils.hpp b/src/include/miopen/tensor_view_utils.hpp index d4f3aa4163..a75d0a380f 100644 --- a/src/include/miopen/tensor_view_utils.hpp +++ b/src/include/miopen/tensor_view_utils.hpp @@ -27,10 +27,8 @@ #ifndef MIOPEN_TENSOR_VIEW_UTIL_HPP_ #define MIOPEN_TENSOR_VIEW_UTIL_HPP_ -#include -#include #include "../../kernels/tensor_view.hpp" -#include "miopen/tensor.hpp" +#include namespace miopen { @@ -43,7 +41,12 @@ inline tensor_view_t get_inner_expanded_tv(const TensorDescriptor Desc) tensor_view_t tensor_view{}; for(size_t i = 0; i < N; ++i) { - if(i < dims.size()) + if(dims.empty()) + { + tensor_view.stride[i] = 0; + tensor_view.size[i] = 0; + } + else if(i < dims.size()) { tensor_view.stride[i] = strides[i]; tensor_view.size[i] = dims[i]; @@ -77,6 +80,28 @@ inline void slice_tv(tensor_view_t& tensor_view, int32_t sliceCount, const in } } +template +inline tensor_view_t get_tv_without_dim(const tensor_view_t& origin_tv, int selected_dim) +{ + tensor_view_t res{}; + for(int i = 0; i < N; ++i) + { + if(i == selected_dim) + continue; + if(i < selected_dim) + { + res.size[i] = origin_tv.size[i]; + res.stride[i] = origin_tv.stride[i]; + } + else + { + res.size[i - 1] = origin_tv.size[i]; + res.stride[i - 1] = origin_tv.stride[i]; + } + } + return res; +} + } // namespace miopen #endif // MIOPEN_TENSOR_VIEW_UTIL_HPP_ diff --git a/src/kernels/MIOpenAvgPool.cpp b/src/kernels/MIOpenAvgPool.cpp index 98299d7cdb..7c64b1076f 100644 --- 
a/src/kernels/MIOpenAvgPool.cpp +++ b/src/kernels/MIOpenAvgPool.cpp @@ -23,7 +23,6 @@ * SOFTWARE. * *******************************************************************************/ -#include #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include @@ -32,14 +31,6 @@ #include "float_types.h" #include "tensor_view.hpp" -#ifndef INPUT_TYPE -#define INPUT_TYPE float -#endif - -#ifndef OUTPUT_TYPE -#define OUTPUT_TYPE float -#endif - template __device__ void avgPoolForward2d(const TI* __restrict__ input, TO* __restrict__ output, diff --git a/src/kernels/tensor_view.hpp b/src/kernels/tensor_view.hpp index 12394dbde6..c9357dd729 100644 --- a/src/kernels/tensor_view.hpp +++ b/src/kernels/tensor_view.hpp @@ -49,6 +49,7 @@ struct tensor_view_t uint64_t stride[N]; uint64_t size[N]; }; + template struct tensor_layout_t { diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index 73adabb8e7..de3e7a2f73 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -81,10 +81,10 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, { return false; } - if(!IsOverRocmBwd2d(problem)) - { - return false; - } + // if(!IsOverRocmBwd2d(problem)) + // { + // return false; + // } return true; } @@ -98,7 +98,7 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, auto input_dtype = miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto 
build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -120,12 +120,12 @@ AvgPoolBackward2d::GetSolution(const ExecutionContext& context, auto input_grad_tv = get_inner_expanded_tv<4>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<4>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto H = deref(params.inputGradDesc).GetLengths()[2]; - auto W = deref(params.inputGradDesc).GetLengths()[3]; - auto OH = deref(params.outputGradDesc).GetLengths()[2]; - auto OW = deref(params.outputGradDesc).GetLengths()[3]; + int64_t N = deref(params.inputGradDesc).GetLengths()[0]; + int64_t C = deref(params.inputGradDesc).GetLengths()[1]; + int64_t H = deref(params.inputGradDesc).GetLengths()[2]; + int64_t W = deref(params.inputGradDesc).GetLengths()[3]; + int64_t OH = deref(params.outputGradDesc).GetLengths()[2]; + int64_t OW = deref(params.outputGradDesc).GetLengths()[3]; kernel(params.output_grad, params.input_grad, diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 4815803ad3..7b74de255b 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -87,10 +87,10 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, { return false; } - if(!IsOverRocmBwd3d(problem)) - { - return false; - } + // if(!IsOverRocmBwd3d(problem)) + // { + // return false; + // } return true; } @@ -104,7 +104,7 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, auto input_dtype = 
miopen::GetDataType(problem.GetOutputGradDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetInputGradDesc().GetType()); auto dtype = problem.GetInputGradDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -126,14 +126,14 @@ AvgPoolBackward3d::GetSolution(const ExecutionContext& context, auto input_grad_tv = get_inner_expanded_tv<5>(deref(params.inputGradDesc)); auto output_grad_tv = get_inner_expanded_tv<5>(deref(params.outputGradDesc)); - auto N = deref(params.inputGradDesc).GetLengths()[0]; - auto C = deref(params.inputGradDesc).GetLengths()[1]; - auto D = deref(params.inputGradDesc).GetLengths()[2]; - auto H = deref(params.inputGradDesc).GetLengths()[3]; - auto W = deref(params.inputGradDesc).GetLengths()[4]; - auto OD = deref(params.outputGradDesc).GetLengths()[2]; - auto OH = deref(params.outputGradDesc).GetLengths()[3]; - auto OW = deref(params.outputGradDesc).GetLengths()[4]; + int64_t N = deref(params.inputGradDesc).GetLengths()[0]; + int64_t C = deref(params.inputGradDesc).GetLengths()[1]; + int64_t D = deref(params.inputGradDesc).GetLengths()[2]; + int64_t H = deref(params.inputGradDesc).GetLengths()[3]; + int64_t W = deref(params.inputGradDesc).GetLengths()[4]; + int64_t OD = deref(params.outputGradDesc).GetLengths()[2]; + int64_t OH = deref(params.outputGradDesc).GetLengths()[3]; + int64_t OW = deref(params.outputGradDesc).GetLengths()[4]; kernel(params.output_grad, params.input_grad, diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 1c51feb54b..0735911f08 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -24,11 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" 
-#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" -#include +#include +#include +#include +#include #include #include @@ -83,10 +82,10 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, { return false; } - if(!IsOverRocmFwd2d(problem)) - { - return false; - } + // if(!IsOverRocmFwd2d(problem)) + // { + // return false; + // } return true; } @@ -100,7 +99,7 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -122,12 +121,12 @@ AvgPoolForward2d::GetSolution(const ExecutionContext& context, auto input_tv = get_inner_expanded_tv<4>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<4>(deref(params.outputDesc)); - size_t N = deref(params.inputDesc).GetLengths()[0]; - size_t C = deref(params.inputDesc).GetLengths()[1]; - size_t H = deref(params.inputDesc).GetLengths()[2]; - size_t W = deref(params.inputDesc).GetLengths()[3]; - size_t OH = deref(params.outputDesc).GetLengths()[2]; - size_t OW = deref(params.outputDesc).GetLengths()[3]; + int64_t N = deref(params.inputDesc).GetLengths()[0]; + int64_t C = deref(params.inputDesc).GetLengths()[1]; + int64_t H = deref(params.inputDesc).GetLengths()[2]; + int64_t W = deref(params.inputDesc).GetLengths()[3]; + int64_t OH = deref(params.outputDesc).GetLengths()[2]; + int64_t OW = deref(params.outputDesc).GetLengths()[3]; kernel(params.input, params.output, diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index ce2e05305c..abfffd3f0b 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ 
-24,10 +24,10 @@ * *******************************************************************************/ -#include "miopen/conv_solution.hpp" -#include "miopen/execution_context.hpp" -#include "miopen/invoke_params.hpp" -#include "miopen/tensor_view_utils.hpp" +#include +#include +#include +#include #include #include @@ -83,10 +83,10 @@ bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, { return false; } - if(!IsOverRocmFwd3d(problem)) - { - return false; - } + // if(!IsOverRocmFwd3d(problem)) + // { + // return false; + // } return true; } @@ -100,7 +100,7 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, auto input_dtype = miopen::GetDataType(problem.GetInputDesc().GetType()); auto output_dtype = miopen::GetDataType(problem.GetOutputDesc().GetType()); auto dtype = problem.GetOutputDesc().GetType(); - size_t N_total = problem.GetNtotal(); + uint64_t N_total = problem.GetNtotal(); auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(dtype == miopenHalf)}, @@ -122,14 +122,14 @@ AvgPoolForward3d::GetSolution(const ExecutionContext& context, auto input_tv = get_inner_expanded_tv<5>(deref(params.inputDesc)); auto output_tv = get_inner_expanded_tv<5>(deref(params.outputDesc)); - auto N = deref(params.inputDesc).GetLengths()[0]; - auto C = deref(params.inputDesc).GetLengths()[1]; - auto D = deref(params.inputDesc).GetLengths()[2]; - auto H = deref(params.inputDesc).GetLengths()[3]; - auto W = deref(params.inputDesc).GetLengths()[4]; - auto OD = deref(params.outputDesc).GetLengths()[2]; - auto OH = deref(params.outputDesc).GetLengths()[3]; - auto OW = deref(params.outputDesc).GetLengths()[4]; + int64_t N = deref(params.inputDesc).GetLengths()[0]; + int64_t C = deref(params.inputDesc).GetLengths()[1]; + int64_t D = deref(params.inputDesc).GetLengths()[2]; + int64_t H = deref(params.inputDesc).GetLengths()[3]; + int64_t W = deref(params.inputDesc).GetLengths()[4]; + int64_t OD = deref(params.outputDesc).GetLengths()[2]; + int64_t 
OH = deref(params.outputDesc).GetLengths()[3]; + int64_t OW = deref(params.outputDesc).GetLengths()[4]; kernel(params.input, params.output, diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index 26965df52c..7cbd724b95 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -23,11 +23,11 @@ * SOFTWARE. * *******************************************************************************/ -#ifndef GUARD_CPU_AVGPOOL_HPP -#define GUARD_CPU_AVGPOOL_HPP +#pragma once #include "tensor_holder.hpp" #include +#include "ford.hpp" template void cpu_avgpool_forward_2d(tensor input, @@ -50,8 +50,7 @@ void cpu_avgpool_forward_2d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - for(long gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](long gid) { long ncoh = gid / OW, ow = gid % OW; long nc = ncoh / OH, oh = ncoh % OH; long n = nc / C, c = nc % C; @@ -113,7 +112,7 @@ void cpu_avgpool_forward_2d(tensor input, float val = m / divide_factor; output[output_tv.get_tensor_view_idx({n, c, oh, ow})] = static_cast(val); - } + }); } template @@ -139,8 +138,7 @@ void cpu_avgpool_forward_3d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - for(long gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](long gid) { long ncodoh = gid / OW, ow = gid % OW; long ncod = ncodoh / OH, oh = ncodoh % OH; long nc = ncod / OD, od = ncod % OD; @@ -212,7 +210,7 @@ void cpu_avgpool_forward_3d(tensor input, } float val = sum / divide_factor; output[output_tv.get_tensor_view_idx({n, c, od, oh, ow})] = static_cast(val); - } + }); } template @@ -236,8 +234,7 @@ void cpu_avgpool_backward_2d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - for(long gid = 0; gid < numel; gid++) - { + 
par_ford(numel)([&](long gid) { long nch = gid / W, w = gid % W; long nc = nch / H, h = nch % H; long n = nc / C, c = nc % C; @@ -304,7 +301,7 @@ void cpu_avgpool_backward_2d(tensor output_grad, } } input_grad[input_grad_tv.get_tensor_view_idx({n, c, h, w})] = static_cast(grad); - } + }); } template @@ -330,8 +327,7 @@ void cpu_avgpool_backward_3d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<5>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - for(long gid = 0; gid < numel; gid++) - { + par_ford(numel)([&](long gid) { long ncdh = gid / W, w = gid % W; long ncd = ncdh / H, h = ncdh % H; long nc = ncd / D, d = ncd % D; @@ -415,7 +411,5 @@ void cpu_avgpool_backward_3d(tensor output_grad, } } input_grad[input_grad_tv.get_tensor_view_idx({n, c, d, h, w})] = static_cast(grad); - } + }); } - -#endif diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index 3ab32be510..cd15c93cb4 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -24,91 +24,30 @@ * *******************************************************************************/ #include "avgpool.hpp" -#include - -MIOPEN_DECLARE_ENV_VAR_STR(MIOPEN_TEST_FLOAT_ARG) -MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_TEST_ALL) - -namespace avgpool { - -std::string GetFloatArg() -{ - const auto& tmp = env::value(MIOPEN_TEST_FLOAT_ARG); - if(tmp.empty()) - { - return ""; - } - return tmp; -} - -struct GPU_Avgpool_fwd_FP32 : AvgPoolTestFwd -{ -}; - -struct GPU_Avgpool_fwd_FP16 : AvgPoolTestFwd -{ -}; - -struct GPU_Avgpool_fwd_BFP16 : AvgPoolTestFwd -{ -}; - -struct GPU_Avgpool_bwd_FP32 : AvgPoolTestBwd -{ -}; - -struct GPU_Avgpool_bwd_FP16 : AvgPoolTestBwd -{ -}; - -struct GPU_Avgpool_bwd_BFP16 : AvgPoolTestBwd -{ -}; - -} // namespace avgpool -using namespace avgpool; +#include "gtest/gtest.h" +using float16 = half_float::half; // FORWARD TEST +using GPU_Avgpool_fwd_FP32 = AvgPoolTestFwd; +using GPU_Avgpool_fwd_FP16 = AvgPoolTestFwd; +using 
GPU_Avgpool_fwd_BFP16 = AvgPoolTestFwd; + TEST_P(GPU_Avgpool_fwd_FP32, AvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_Avgpool_fwd_FP16, AvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Smoke, @@ -122,46 +61,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); // BACKWARD TEST +using GPU_Avgpool_bwd_FP32 = AvgPoolTestBwd; +using GPU_Avgpool_bwd_FP16 = AvgPoolTestBwd; +using GPU_Avgpool_bwd_BFP16 = AvgPoolTestBwd; + TEST_P(GPU_Avgpool_bwd_FP32, AvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--float")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_Avgpool_bwd_FP16, AvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--half")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; TEST_P(GPU_Avgpool_bwd_BFP16, AvgPoolTestBwd) { - if(!MIOPEN_TEST_ALL || - (env::enabled(MIOPEN_TEST_ALL) && env::value(MIOPEN_TEST_FLOAT_ARG) == "--bfloat16")) - { - RunTest(); - Verify(); - } - else - { - GTEST_SKIP(); - } + RunTest(); + Verify(); }; INSTANTIATE_TEST_SUITE_P(Smoke, diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 1c022c8abe..49259094e8 100644 --- a/test/gtest/avgpool.hpp +++ 
b/test/gtest/avgpool.hpp @@ -264,22 +264,22 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam count_include_pad, divisor_override); } - status = miopen::AvgPoolForward(handle, - input.desc, - input_dev.get(), - output.desc, - output_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 1, - ksize.GetSize() == 3 ? ksize[1] : ksize[0], - ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 1, - stride.GetSize() == 3 ? stride[1] : stride[0], - stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? padding[0] : 1, - padding.GetSize() == 3 ? padding[1] : padding[0], - padding.GetSize() == 3 ? padding[2] : padding[1], - count_include_pad, - divisor_override); + status = miopen::avgpool::AvgPoolForward(handle, + input.desc, + input_dev.get(), + output.desc, + output_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 1, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 1, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 1, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? padding[2] : padding[1], + count_include_pad, + divisor_override); fflush(stdout); ASSERT_EQ(status, miopenStatusSuccess); @@ -431,22 +431,22 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam count_include_pad, static_cast(divisor_override)); } - status = miopen::AvgPoolBackward(handle, - output_grad.desc, - output_grad_dev.get(), - input_grad.desc, - input_grad_dev.get(), - ksize.GetSize() == 3 ? ksize[0] : 1, - ksize.GetSize() == 3 ? ksize[1] : ksize[0], - ksize.GetSize() == 3 ? ksize[2] : ksize[1], - stride.GetSize() == 3 ? stride[0] : 1, - stride.GetSize() == 3 ? stride[1] : stride[0], - stride.GetSize() == 3 ? stride[2] : stride[1], - padding.GetSize() == 3 ? padding[0] : 1, - padding.GetSize() == 3 ? padding[1] : padding[0], - padding.GetSize() == 3 ? 
padding[2] : padding[1], - count_include_pad, - divisor_override); + status = miopen::avgpool::AvgPoolBackward(handle, + output_grad.desc, + output_grad_dev.get(), + input_grad.desc, + input_grad_dev.get(), + ksize.GetSize() == 3 ? ksize[0] : 1, + ksize.GetSize() == 3 ? ksize[1] : ksize[0], + ksize.GetSize() == 3 ? ksize[2] : ksize[1], + stride.GetSize() == 3 ? stride[0] : 1, + stride.GetSize() == 3 ? stride[1] : stride[0], + stride.GetSize() == 3 ? stride[2] : stride[1], + padding.GetSize() == 3 ? padding[0] : 1, + padding.GetSize() == 3 ? padding[1] : padding[0], + padding.GetSize() == 3 ? padding[2] : padding[1], + count_include_pad, + divisor_override); ASSERT_EQ(status, miopenStatusSuccess); From 73e9ab58d774c6036d370a7d4784a5e222591e50 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 8 Oct 2024 14:57:34 +0700 Subject: [PATCH 16/29] small fix --- include/miopen/miopen.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 1de70a867a..18102e8dde 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -7808,13 +7808,13 @@ MIOPEN_EXPORT miopenStatus_t miopenPReLUBackward(miopenHandle_t handle, * @param output Data tensor output (output) * @param KD Kernel size in dimension D (input) * @param KH Kernel size in dimension H (input) - * @param KW Kernel size in dimension K (input) + * @param KW Kernel size in dimension W (input) * @param SD Stride size in dimension D (input) * @param SH Stride size in dimension H (input) - * @param SW Stride size in dimension K (input) + * @param SW Stride size in dimension W (input) * @param PD Padding size in dimension D (input) * @param PH Padding size in dimension H (input) - * @param PW Padding size in dimension K (input) + * @param PW Padding size in dimension W (input) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this 
value as the divisor, otherwise will @@ -7847,13 +7847,13 @@ MIOPEN_EXPORT miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, * @param input_grad Data tensor input grad (output) * @param KD Kernel size in dimension D (input) * @param KH Kernel size in dimension H (input) - * @param KW Kernel size in dimension K (input) + * @param KW Kernel size in dimension W (input) * @param SD Stride size in dimension D (input) * @param SH Stride size in dimension H (input) - * @param SW Stride size in dimension K (input) + * @param SW Stride size in dimension W (input) * @param PD Padding size in dimension D (input) * @param PH Padding size in dimension H (input) - * @param PW Padding size in dimension K (input) + * @param PW Padding size in dimension W (input) * @param count_include_pad When True, will include the zero-padding in the averaging * calculation (input) * @param divisor_override If non-zero, will use this value as the divisor, otherwise will From 272a81e5f098bdc857c875b206188aad3c545cdd Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 8 Oct 2024 18:28:07 +0700 Subject: [PATCH 17/29] add conditions to IsOverRocm --- src/solver/avgpool/backward_avgpool_2d.cpp | 40 +++++++--------- src/solver/avgpool/backward_avgpool_3d.cpp | 51 ++++++++++----------- src/solver/avgpool/forward_avgpool_2d.cpp | 53 ++++++++++++---------- src/solver/avgpool/forward_avgpool_3d.cpp | 50 +++++++++----------- 4 files changed, 90 insertions(+), 104 deletions(-) diff --git a/src/solver/avgpool/backward_avgpool_2d.cpp b/src/solver/avgpool/backward_avgpool_2d.cpp index de3e7a2f73..0f8e4e1f42 100644 --- a/src/solver/avgpool/backward_avgpool_2d.cpp +++ b/src/solver/avgpool/backward_avgpool_2d.cpp @@ -45,27 +45,13 @@ namespace avgpool { bool IsOverRocmBwd2d(const miopen::avgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = 
problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - if(dtype == miopenFloat) + if(!problem.IsAllContiguous()) { - return false; - } - else if(dtype == miopenHalf) - { - if(in_over_out < 2 && in_nelems >= 11075584) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out < 2 || (in_nelems > 20000000 && mul_nc <= 2048)) + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); + auto in_over_out = static_cast(in_nelems) / out_nelems; + + if(in_over_out == 4) { return true; } @@ -81,10 +67,16 @@ bool AvgPoolBackward2d::IsApplicable(const ExecutionContext&, { return false; } - // if(!IsOverRocmBwd2d(problem)) - // { - // return false; - // } + if(!(problem.GetOutputGradDesc().GetType() == miopenHalf || + problem.GetOutputGradDesc().GetType() == miopenFloat || + problem.GetOutputGradDesc().GetType() == miopenBFloat16)) + { + return false; + } + if(!IsOverRocmBwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 7b74de255b..720616f305 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "miopen/miopen.h" #include #include #include @@ -45,36 +46,26 @@ namespace avgpool { bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) { - auto dtype = problem.GetInputGradDesc().GetType(); - auto in_nelems = problem.GetInputGradDesc().GetElementSize(); - auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); - auto mul_nc = - problem.GetOutputGradDesc().GetLengths()[0] * problem.GetOutputGradDesc().GetLengths()[1]; - auto N = 
problem.GetOutputGradDesc().GetLengths()[0]; + + auto dtype = problem.GetInputGradDesc().GetType(); + auto in_nelems = problem.GetInputGradDesc().GetElementSize(); + auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); auto in_over_out = static_cast(in_nelems) / out_nelems; - if(dtype == miopenFloat) + if(problem.IsAllContiguous()) { - if((in_over_out < 2 && out_nelems <= 12582912) || (in_over_out <= 8 && N >= 6)) + if(dtype == miopenBFloat16 || dtype == miopenHalf) { - return true; + if(in_over_out < 2) + { + return true; + } } - return false; } - else if(dtype == miopenHalf) + else { - if((in_over_out < 2 && mul_nc < 8192) || (8 > in_over_out && out_nelems >= 29052108)) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if((1 <= in_over_out && in_over_out < 2 && in_nelems >= 4194304) || - (in_over_out <= 8 && in_nelems >= 944111616)) - { - return true; - } + // TODO: Add more conditions + return true; } return false; } @@ -87,10 +78,16 @@ bool AvgPoolBackward3d::IsApplicable(const ExecutionContext&, { return false; } - // if(!IsOverRocmBwd3d(problem)) - // { - // return false; - // } + if(!(problem.GetOutputGradDesc().GetType() == miopenHalf || + problem.GetOutputGradDesc().GetType() == miopenFloat || + problem.GetOutputGradDesc().GetType() == miopenBFloat16)) + { + return false; + } + if(!IsOverRocmBwd3d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 0735911f08..53fb2190a2 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -24,6 +24,7 @@ * *******************************************************************************/ +#include "miopen/miopen.h" #include #include #include @@ -45,31 +46,27 @@ namespace avgpool { bool IsOverRocmFwd2d(const miopen::avgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = 
problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - if(dtype == miopenFloat) + if(problem.IsAllContiguous()) + return true; + else { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc >= 12288)) + auto dtype = problem.GetInputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto in_over_out = static_cast(in_nelems) / out_nelems; + if(dtype == miopenFloat) { - return true; + if(out_nelems <= 9633792 && in_over_out >= 4) + { + return true; + } } - } - else if(dtype == miopenHalf) - { - if(in_over_out > 11 || (in_over_out < 2 && mul_nc < 90000)) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if(in_over_out >= 1024 || in_over_out < 2 || out_nelems >= 4816896) + else if(dtype == miopenHalf || dtype == miopenBFloat16) { - return true; + if(out_nelems <= 3311616 && in_over_out >= 4) + { + return true; + } } } return false; @@ -82,10 +79,16 @@ bool AvgPoolForward2d::IsApplicable(const ExecutionContext&, { return false; } - // if(!IsOverRocmFwd2d(problem)) - // { - // return false; - // } + if(!(problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenBFloat16)) + { + return false; + } + if(!IsOverRocmFwd2d(problem)) + { + return false; + } return true; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index abfffd3f0b..98c8db4570 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -45,33 +45,21 @@ namespace avgpool { bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) { - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = 
problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; - auto N = problem.GetOutputDesc().GetLengths()[0]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - - if(dtype == miopenFloat) + if(problem.IsAllContiguous()) { - if(in_over_out < 2 || in_over_out >= 262144 || (out_nelems >= 10125000 && N > 4)) - { - return true; - } + return true; } - else if(dtype == miopenHalf) + else { - if(in_nelems >= 201326592 || (in_over_out < 2 && mul_nc < 8192)) - { - return true; - } - } - else if(dtype == miopenBFloat16) - { - if((out_nelems >= 5971968 && in_over_out < 2) || out_nelems >= 74088000) - { - return true; - } + // TODO: Add more conditions + auto dtype = problem.GetOutputDesc().GetType(); + auto in_nelems = problem.GetInputDesc().GetElementSize(); + auto out_nelems = problem.GetOutputDesc().GetElementSize(); + auto mul_nc = + problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; + auto N = problem.GetOutputDesc().GetLengths()[0]; + auto in_over_out = static_cast(in_nelems) / out_nelems; + return true; } return false; } @@ -83,10 +71,16 @@ bool AvgPoolForward3d::IsApplicable(const ExecutionContext&, { return false; } - // if(!IsOverRocmFwd3d(problem)) - // { - // return false; - // } + if(!(problem.GetInputDesc().GetType() == miopenHalf || + problem.GetInputDesc().GetType() == miopenFloat || + problem.GetInputDesc().GetType() == miopenBFloat16)) + { + return false; + } + if(!IsOverRocmFwd3d(problem)) + { + return false; + } return true; } From c57381db07ebdf65941922ca48ddb81e313d8f35 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 9 Oct 2024 11:22:57 +0700 Subject: [PATCH 18/29] add isoverrocm check --- src/solver/avgpool/backward_avgpool_3d.cpp | 15 ++++----------- src/solver/avgpool/forward_avgpool_3d.cpp | 15 +++++---------- 2 files changed, 9 insertions(+), 21 deletions(-) diff 
--git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 720616f305..647efe881f 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -52,21 +52,14 @@ bool IsOverRocmBwd3d(const miopen::avgpool::BwdProblemDescription& problem) auto out_nelems = problem.GetOutputGradDesc().GetElementSize(); auto in_over_out = static_cast(in_nelems) / out_nelems; - if(problem.IsAllContiguous()) + if(dtype == miopenBFloat16 || dtype == miopenHalf) { - if(dtype == miopenBFloat16 || dtype == miopenHalf) + if(in_over_out < 2) { - if(in_over_out < 2) - { - return true; - } + return true; } } - else - { - // TODO: Add more conditions - return true; - } + return false; } diff --git a/src/solver/avgpool/forward_avgpool_3d.cpp b/src/solver/avgpool/forward_avgpool_3d.cpp index 98c8db4570..b89134f403 100644 --- a/src/solver/avgpool/forward_avgpool_3d.cpp +++ b/src/solver/avgpool/forward_avgpool_3d.cpp @@ -45,21 +45,16 @@ namespace avgpool { bool IsOverRocmFwd3d(const miopen::avgpool::FwdProblemDescription& problem) { + auto out_nelems = problem.GetOutputDesc().GetElementSize(); if(problem.IsAllContiguous()) { - return true; + if(out_nelems > 1536) + return true; } else { - // TODO: Add more conditions - auto dtype = problem.GetOutputDesc().GetType(); - auto in_nelems = problem.GetInputDesc().GetElementSize(); - auto out_nelems = problem.GetOutputDesc().GetElementSize(); - auto mul_nc = - problem.GetOutputDesc().GetLengths()[0] * problem.GetOutputDesc().GetLengths()[1]; - auto N = problem.GetOutputDesc().GetLengths()[0]; - auto in_over_out = static_cast(in_nelems) / out_nelems; - return true; + if(out_nelems > 6144 && out_nelems <= 17915904) + return true; } return false; } From 9c371faebf739b3be2a7dfc59d681cf1a409ecb1 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 9 Oct 2024 11:47:25 +0700 Subject: [PATCH 19/29] rm dupl code --- test/gtest/avgpool.cpp | 16 +++++--------- test/gtest/avgpool.hpp 
| 50 +++++++++--------------------------------- 2 files changed, 15 insertions(+), 51 deletions(-) diff --git a/test/gtest/avgpool.cpp b/test/gtest/avgpool.cpp index cd15c93cb4..28c9def5a4 100644 --- a/test/gtest/avgpool.cpp +++ b/test/gtest/avgpool.cpp @@ -50,15 +50,9 @@ TEST_P(GPU_Avgpool_fwd_BFP16, AvgPoolTestFwd) Verify(); }; -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_FP32, - testing::ValuesIn(AvgPoolTestConfigsFwdFp32())); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_FP16, - testing::ValuesIn(AvgPoolTestConfigsFwdFp16())); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_Avgpool_fwd_BFP16, - testing::ValuesIn(AvgPoolTestConfigsFwdBfp16())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP32, testing::ValuesIn(AvgPoolTestConfigsFwd())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_FP16, testing::ValuesIn(AvgPoolTestConfigsFwd())); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_fwd_BFP16, testing::ValuesIn(AvgPoolTestConfigsFwd())); // BACKWARD TEST using GPU_Avgpool_bwd_FP32 = AvgPoolTestBwd; @@ -88,7 +82,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, testing::ValuesIn(AvgPoolTestConfigsBwdFp32())); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_FP16, - testing::ValuesIn(AvgPoolTestConfigsBwdFp16())); + testing::ValuesIn(AvgPoolTestConfigsBwdFp16BFp16())); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Avgpool_bwd_BFP16, - testing::ValuesIn(AvgPoolTestConfigsBwdBfp16())); + testing::ValuesIn(AvgPoolTestConfigsBwdFp16BFp16())); diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 49259094e8..ba4ad245cd 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -81,61 +81,31 @@ struct AvgPoolTestCase } }; -inline std::vector AvgPoolTestConfigsFwdFp32() +inline std::vector AvgPoolTestConfigsFwd() { return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, true}, - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, false}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, - {{6, 128, 128, 128, 
128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, - }; -} + {{64, 512, 14, 14}, {2, 2}, {2, 2}, {0, 0}, false, true, 0, false}, + {{64, 512, 14, 14}, {2, 2}, {2, 2}, {0, 0}, false, true, 0, true}, + {{4, 512, 14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, false, true, 0, false}, + {{4, 512, 14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {0, 0, 0}, false, true, 0, true}, -inline std::vector AvgPoolTestConfigsFwdFp16() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, true}, - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, false}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, - }; -} - -inline std::vector AvgPoolTestConfigsFwdBfp16() -{ - return { - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, true}, - {{64, 768, 17, 17}, {5, 5}, {1, 1}, {1, 1}, false, false, 0, false}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, }; } inline std::vector AvgPoolTestConfigsBwdFp32() { return { - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, - {{6, 128, 128, 128, 128}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, + {{16, 112, 112, 112}, {3, 3}, {2, 2}, {1, 1}, false, true, 0, false}, }; } -inline std::vector AvgPoolTestConfigsBwdFp16() +inline std::vector AvgPoolTestConfigsBwdFp16BFp16() { return { - {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, true}, - {{64, 288, 35, 35}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, false}, - {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0, true}, - {{6, 288, 35, 35, 35}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0, false}, - }; -} + {{16, 112, 112, 112}, {3, 3}, {2, 2}, {1, 1}, false, true, 0, false}, + {{4, 912, 8, 8, 8}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 
0, false}, + {{4, 912, 8, 8, 8}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, false, true, 0, true}, -inline std::vector AvgPoolTestConfigsBwdBfp16() -{ - return { - {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, true}, - {{64, 2048, 9, 9}, {3, 3}, {1, 1}, {1, 1}, false, true, 0, false}, - {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, true}, - {{6, 128, 112, 112, 112}, {3, 3, 3}, {2, 2, 2}, {1, 1, 1}, false, true, 0, false}, }; } From 1ddaebde566b1ce6d2a6094373202de15b15f353 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Wed, 9 Oct 2024 13:47:27 +0700 Subject: [PATCH 20/29] minor change --- src/solver/avgpool/backward_avgpool_3d.cpp | 1 - src/solver/avgpool/forward_avgpool_2d.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/solver/avgpool/backward_avgpool_3d.cpp b/src/solver/avgpool/backward_avgpool_3d.cpp index 647efe881f..662d53be4a 100644 --- a/src/solver/avgpool/backward_avgpool_3d.cpp +++ b/src/solver/avgpool/backward_avgpool_3d.cpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include "miopen/miopen.h" #include #include #include diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 53fb2190a2..664e0b8e66 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -24,7 +24,7 @@ * *******************************************************************************/ -#include "miopen/miopen.h" +#include #include #include #include From 0d3cc71da1e41d822d80ec07c058249bbe73a94c Mon Sep 17 00:00:00 2001 From: hieule88 Date: Thu, 10 Oct 2024 15:23:14 +0700 Subject: [PATCH 21/29] minor change --- src/solver/avgpool/forward_avgpool_2d.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/solver/avgpool/forward_avgpool_2d.cpp b/src/solver/avgpool/forward_avgpool_2d.cpp index 664e0b8e66..25be3af3d4 100644 --- a/src/solver/avgpool/forward_avgpool_2d.cpp +++ 
b/src/solver/avgpool/forward_avgpool_2d.cpp @@ -24,7 +24,6 @@ * *******************************************************************************/ -#include #include #include #include From 56a2146395386f2acd4002b497336f319a89127b Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 11 Oct 2024 10:20:37 +0700 Subject: [PATCH 22/29] small fix --- driver/mloAvgPoolHost.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/mloAvgPoolHost.hpp b/driver/mloAvgPoolHost.hpp index 1405633e73..4453f06f06 100644 --- a/driver/mloAvgPoolHost.hpp +++ b/driver/mloAvgPoolHost.hpp @@ -217,7 +217,7 @@ int32_t mloAvgPoolForward3dRunHost(const miopenTensorDescriptor_t inputDesc, template int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, int64_t N, int64_t C, @@ -308,7 +308,7 @@ int32_t mloAvgPoolBackward2dRunHost(const miopenTensorDescriptor_t outputGradDes template int32_t mloAvgPoolBackward3dRunHost(const miopenTensorDescriptor_t outputGradDesc, const miopenTensorDescriptor_t inputGradDesc, - Tgpu* output_grad, + const Tgpu* output_grad, Tcheck* input_grad, int64_t N, int64_t C, From 651395dc4591d309adff0b2850e21fea45b9e4c1 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 11 Oct 2024 11:47:53 +0700 Subject: [PATCH 23/29] small fix --- driver/avgpool_driver.hpp | 32 ++++++---- src/avgpool_api.cpp | 58 ++++++++++++++++--- .../miopen/avgpool/problem_description.hpp | 12 ++-- test/gtest/avgpool.hpp | 53 +++++++++-------- 4 files changed, 104 insertions(+), 51 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index acaed31f32..3439ad0e2c 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -125,6 +125,14 @@ int AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) { miopenEnableProfiling(GetHandle(), true); } + + forw = inflags.GetValueInt("forw"); + + if(forw != 0 && 
forw != 1) + { + MIOPEN_THROW("Invalid Forward Mode"); + } + return miopenStatusSuccess; } @@ -251,21 +259,23 @@ int AvgPoolDriver::AddCmdLineArgs() 'p', "0x0", "Implicit zero padding to be added on both sides D1,D2,... Example: 0x0."); - inflags.AddInputFlag("ceil_mode", - 'c', - "1", - "When 1, will use ceil instead of floor to compute the output shape.", - "int"); - inflags.AddInputFlag("count_include_pad", - 'P', - "0", - "When 1, will include the zero-padding in the averaging calculation.", - "int"); + inflags.AddInputFlag( + "ceil_mode", + 'c', + "1", + "When 1, will use ceil instead of floor to compute the output shape (Default=1).", + "int"); + inflags.AddInputFlag( + "count_include_pad", + 'P', + "0", + "When 1, will include the zero-padding in the averaging calculation (Default=0).", + "int"); inflags.AddInputFlag("divisor_override", 'd', "0", "If specified, it will be used as divisor, otherwise size of the pooling " - "region will be used.", + "region will be used (Default=0).", "int"); inflags.AddInputFlag("is-contiguous", 'C', "1", "is-contiguous (Default=1)", "int"); diff --git a/src/avgpool_api.cpp b/src/avgpool_api.cpp index eaeb70dca8..117aa7c433 100644 --- a/src/avgpool_api.cpp +++ b/src/avgpool_api.cpp @@ -43,8 +43,17 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) return os; } -static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, +static void LogCmdAvgPool(const miopenTensorDescriptor_t iDesc, const miopenTensorDescriptor_t oDesc, + const int64_t KD, + const int64_t KH, + const int64_t KW, + const int64_t SD, + const int64_t SH, + const int64_t SW, + const int64_t PD, + const int64_t PH, + const int64_t PW, const bool count_include_pad, const int64_t divisor_override, const bool is_fwd) @@ -52,7 +61,7 @@ static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, if(miopen::IsLoggingCmd()) { std::stringstream ss; - auto dtype = miopen::deref(xDesc).GetType(); + auto dtype = 
miopen::deref(iDesc).GetType(); if(dtype == miopenHalf) { ss << "avgpoolfp16"; @@ -66,11 +75,20 @@ static void LogCmdAvgPool(const miopenTensorDescriptor_t xDesc, ss << "avgpoolbfp16"; } - MIOPEN_LOG_FUNCTION(xDesc, oDesc, count_include_pad, divisor_override); - ss << " -Is " << miopen::deref(xDesc).GetLengths(); + MIOPEN_LOG_FUNCTION(iDesc, oDesc, count_include_pad, divisor_override); + ss << " -Is " << miopen::deref(iDesc).GetLengths(); ss << " -Os " << miopen::deref(oDesc).GetLengths(); - ss << " -Si " << miopen::deref(xDesc).GetStrides(); + ss << " -Si " << miopen::deref(iDesc).GetStrides(); ss << " -So " << miopen::deref(oDesc).GetStrides(); + ss << " -KD " << KD; + ss << " -KH " << KH; + ss << " -KW " << KW; + ss << " -SD " << SD; + ss << " -SH " << SH; + ss << " -SW " << SW; + ss << " -PD " << PD; + ss << " -PH " << PH; + ss << " -PW " << PW; ss << " -Cp " << count_include_pad; ss << " -Do " << divisor_override; ss << " -F " << ((is_fwd) ? "1" : "2"); @@ -113,7 +131,20 @@ extern "C" miopenStatus_t miopenAvgPoolForward(miopenHandle_t handle, count_include_pad, divisor_override); - LogCmdAvgPool(inputDesc, outputDesc, count_include_pad, divisor_override, true); + LogCmdAvgPool(inputDesc, + outputDesc, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override, + true); return miopen::try_([&] { miopen::avgpool::AvgPoolForward(miopen::deref(handle), miopen::deref(inputDesc), @@ -168,7 +199,20 @@ extern "C" miopenStatus_t miopenAvgPoolBackward(miopenHandle_t handle, count_include_pad, divisor_override); - LogCmdAvgPool(inputGradDesc, outputGradDesc, count_include_pad, divisor_override, false); + LogCmdAvgPool(inputGradDesc, + outputGradDesc, + KD, + KH, + KW, + SD, + SH, + SW, + PD, + PH, + PW, + count_include_pad, + divisor_override, + false); return miopen::try_([&] { miopen::avgpool::AvgPoolBackward(miopen::deref(handle), miopen::deref(outputGradDesc), diff --git a/src/include/miopen/avgpool/problem_description.hpp 
b/src/include/miopen/avgpool/problem_description.hpp index 502dcac71d..0ad7b0e7d7 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -38,7 +38,7 @@ namespace avgpool { struct ProblemDescription : ProblemDescriptionBase { - ProblemDescription(const bool count_include_pad_, const int32_t divisor_override_) + ProblemDescription(const bool count_include_pad_, const int64_t divisor_override_) : count_include_pad(count_include_pad_), divisor_override(divisor_override_) { if(divisor_override < 0) @@ -49,7 +49,7 @@ struct ProblemDescription : ProblemDescriptionBase protected: bool count_include_pad; - int32_t divisor_override; + int64_t divisor_override; }; struct FwdProblemDescription : ProblemDescription @@ -57,7 +57,7 @@ struct FwdProblemDescription : ProblemDescription FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_, const bool count_include_pad_, - const int32_t divisor_override_) + const int64_t divisor_override_) : ProblemDescription(count_include_pad_, divisor_override_), inputDesc(inputDesc_), outputDesc(outputDesc_) @@ -67,8 +67,8 @@ struct FwdProblemDescription : ProblemDescription IsValidDims(); } - auto GetInputDesc() const { return inputDesc; } - auto GetOutputDesc() const { return outputDesc; } + const TensorDescriptor GetInputDesc() const { return inputDesc; } + const TensorDescriptor GetOutputDesc() const { return outputDesc; } auto GetNtotal() const { return outputDesc.GetElementSize(); } bool IsValidLength() const @@ -120,7 +120,7 @@ struct BwdProblemDescription : ProblemDescription BwdProblemDescription(const TensorDescriptor& outputGradDesc_, const TensorDescriptor& inputGradDesc_, const bool count_include_pad_, - const int32_t divisor_override_) + const int64_t divisor_override_) : ProblemDescription(count_include_pad_, divisor_override_), outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) diff --git 
a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index ba4ad245cd..8d43a4521d 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -202,19 +202,19 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam auto dims = input.desc.GetNumDims(); if(dims == 4) { - cpu_avgpool_forward_2d(input, - ref_output, - static_cast(N), - static_cast(C), - static_cast(H), - static_cast(W), - static_cast(OH), - static_cast(OW), - ksize_long, - stride_long, - padding_long, - count_include_pad, - divisor_override); + cpu_avgpool_forward_2d(input, + ref_output, + static_cast(N), + static_cast(C), + static_cast(H), + static_cast(W), + static_cast(OH), + static_cast(OW), + ksize_long, + stride_long, + padding_long, + count_include_pad, + divisor_override); } else if(dims == 5) { @@ -250,7 +250,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam padding.GetSize() == 3 ? padding[2] : padding[1], count_include_pad, divisor_override); - fflush(stdout); ASSERT_EQ(status, miopenStatusSuccess); output.data = handle.Read(output_dev, output.data.size()); @@ -369,19 +368,19 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam auto dims = input_grad.desc.GetNumDims(); if(dims == 4) { - cpu_avgpool_backward_2d(output_grad, - ref_input_grad, - static_cast(N), - static_cast(C), - static_cast(H), - static_cast(W), - static_cast(OH), - static_cast(OW), - ksize, - stride, - padding, - count_include_pad, - static_cast(divisor_override)); + cpu_avgpool_backward_2d(output_grad, + ref_input_grad, + static_cast(N), + static_cast(C), + static_cast(H), + static_cast(W), + static_cast(OH), + static_cast(OW), + ksize, + stride, + padding, + count_include_pad, + static_cast(divisor_override)); } else if(dims == 5) { From f8ae16a7d6c569424a73db8c2bd57590c3be6abf Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 11 Oct 2024 11:50:45 +0700 Subject: [PATCH 24/29] throw invalid params --- driver/avgpool_driver.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 3439ad0e2c..38e3d8e21e 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -152,18 +152,24 @@ int AvgPoolDriver::GetandSetData() if(ksize.size() != ksp_dim) { int ref = ksp_dim - ksize.size(); + if(ref < 0) + MIOPEN_THROW("Invalid kernel size"); while((ref--) != 0) ksize.push_back(ksize[0]); } if(stride.size() != ksp_dim) { int ref = ksp_dim - stride.size(); + if(ref < 0) + MIOPEN_THROW("Invalid kernel size"); while((ref--) != 0) stride.push_back(stride[0]); } if(padding.size() != ksp_dim) { int ref = ksp_dim - padding.size(); + if(ref < 0) + MIOPEN_THROW("Invalid kernel size"); while((ref--) != 0) padding.push_back(padding[0]); } From 110249a97fd1d4e9bc158bbb2982885aabd0fa07 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Fri, 11 Oct 2024 11:53:23 +0700 Subject: [PATCH 25/29] small fix --- driver/avgpool_driver.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 38e3d8e21e..c0b630e577 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -161,7 +161,7 @@ int AvgPoolDriver::GetandSetData() { int ref = ksp_dim - stride.size(); if(ref < 0) - MIOPEN_THROW("Invalid kernel size"); + MIOPEN_THROW("Invalid stride size"); while((ref--) != 0) stride.push_back(stride[0]); } @@ -169,7 +169,7 @@ int AvgPoolDriver::GetandSetData() { int ref = ksp_dim - padding.size(); if(ref < 0) - MIOPEN_THROW("Invalid kernel size"); + MIOPEN_THROW("Invalid padding size"); while((ref--) != 0) padding.push_back(padding[0]); } From 5ec1dec7c4e814814be18e5735f4039fec5f66a8 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 14 Oct 2024 11:38:25 +0700 Subject: [PATCH 26/29] resolved comments --- src/avgpool/problem_description.cpp | 10 +- src/include/miopen/avgpool/invoke_params.hpp | 2 - .../miopen/avgpool/problem_description.hpp | 8 +- test/cpu_avgpool.hpp | 324 +++++++++--------- test/gtest/avgpool.hpp | 85 ++--- 5 
files changed, 204 insertions(+), 225 deletions(-) diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp index 96ecb4bb72..e4952d6985 100644 --- a/src/avgpool/problem_description.cpp +++ b/src/avgpool/problem_description.cpp @@ -59,10 +59,7 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const ss << "-input_dtype" << input_dtype; ss << "-Is" << input_size; ss << "-Os" << output_size; - ss << "-Si" << input_stride; - ss << "-So" << output_stride; - ss << "-Cp " << count_include_pad; - ss << "-Do " << divisor_override; + ss << "-Ic" << IsAllContiguous(); return NetworkConfig{ss.str()}; } @@ -82,10 +79,7 @@ NetworkConfig BwdProblemDescription::MakeNetworkConfig() const ss << "-input_dtype" << input_dtype; ss << "-dIs" << input_grad_size; ss << "-dOs" << output_grad_size; - ss << "-dSi" << input_grad_stride; - ss << "-dSo" << output_grad_stride; - ss << "-Cp " << count_include_pad; - ss << "-Do " << divisor_override; + ss << "-Ic" << IsAllContiguous(); return NetworkConfig{ss.str()}; } diff --git a/src/include/miopen/avgpool/invoke_params.hpp b/src/include/miopen/avgpool/invoke_params.hpp index a905cdc4e6..65d1f2beeb 100644 --- a/src/include/miopen/avgpool/invoke_params.hpp +++ b/src/include/miopen/avgpool/invoke_params.hpp @@ -44,7 +44,6 @@ struct FwdInvokeParams : public miopen::InvokeParams ConstData_t input = nullptr; Data_t output = nullptr; - ConstData_t ksize = nullptr; int64_t KD = 0; int64_t KH = 0; @@ -72,7 +71,6 @@ struct BwdInvokeParams : public miopen::InvokeParams ConstData_t output_grad = nullptr; Data_t input_grad = nullptr; - ConstData_t ksize = nullptr; int64_t KD = 0; int64_t KH = 0; diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index 0ad7b0e7d7..c6f50700c7 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -67,8 +67,8 @@ struct FwdProblemDescription : 
ProblemDescription IsValidDims(); } - const TensorDescriptor GetInputDesc() const { return inputDesc; } - const TensorDescriptor GetOutputDesc() const { return outputDesc; } + const TensorDescriptor& GetInputDesc() const { return inputDesc; } + const TensorDescriptor& GetOutputDesc() const { return outputDesc; } auto GetNtotal() const { return outputDesc.GetElementSize(); } bool IsValidLength() const @@ -130,8 +130,8 @@ struct BwdProblemDescription : ProblemDescription IsValidDims(); } - auto GetOutputGradDesc() const { return outputGradDesc; } - auto GetInputGradDesc() const { return inputGradDesc; } + const TensorDescriptor& GetOutputGradDesc() const { return outputGradDesc; } + const TensorDescriptor& GetInputGradDesc() const { return inputGradDesc; } auto GetNtotal() const { return inputGradDesc.GetElementSize(); } bool IsValidLength() const diff --git a/test/cpu_avgpool.hpp b/test/cpu_avgpool.hpp index 7cbd724b95..8e5adb1da8 100644 --- a/test/cpu_avgpool.hpp +++ b/test/cpu_avgpool.hpp @@ -32,17 +32,17 @@ template void cpu_avgpool_forward_2d(tensor input, tensor& output, - long N, - long C, - long H, - long W, - long OH, - long OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - long divisor_override) + int64_t divisor_override) { auto dims = input.desc.GetLengths(); auto numel = output.desc.GetElementSize(); @@ -50,50 +50,50 @@ void cpu_avgpool_forward_2d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<4>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<4>(output.desc); - par_ford(numel)([&](long gid) { - long ncoh = gid / OW, ow = gid % OW; - long nc = ncoh / OH, oh = ncoh % OH; - long n = nc / C, c = nc % C; - long R = ksize[0]; - long S = ksize[1]; - long sh = stride[0]; - long sw = stride[1]; - long ph = padding[0]; - long pw = padding[1]; + par_ford(numel)([&](int64_t gid) { 
+ int64_t ncoh = gid / OW, ow = gid % OW; + int64_t nc = ncoh / OH, oh = ncoh % OH; + int64_t n = nc / C, c = nc % C; + int64_t R = ksize[0]; + int64_t S = ksize[1]; + int64_t sh = stride[0]; + int64_t sw = stride[1]; + int64_t ph = padding[0]; + int64_t pw = padding[1]; if(n >= N) return; float m = 0; - for(long r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(long s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, h, w) - long h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - long w = ow * sw - pw + s; + int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // long input_idx = ((n * C + c) * H + h) * W + w; + // int64_t input_idx = ((n * C + c) * H + h) * W + w; m += static_cast(input[input_tv.get_tensor_view_idx({n, c, h, w})]); } } - long hstart = oh * sh - ph; - long wstart = ow * sw - pw; - long hend = min(hstart + R, H + ph); - long wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const long pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - long divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -118,19 +118,19 @@ void cpu_avgpool_forward_2d(tensor input, template void cpu_avgpool_forward_3d(tensor input, tensor& output, - long N, - long C, - long D, - long H, - long W, - long OD, - long OH, - long OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - long divisor_override) + int64_t divisor_override) { auto dims = input.desc.GetLengths(); auto numel = 
output.desc.GetElementSize(); @@ -138,61 +138,61 @@ void cpu_avgpool_forward_3d(tensor input, auto input_tv = miopen::get_inner_expanded_tv<5>(input.desc); auto output_tv = miopen::get_inner_expanded_tv<5>(output.desc); - par_ford(numel)([&](long gid) { - long ncodoh = gid / OW, ow = gid % OW; - long ncod = ncodoh / OH, oh = ncodoh % OH; - long nc = ncod / OD, od = ncod % OD; - long n = nc / C, c = nc % C; - long KD = ksize[0]; - long R = ksize[1]; - long S = ksize[2]; - long sd = stride[0]; - long sh = stride[1]; - long sw = stride[2]; - long pd = padding[0]; - long ph = padding[1]; - long pw = padding[2]; + par_ford(numel)([&](int64_t gid) { + int64_t ncodoh = gid / OW, ow = gid % OW; + int64_t ncod = ncodoh / OH, oh = ncodoh % OH; + int64_t nc = ncod / OD, od = ncod % OD; + int64_t n = nc / C, c = nc % C; + int64_t KD = ksize[0]; + int64_t R = ksize[1]; + int64_t S = ksize[2]; + int64_t sd = stride[0]; + int64_t sh = stride[1]; + int64_t sw = stride[2]; + int64_t pd = padding[0]; + int64_t ph = padding[1]; + int64_t pw = padding[2]; if(n >= N) return; float sum = 0; - for(long kd = 0; kd < KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(long r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(long s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { // input idx : (n, c, d, h, w) - long d = od * sd - pd + kd; + int64_t d = od * sd - pd + kd; if(d < 0 || d >= D) continue; - long h = oh * sh - ph + r; + int64_t h = oh * sh - ph + r; if(h < 0 || h >= H) continue; - long w = ow * sw - pw + s; + int64_t w = ow * sw - pw + s; if(w < 0 || w >= W) continue; - // long input_idx = ((n * C + c) * H + h) * W + w; + // int64_t input_idx = ((n * C + c) * H + h) * W + w; sum += static_cast(input[input_tv.get_tensor_view_idx({n, c, d, h, w})]); } } } - long dstart = od * sd - pd; - long hstart = oh * sh - ph; - long wstart = ow * sw - pw; - long dend = min(dstart + KD, D + pd); - long hend = min(hstart + R, H + ph); - long wend = min(wstart + S, W + pw); - - 
const long pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - - long divide_factor; + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); + + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -216,17 +216,17 @@ void cpu_avgpool_forward_3d(tensor input, template void cpu_avgpool_backward_2d(tensor output_grad, tensor& input_grad, - long N, - long C, - long H, - long W, - long OH, - long OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t N, + int64_t C, + int64_t H, + int64_t W, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - long divisor_override) + int64_t divisor_override) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -234,51 +234,51 @@ void cpu_avgpool_backward_2d(tensor output_grad, auto output_grad_tv = miopen::get_inner_expanded_tv<4>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<4>(input_grad.desc); - par_ford(numel)([&](long gid) { - long nch = gid / W, w = gid % W; - long nc = nch / H, h = nch % H; - long n = nc / C, c = nc % C; - long R = ksize[0]; - long S = ksize[1]; - long sh = stride[0]; - long sw = stride[1]; - long ph = padding[0]; - long pw = padding[1]; + par_ford(numel)([&](int64_t gid) { + int64_t nch = gid / W, w = gid % W; + int64_t nc = nch / H, h = nch % H; + int64_t n = nc / C, c = nc % C; + int64_t R = 
ksize[0]; + int64_t S = ksize[1]; + int64_t sh = stride[0]; + int64_t sw = stride[1]; + int64_t ph = padding[0]; + int64_t pw = padding[1]; if(n >= N) return; float grad = 0; - for(long r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(long s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - long ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - long oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - long owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - long ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - long hstart = oh * sh - ph; - long wstart = ow * sw - pw; - long hend = min(hstart + R, H + ph); - long wend = min(wstart + S, W + pw); + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); - const long pool_size = (hend - hstart) * (wend - wstart); + const int64_t pool_size = (hend - hstart) * (wend - wstart); hstart = max(hstart, 0); wstart = max(wstart, 0); hend = min(hend, H); wend = min(wend, W); - long divide_factor; + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; @@ -307,19 +307,19 @@ void cpu_avgpool_backward_2d(tensor output_grad, template void cpu_avgpool_backward_3d(tensor output_grad, tensor& input_grad, - long N, - long C, - long D, - long H, - long W, - long OD, - long OH, - long OW, - tensor ksize, - tensor stride, - tensor padding, + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + int64_t OD, + int64_t OH, + int64_t OW, + tensor ksize, + tensor stride, + tensor padding, bool count_include_pad, - long divisor_override) + int64_t divisor_override) { auto dims = input_grad.desc.GetLengths(); auto numel = input_grad.desc.GetElementSize(); @@ -327,67 +327,67 @@ void cpu_avgpool_backward_3d(tensor output_grad, auto output_grad_tv = 
miopen::get_inner_expanded_tv<5>(output_grad.desc); auto input_grad_tv = miopen::get_inner_expanded_tv<5>(input_grad.desc); - par_ford(numel)([&](long gid) { - long ncdh = gid / W, w = gid % W; - long ncd = ncdh / H, h = ncdh % H; - long nc = ncd / D, d = ncd % D; - long n = nc / C, c = nc % C; - long KD = ksize[0]; - long R = ksize[1]; - long S = ksize[2]; - long sd = stride[0]; - long sh = stride[1]; - long sw = stride[2]; - long pd = padding[0]; - long ph = padding[1]; - long pw = padding[2]; + par_ford(numel)([&](int64_t gid) { + int64_t ncdh = gid / W, w = gid % W; + int64_t ncd = ncdh / H, h = ncdh % H; + int64_t nc = ncd / D, d = ncd % D; + int64_t n = nc / C, c = nc % C; + int64_t KD = ksize[0]; + int64_t R = ksize[1]; + int64_t S = ksize[2]; + int64_t sd = stride[0]; + int64_t sh = stride[1]; + int64_t sw = stride[2]; + int64_t pd = padding[0]; + int64_t ph = padding[1]; + int64_t pw = padding[2]; if(n >= N) return; float grad = 0; - for(long kd = 0; kd < KD; ++kd) + for(int64_t kd = 0; kd < KD; ++kd) { - for(long r = 0; r < R; ++r) + for(int64_t r = 0; r < R; ++r) { - for(long s = 0; s < S; ++s) + for(int64_t s = 0; s < S; ++s) { - long odsd = d + pd - kd; + int64_t odsd = d + pd - kd; if(odsd % sd != 0) continue; - long od = odsd / sd; + int64_t od = odsd / sd; if(od < 0 || od >= OD) continue; - long ohsh = h + ph - r; + int64_t ohsh = h + ph - r; if(ohsh % sh != 0) continue; - long oh = ohsh / sh; + int64_t oh = ohsh / sh; if(oh < 0 || oh >= OH) continue; - long owsw = w + pw - s; + int64_t owsw = w + pw - s; if(owsw % sw != 0) continue; - long ow = owsw / sw; + int64_t ow = owsw / sw; if(ow < 0 || ow >= OW) continue; - long dstart = od * sd - pd; - long hstart = oh * sh - ph; - long wstart = ow * sw - pw; - long dend = min(dstart + KD, D + pd); - long hend = min(hstart + R, H + ph); - long wend = min(wstart + S, W + pw); - - const long pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = max(dstart, 0); - hstart = max(hstart, 0); 
- wstart = max(wstart, 0); - dend = min(dend, D); - hend = min(hend, H); - wend = min(wend, W); - long divide_factor; + int64_t dstart = od * sd - pd; + int64_t hstart = oh * sh - ph; + int64_t wstart = ow * sw - pw; + int64_t dend = min(dstart + KD, D + pd); + int64_t hend = min(hstart + R, H + ph); + int64_t wend = min(wstart + S, W + pw); + + const int64_t pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, D); + hend = min(hend, H); + wend = min(wend, W); + int64_t divide_factor; if(divisor_override != 0) { divide_factor = divisor_override; diff --git a/test/gtest/avgpool.hpp b/test/gtest/avgpool.hpp index 8d43a4521d..01b87f1023 100644 --- a/test/gtest/avgpool.hpp +++ b/test/gtest/avgpool.hpp @@ -133,16 +133,6 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam padding = tensor{in_dim.size() - 2}; padding.data = avgpool_config.padding; - ksize_long = tensor{in_dim.size() - 2}; - stride_long = tensor{in_dim.size() - 2}; - padding_long = tensor{in_dim.size() - 2}; - for(int i = 0; i < in_dim.size() - 2; i++) - { - ksize_long.data[i] = static_cast(ksize.data[i]); - stride_long.data[i] = static_cast(stride.data[i]); - padding_long.data[i] = static_cast(padding.data[i]); - } - ceil_mode = avgpool_config.ceil_mode; count_include_pad = avgpool_config.count_include_pad; divisor_override = avgpool_config.divisor_override; @@ -204,15 +194,15 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam { cpu_avgpool_forward_2d(input, ref_output, - static_cast(N), - static_cast(C), - static_cast(H), - static_cast(W), - static_cast(OH), - static_cast(OW), - ksize_long, - stride_long, - padding_long, + N, + C, + H, + W, + OH, + OW, + ksize, + stride, + padding, count_include_pad, divisor_override); } @@ -220,17 +210,17 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam { cpu_avgpool_forward_3d(input, ref_output, - static_cast(N), - static_cast(C), - 
static_cast(D), - static_cast(H), - static_cast(W), - static_cast(OD), - static_cast(OH), - static_cast(OW), - ksize_long, - stride_long, - padding_long, + N, + C, + D, + H, + W, + OD, + OH, + OW, + ksize, + stride, + padding, count_include_pad, divisor_override); } @@ -270,11 +260,8 @@ struct AvgPoolTestFwd : public ::testing::TestWithParam tensor output; tensor ref_output; tensor ksize; - tensor ksize_long; tensor stride; - tensor stride_long; tensor padding; - tensor padding_long; bool ceil_mode; bool count_include_pad; @@ -370,35 +357,35 @@ struct AvgPoolTestBwd : public ::testing::TestWithParam { cpu_avgpool_backward_2d(output_grad, ref_input_grad, - static_cast(N), - static_cast(C), - static_cast(H), - static_cast(W), - static_cast(OH), - static_cast(OW), + N, + C, + H, + W, + OH, + OW, ksize, stride, padding, count_include_pad, - static_cast(divisor_override)); + divisor_override); } else if(dims == 5) { cpu_avgpool_backward_3d(output_grad, ref_input_grad, - static_cast(N), - static_cast(C), - static_cast(D), - static_cast(H), - static_cast(W), - static_cast(OD), - static_cast(OH), - static_cast(OW), + N, + C, + D, + H, + W, + OD, + OH, + OW, ksize, stride, padding, count_include_pad, - static_cast(divisor_override)); + divisor_override); } status = miopen::avgpool::AvgPoolBackward(handle, output_grad.desc, From 51999ebb84d0a95d320817bb3d0374706d59d2a7 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Mon, 14 Oct 2024 14:05:40 +0700 Subject: [PATCH 27/29] rm unrelated --- driver/avgpool_driver.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index c0b630e577..227a085ab0 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -84,8 +84,6 @@ class AvgPoolDriver : public Driver private: InputFlags inflags; - int forw; - miopenTensorDescriptor_t inputDesc; miopenTensorDescriptor_t outputDesc; miopenTensorDescriptor_t inputGradDesc; @@ -126,13 +124,6 @@ int 
AvgPoolDriver::ParseCmdLineArgs(int argc, char* argv[]) miopenEnableProfiling(GetHandle(), true); } - forw = inflags.GetValueInt("forw"); - - if(forw != 0 && forw != 1) - { - MIOPEN_THROW("Invalid Forward Mode"); - } - return miopenStatusSuccess; } From 5a261135caa6cd7828e677ccc26bdde62cf28717 Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 15 Oct 2024 14:59:06 +0700 Subject: [PATCH 28/29] rm unused code --- src/avgpool.cpp | 6 +-- src/avgpool/problem_description.cpp | 12 ++---- .../miopen/avgpool/problem_description.hpp | 37 +++---------------- 3 files changed, 12 insertions(+), 43 deletions(-) diff --git a/src/avgpool.cpp b/src/avgpool.cpp index 216241a643..db6f2e72f6 100644 --- a/src/avgpool.cpp +++ b/src/avgpool.cpp @@ -52,8 +52,7 @@ miopenStatus_t AvgPoolForward(Handle& handle, const bool count_include_pad, const int64_t divisor_override) { - const auto problem = - avgpool::FwdProblemDescription{inputDesc, outputDesc, count_include_pad, divisor_override}; + const auto problem = avgpool::FwdProblemDescription{inputDesc, outputDesc}; const auto invoke_params = [&]() { auto tmp = avgpool::FwdInvokeParams{}; @@ -102,8 +101,7 @@ miopenStatus_t AvgPoolBackward(Handle& handle, const bool count_include_pad, const int64_t divisor_override) { - const auto problem = avgpool::BwdProblemDescription{ - outputGradDesc, inputGradDesc, count_include_pad, divisor_override}; + const auto problem = avgpool::BwdProblemDescription{outputGradDesc, inputGradDesc}; const auto invoke_params = [&]() { auto tmp = avgpool::BwdInvokeParams{}; diff --git a/src/avgpool/problem_description.cpp b/src/avgpool/problem_description.cpp index e4952d6985..489c63e6d4 100644 --- a/src/avgpool/problem_description.cpp +++ b/src/avgpool/problem_description.cpp @@ -46,10 +46,8 @@ inline std::ostream& operator<<(std::ostream& os, const std::vector& v) NetworkConfig FwdProblemDescription::MakeNetworkConfig() const { - auto input_size = inputDesc.GetLengths(); - auto output_size = outputDesc.GetLengths(); 
- auto input_stride = inputDesc.GetStrides(); - auto output_stride = outputDesc.GetStrides(); + auto input_size = inputDesc.GetLengths(); + auto output_size = outputDesc.GetLengths(); auto input_dtype = inputDesc.GetType(); @@ -66,10 +64,8 @@ NetworkConfig FwdProblemDescription::MakeNetworkConfig() const NetworkConfig BwdProblemDescription::MakeNetworkConfig() const { - auto input_grad_size = inputGradDesc.GetLengths(); - auto output_grad_size = outputGradDesc.GetLengths(); - auto input_grad_stride = inputGradDesc.GetStrides(); - auto output_grad_stride = outputGradDesc.GetStrides(); + auto input_grad_size = inputGradDesc.GetLengths(); + auto output_grad_size = outputGradDesc.GetLengths(); auto input_dtype = inputGradDesc.GetType(); diff --git a/src/include/miopen/avgpool/problem_description.hpp b/src/include/miopen/avgpool/problem_description.hpp index c6f50700c7..e71ba5e617 100644 --- a/src/include/miopen/avgpool/problem_description.hpp +++ b/src/include/miopen/avgpool/problem_description.hpp @@ -36,31 +36,10 @@ struct NetworkConfig; namespace avgpool { -struct ProblemDescription : ProblemDescriptionBase +struct FwdProblemDescription : ProblemDescriptionBase { - ProblemDescription(const bool count_include_pad_, const int64_t divisor_override_) - : count_include_pad(count_include_pad_), divisor_override(divisor_override_) - { - if(divisor_override < 0) - { - MIOPEN_THROW(miopenStatusBadParm, "AvgPool: divisor_override must be non-negative."); - } - } - -protected: - bool count_include_pad; - int64_t divisor_override; -}; - -struct FwdProblemDescription : ProblemDescription -{ - FwdProblemDescription(const TensorDescriptor& inputDesc_, - const TensorDescriptor& outputDesc_, - const bool count_include_pad_, - const int64_t divisor_override_) - : ProblemDescription(count_include_pad_, divisor_override_), - inputDesc(inputDesc_), - outputDesc(outputDesc_) + FwdProblemDescription(const TensorDescriptor& inputDesc_, const TensorDescriptor& outputDesc_) + : 
inputDesc(inputDesc_), outputDesc(outputDesc_) { IsValidLength(); IsSameType(); @@ -115,15 +94,11 @@ struct FwdProblemDescription : ProblemDescription TensorDescriptor outputDesc; }; -struct BwdProblemDescription : ProblemDescription +struct BwdProblemDescription : ProblemDescriptionBase { BwdProblemDescription(const TensorDescriptor& outputGradDesc_, - const TensorDescriptor& inputGradDesc_, - const bool count_include_pad_, - const int64_t divisor_override_) - : ProblemDescription(count_include_pad_, divisor_override_), - outputGradDesc(outputGradDesc_), - inputGradDesc(inputGradDesc_) + const TensorDescriptor& inputGradDesc_) + : outputGradDesc(outputGradDesc_), inputGradDesc(inputGradDesc_) { IsValidLength(); IsSameType(); From 0bba9ade16845acd3c7c3376cf805f6f01b2f5dc Mon Sep 17 00:00:00 2001 From: hieule88 Date: Tue, 15 Oct 2024 15:12:07 +0700 Subject: [PATCH 29/29] small fix --- driver/avgpool_driver.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/avgpool_driver.hpp b/driver/avgpool_driver.hpp index 227a085ab0..65d0f9d001 100644 --- a/driver/avgpool_driver.hpp +++ b/driver/avgpool_driver.hpp @@ -327,7 +327,7 @@ int AvgPoolDriver::AllocateBuffersAndCopy() if(status != 0) { std::cout << "Error copying data to GPU\n" << std::endl; - return miopenStatusAllocFailed; + return miopenStatusInternalError; } return miopenStatusSuccess;