From 9873c364d52c170f7f0a4e8871b3e04117c4783b Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 4 Sep 2024 14:12:26 +0000 Subject: [PATCH 01/27] Initial attempt to enable NHWC layout for batch norm driver command --- driver/bn_driver.hpp | 1261 ++++++++++++++++------------------- driver/dm_bnorm.cpp | 4 +- driver/gemm_driver.hpp | 2 +- test/fusionHost.hpp | 34 +- test/gtest/bn.hpp | 5 + test/gtest/bn_test_data.hpp | 61 +- 6 files changed, 646 insertions(+), 721 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 4b94ac42d8..9c78bfb869 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -33,12 +33,15 @@ #include "tensor_driver.hpp" #include "timer.hpp" #include "util_driver.hpp" +#include "rocrand_wrapper.hpp" #include "../test/verify.hpp" +#include "../test/fusionHost.hpp" #include #include #include +#include "miopen/batch_norm.hpp" #include #include @@ -60,9 +63,147 @@ #define MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D 1 // Resolves issue #1974 +//======================== + + +template +class GpumemTensor +{ + std::unique_ptr dev; + tensor host; + bool is_gpualloc = false; + +public: + void SetGpuallocMode(bool v) { is_gpualloc = v; } + tensor& GetTensor() { return host; } + + void AllocOnHost(miopenTensorDescriptor_t t) + { + host = tensor(miopen::deref(t)); + if(is_gpualloc) // We do not need host data. + { + host.data.clear(); + host.data.shrink_to_fit(); // To free host memory. + } + } + template + void AllocOnHost(tensor t) + { + AllocOnHost(&t.desc); + } + + std::vector& GetVector() + { + if(is_gpualloc) + MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " + "'--gpualloc 1' mode"); + return host.data; + } + + Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } + std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } + + void + InitHostData(const size_t sz, // + const bool do_write, // If set to false, then only generate random data. 
This is + // necessary to reproduce values in input buffers even if some + // directions are skipped. For example, inputs for Backward + // will be the same for both "-F 0" and "-F 2". + std::function generator) + { + if(is_gpualloc) + { + /// In gpualloc mode, we do not care about reproducibility of results, because + /// validation is not used. Therefore, we do not have to always generate random value + /// (\ref move_rand) + return; + } + + for(size_t i = 0; i < sz; ++i) + { + /// \anchor move_rand + /// Generate random value, even if buffer is unused. This provides the same + /// initialization of input buffers regardless of which kinds of + /// convolutions are currently selectedfor testing (see the "-F" option). + /// Verification cache would be broken otherwise. + auto val = generator(); + if(do_write) + GetVector()[i] = val; + } + } + + status_t AllocOnDevice(stream, context_t ctx, const size_t sz) + { + dev = std::make_unique(ctx, sz, sizeof(Tgpu)); + return STATUS_SUCCESS; + } + + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) + { + AllocOnDevice(q, ctx, sz); + if(is_gpualloc) + { + /// \anchor gpualloc_random_init + /// In gpualloc mode, we do not want to leave input buffers uninitialized, because + /// there could be NaNs and Infs, which may affect the performance (which we are + /// interested to evaluate in this mode). Initialization with all 0's is not the + /// best choice as well, because GPU HW may optimize out computations with 0's and + /// that could affect performance of kernels too. That is why we are using + /// rocrand to initialize input buffers. + /// + /// However we do not care about precision in gpualloc mode, because validation + /// is not used. Therefore, range (0,1] is fine. 
+ return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, GetVectorData()); + } + + template + status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) + { + static_assert(std::is_same::value // + || std::is_same::value, // + "Before enabling more types, check thoroughly."); + dev = std::make_unique(ctx, sz, sizeof(T)); + return STATUS_SUCCESS; + } + + template + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) + { + AllocOnDevice(q, ctx, sz, init); + if(is_gpualloc) + { + /// \ref gpualloc_random_init + return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, init.data()); + } + + status_t CopyFromDeviceToHost(stream q) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); + } + + template + status_t CopyFromDeviceToHost(stream q, tensor& t) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); + } + + template + status_t CopyFromDeviceToHost(stream q, std::vector& v) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data()); + } + + auto GetDevicePtr() -> auto { return dev->GetMem(); } +}; +//======================== + //#define BN_RUNFOR_PROFILER -template +template class BatchNormDriver : public Driver { public: @@ -70,9 +211,9 @@ class BatchNormDriver : public Driver { miopenCreateTensorDescriptor(&inputTensor); miopenCreateTensorDescriptor(&outputTensor); - miopenCreateTensorDescriptor(&biasScaleTensor); - miopenCreateTensorDescriptor(&dxOutputTensor); - miopenCreateTensorDescriptor(&dyInputTensor); + // miopenCreateTensorDescriptor(&biasScaleTensor); + // miopenCreateTensorDescriptor(&dxOutputTensor); + // miopenCreateTensorDescriptor(&dyInputTensor); data_type = (sizeof(Tgpu) == 4) ? 
miopenFloat : miopenHalf; } @@ -100,9 +241,9 @@ class BatchNormDriver : public Driver void runGPUBwd(Tref epsilon, float alpha, float beta); void runCPUFwdInference( - Tref epsilon, int batch_sz, int channels, int height, int width, int depth = 0); + Tref epsilon); void runCPUFwdTrain( - Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth = 0); + Tref epsilon, Tref eAF); int VerifyBackward() override; int VerifyForward() override; @@ -111,13 +252,15 @@ class BatchNormDriver : public Driver { miopenDestroyTensorDescriptor(outputTensor); miopenDestroyTensorDescriptor(inputTensor); - miopenDestroyTensorDescriptor(biasScaleTensor); - miopenDestroyTensorDescriptor(dxOutputTensor); - miopenDestroyTensorDescriptor(dyInputTensor); + // miopenDestroyTensorDescriptor(biasScaleTensor); + // miopenDestroyTensorDescriptor(dxOutputTensor); + // miopenDestroyTensorDescriptor(dyInputTensor); } private: miopenBatchNormMode_t bn_mode; + miopenActivationMode_t activ_mode = miopenActivationRELU; + bool saveMeanVar; bool bsaveMeanVar; bool keepRunningMeanVar; @@ -126,67 +269,84 @@ class BatchNormDriver : public Driver int forw; int back; + bool isFwdInfer = false; + bool isFwdTrain = false; + bool isBwd = false; + InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; - miopenTensorDescriptor_t biasScaleTensor; + miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t outputTensor; + // // forward + // miopenTensorDescriptor_t scaleTensor; + // miopenTensorDescriptor_t biasTensor; + + // // forward inference + // miopenTensorDescriptor_t estMean; + // miopenTensorDescriptor_t estVariance; - // Backwards - miopenTensorDescriptor_t dyInputTensor; - miopenTensorDescriptor_t dxOutputTensor; - - std::unique_ptr dyin_dev; // this is the output of fwd - std::unique_ptr in_dev; - std::unique_ptr out_dev; - std::unique_ptr scale_dev; - std::unique_ptr bias_dev; - - std::unique_ptr dxout_dev; - std::unique_ptr 
dscale_dev; - std::unique_ptr dbias_dev; - - std::unique_ptr runningMean_dev; - std::unique_ptr runningVariance_dev; - std::unique_ptr saveMean_dev; - std::unique_ptr saveInvVariance_dev; - - std::vector dyin; // output of forward - std::vector in; - std::vector out; - std::vector out_host; - std::vector dxout; - std::vector dxout_host; - - std::vector scale; - std::vector scale_host; - std::vector bias; - std::vector bias_host; - - std::vector dscale; - std::vector dscale_host; - std::vector dbias; - std::vector dbias_host; - - std::vector runningMean; - std::vector runningVariance; - std::vector runningMean_host; - std::vector runningVariance_host; - - std::vector saveMean; - std::vector saveInvVariance; - - std::vector saveMean_host; - std::vector saveInvVariance_host; - - int createSaveBuffers(); - int createRunningBuffers(); + // // forward training + // miopenTensorDescriptor_t savedMean; + // miopenTensorDescriptor_t savedVariance; + // miopenTensorDescriptor_t runMean; + // miopenTensorDescriptor_t runVariance; + + // // backward + // miopenTensorDescriptor_t bnScale; + // miopenTensorDescriptor_t dy; + + // miopenTensorDescriptor_t dScale; + // miopenTensorDescriptor_t dBias; + // miopenTensorDescriptor_t savedMean; + // miopenTensorDescriptor_t savedInvVar; + + // -------------- + + + GpumemTensor in; // done + GpumemTensor out; // done + tensor out_ref; + + + // forward + GpumemTensor scale; // done + GpumemTensor bias; // done + + // forward inference + GpumemTensor estMean; + GpumemTensor estVariance; + + // forward training + GpumemTensor savedMean; + tensor savedMean_ref; + + GpumemTensor savedVariance; + tensor savedVariance_ref; + + GpumemTensor runMean; + tensor runMean_ref; + + GpumemTensor runVariance; + tensor runVariance_ref; + + + // backward + GpumemTensor bnScale; + GpumemTensor dy; + + GpumemTensor dScale; + tensor dScale_ref; + GpumemTensor dBias; + tensor dBias_ref; + + GpumemTensor savedInvVar; + Tref maxval; }; -template -int 
BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +template +int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); @@ -198,56 +358,56 @@ int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) return miopenStatusSuccess; } -template -int BatchNormDriver::GetandSetData() +template +int BatchNormDriver::GetandSetData() { SetBNParametersFromCmdLineArgs(); std::vector in_len = GetInputTensorLengthsFromCmdLine(); - - std::vector sb_len; - if(bn_mode == miopenBNPerActivation) + + // change this to supoort NHWC too + in.AllocOnHost(tensor{miopenTensorNCHW, in_len}); + out.AllocOnHost(tensor{miopenTensorNCHW, in_len}); + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + in.GetTensor().desc, + bn_mode); + if(isFwdInfer || isFwdTrain) { - // 1xCxHxW | in_len.size = 4 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(in_len[2]); - sb_len.push_back(in_len[3]); - - // 1xCxDxHxW | in_len.size = 5 - if(in_len.size() == 5) - { - sb_len.push_back(in_len[4]); - } + scale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); } - else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(1); - sb_len.push_back(1); - - // 1xCx1x1x1 - if(in_len.size() == 5) - { - sb_len.push_back(1); - } + if(isFwdInfer) + { + estMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); } + if(isFwdTrain && saveMeanVar) + { + savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + } + if(isFwdTrain && keepRunningMeanVar) + { + runMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + 
runVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + } + if(isBwd) + { + bnScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + dy.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - SetTensorNd(inputTensor, in_len, data_type); - SetTensorNd(biasScaleTensor, sb_len, ((sizeof(Tmix) == 4) ? miopenFloat : miopenHalf)); - SetTensorNd(outputTensor, in_len, data_type); - - // backwards - SetTensorNd(dyInputTensor, in_len, data_type); - SetTensorNd(dxOutputTensor, in_len, data_type); - + dScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + } return miopenStatusSuccess; } -template -int BatchNormDriver::AddCmdLineArgs() +template +int BatchNormDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", @@ -294,8 +454,8 @@ int BatchNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() +template +std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); @@ -317,8 +477,8 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } } -template -int BatchNormDriver::SetBNParametersFromCmdLineArgs() +template +int BatchNormDriver::SetBNParametersFromCmdLineArgs() { // double bnAlpha = inflags.GetValueDouble("alpha"); @@ -395,242 +555,61 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() forw = 1; } - return miopenStatusSuccess; -} - -template -int BatchNormDriver::createSaveBuffers() -{ - - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t sb_sz = 
GetTensorSize(biasScaleTensor); - - if(saveMeanVar) + if(forw == 1) { - // GPU allocation - saveMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - saveInvVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(back == 1) - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - saveMean[i] = prng::gen_canonical(); - saveMean_host[i] = static_cast(saveMean[i]); - saveInvVariance[i] = prng::gen_canonical(); - saveInvVariance_host[i] = static_cast(saveInvVariance[i]); - } - } - else - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - } - // GPU data transfer - status |= saveMean_dev->ToGPU(q, saveMean.data()); - status |= saveInvVariance_dev->ToGPU(q, saveInvVariance.data()); + isFwdInfer = true; } - else + else if(forw == 2) { - saveMean_dev = nullptr; - saveInvVariance_dev = nullptr; + isFwdTrain = true; + } + else{ + isBwd = true; } - - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); return miopenStatusSuccess; } -template -int BatchNormDriver::createRunningBuffers() + +template +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); #if MIOPEN_BACKEND_OPENCL clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #endif - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(keepRunningMeanVar) + status |= in.AllocOnDeviceAndInit(q, ctx, in.GetTensor().desc.GetElementSpace()); + status |= out.AllocOnDeviceAndInit(q, ctx, 
out.GetTensor().desc.GetElementSpace()); + if(isFwdInfer || isFwdTrain) { - // GPU allocation - runningMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - runningVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(forw == 2) - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - runningMean[i] = prng::gen_canonical(); - runningMean_host[i] = static_cast(runningMean[i]); - runningVariance[i] = prng::gen_canonical(); - runningVariance_host[i] = static_cast(runningVariance[i]); - } - } - else - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, static_cast(0)); - } - - // GPU data transfer - status |= runningMean_dev->ToGPU(q, runningMean.data()); - status |= runningVariance_dev->ToGPU(q, runningVariance.data()); + status |= scale.AllocOnDeviceAndInit(q, ctx, scale.GetTensor().desc.GetElementSpace()); + status |= bias.AllocOnDeviceAndInit(q, ctx, bias.GetTensor().desc.GetElementSpace()); } - else + if(isFwdInfer) { - runningMean_dev = nullptr; - runningVariance_dev = nullptr; + status |= estMean.AllocOnDeviceAndInit(q, ctx, estMean.GetTensor().desc.GetElementSpace()); + status |= estVariance.AllocOnDeviceAndInit(q, ctx, estVariance.GetTensor().desc.GetElementSpace()); } - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); - - return miopenStatusSuccess; -} - -template -int BatchNormDriver::AllocateBuffersAndCopy() -{ - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, 
CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t in_sz = GetTensorSize(inputTensor); - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(forw) + if(isFwdTrain && saveMeanVar) { - - size_t out_sz = GetTensorSize(outputTensor); - - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - bias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - out = std::vector(out_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); - bias = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - out_host = std::vector(out_sz, static_cast(0)); - scale_host = std::vector(sb_sz, static_cast(0)); - bias_host = std::vector(sb_sz, static_cast(0)); - - // Data initialization - for(int i = 0; i < in_sz; i++) - { - in[i] = prng::gen_canonical(); - } - status |= in_dev->ToGPU(q, in.data()); - - // Using random beta and gamma - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - scale_host[i] = static_cast(scale[i]); - bias[i] = prng::gen_canonical(); - bias_host[i] = static_cast(bias[i]); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= bias_dev->ToGPU(q, bias.data()); - status |= out_dev->ToGPU(q, out.data()); - - if(forw == 1) - { // training - status |= createRunningBuffers(); - status |= createSaveBuffers(); - } - else if(forw == 2) - { // inference - status |= createRunningBuffers(); - } - } // end forward - - if(back == 1) + status |= savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedVariance.AllocOnDeviceAndInit(q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); + } + if(isFwdTrain && keepRunningMeanVar) { + status |= runMean.AllocOnDeviceAndInit(q, ctx, runMean.GetTensor().desc.GetElementSpace()); + 
status |= runVariance.AllocOnDeviceAndInit(q, ctx, runVariance.GetTensor().desc.GetElementSpace()); + } + if(isBwd) + { + status |= bnScale.AllocOnDeviceAndInit(q, ctx, bnScale.GetTensor().desc.GetElementSpace()); + status |= dy.AllocOnDeviceAndInit(q, ctx, dy.GetTensor().desc.GetElementSpace()); - size_t out_sz = GetTensorSize(dxOutputTensor); - - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dyin_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dxout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - dscale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - dbias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - dyin = std::vector(in_sz, static_cast(0)); - dxout = std::vector(out_sz, static_cast(0)); - dscale = std::vector(sb_sz, static_cast(0)); - dbias = std::vector(sb_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - dxout_host = std::vector(out_sz, static_cast(0)); - dscale_host = std::vector(sb_sz, static_cast(0)); - dbias_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= dscale_dev->ToGPU(q, dscale.data()); - status |= dbias_dev->ToGPU(q, dbias.data()); - - for(int i = 0; i < in_sz; i++) - { - dyin[i] = prng::gen_canonical(); - in[i] = prng::gen_canonical(); - } - status |= dyin_dev->ToGPU(q, dyin.data()); - status |= in_dev->ToGPU(q, in.data()); - status |= dxout_dev->ToGPU(q, dxout.data()); - - status |= createSaveBuffers(); + status |= dScale.AllocOnDeviceAndInit(q, ctx, dScale.GetTensor().desc.GetElementSpace()); + status |= dBias.AllocOnDeviceAndInit(q, ctx, dBias.GetTensor().desc.GetElementSpace()); + status |= 
savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedInvVar.AllocOnDeviceAndInit(q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); } if(status != STATUS_SUCCESS) @@ -639,8 +618,8 @@ int BatchNormDriver::AllocateBuffersAndCopy() return miopenStatusSuccess; } -template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) +template +void BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) { if(keepRunningMeanVar) @@ -649,15 +628,15 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a bn_mode, &alpha, &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + estMean.GetDevicePtr(), + estVariance.GetDevicePtr(), epsilon); } else @@ -666,13 +645,13 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a bn_mode, &alpha, &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), nullptr, nullptr, epsilon); @@ -681,8 +660,8 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a return; } -template -void BatchNormDriver::runGPUFwdTrain(Tref epsilon, +template +void BatchNormDriver::runGPUFwdTrain(Tref epsilon, Tref eAF, float alpha, float beta) @@ -690,107 +669,107 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, if(saveMeanVar && keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - 
outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - 
outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } -template -int BatchNormDriver::RunForwardGPU() +template +int BatchNormDriver::RunForwardGPU() { float alpha = static_cast(1), beta = static_cast(0); @@ -889,42 +868,33 @@ int BatchNormDriver::RunForwardGPU() return miopenStatusSuccess; } -template -void BatchNormDriver::runCPUFwdInference( - Tref epsilon, int batch_sz, int channels, int height, int width, int depth) +template +void BatchNormDriver::runCPUFwdInference( + Tref epsilon) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdInferPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? 
depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + // handle 3d case + batchNormPerActivHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdInferSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + batchNormSpatialHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor() + ); + } else { @@ -935,60 +905,36 @@ void BatchNormDriver::runCPUFwdInference( return; } -template -void BatchNormDriver::runCPUFwdTrain( - Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth) +template +void BatchNormDriver::runCPUFwdTrain( + Tref epsilon, Tref eAF) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdTrainPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? 
depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + batchNormPerActHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdTrainSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else { @@ -998,22 +944,9 @@ void BatchNormDriver::runCPUFwdTrain( } } -template -int BatchNormDriver::RunForwardCPU() +template +int BatchNormDriver::RunForwardCPU() { - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); Tref eAF = static_cast(1.0); @@ -1024,19 +957,19 @@ int BatchNormDriver::RunForwardCPU() { eAF = static_cast(1.0) / (static_cast(i) + static_cast(1.0)); runCPUFwdTrain( - 
epsilon, eAF, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + epsilon, eAF /* alpha, beta,*/); } } else if(forw == 2) { // inference only - runCPUFwdInference(epsilon, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + runCPUFwdInference(epsilon); } return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardGPU() +template +int BatchNormDriver::RunBackwardGPU() { if(!back) @@ -1059,46 +992,47 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr() + ); } else { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + 
epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); @@ -1152,8 +1086,8 @@ int BatchNormDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyForward() +template +int BatchNormDriver::VerifyForward() { // jump out since we are forcing forward off when doing backwards. @@ -1176,27 +1110,27 @@ int BatchNormDriver::VerifyForward() if(keepRunningMeanVar) { // copy back for verification - runningMean_dev->FromGPU(GetStream(), runningMean.data()); - runningVariance_dev->FromGPU(GetStream(), runningVariance.data()); + runMean.CopyFromDeviceToHost(GetStream()); + runVariance.CopyFromDeviceToHost(GetStream()); - auto errorRunMean = miopen::rms_range(runningMean_host, runningMean); + auto errorRunMean = miopen::rms_range(runMean_ref.data, runMean.GetVector()); if(!std::isfinite(errorRunMean) || errorRunMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on running mean: " << errorRunMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runningMean.size() && i < runningMean_host.size() && + for(int i = 0; i < runMean.GetVector().size() && i < runMean_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i]))); + diff = fabs(Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rm[" << i << "]: " << runningMean[i]; - std::cout << ", rm_host[" << i << "]: " << runningMean_host[i]; + std::cout << "rm[" << i << "]: " << runMean.GetVector()[i]; + std::cout << ", rm_host[" << i << "]: " << runMean_ref.data[i]; std::cout << ", diff[" << i - << "]: " << Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i])) + << "]: " << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) << std::endl; } } @@ -1208,24 +1142,24 @@ int BatchNormDriver::VerifyForward() << errorRunMean << ')' << std::endl; } - auto errorRunVar = 
miopen::rms_range(runningVariance_host, runningVariance); + auto errorRunVar = miopen::rms_range(runVariance_ref.data, runVariance.GetVector()); if(!std::isfinite(errorRunVar) || errorRunVar > maxrms) { std::cout << "Forward train batch norm verification FAILED on running variance: " << errorRunVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runningVariance.size() && i < runningVariance_host.size() && + for(int i = 0; i < runVariance.GetVector().size() && i < runVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i]))); + diff = fabs(Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rv[" << i << "]: " << runningVariance[i]; - std::cout << ", rv_host[" << i << "]: " << runningVariance_host[i]; + std::cout << "rv[" << i << "]: " << runVariance.GetVector()[i]; + std::cout << ", rv_host[" << i << "]: " << runVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i])) + << Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i])) << std::endl; } } @@ -1240,10 +1174,14 @@ int BatchNormDriver::VerifyForward() if(saveMeanVar) { // copy back for verification - saveMean_dev->FromGPU(GetStream(), saveMean.data()); - saveInvVariance_dev->FromGPU(GetStream(), saveInvVariance.data()); + // saveMean_dev->FromGPU(GetStream(), savedMean.data()); + // saveInvVariance_dev->FromGPU(GetStream(), savedInvVar.data()); + + savedMean.CopyFromDeviceToHost(GetStream()); + savedVariance.CopyFromDeviceToHost(GetStream()); + maxval = static_cast(0.0); - auto errorSaveMean = miopen::rms_range(saveMean_host, saveMean); + auto errorSaveMean = miopen::rms_range(savedVariance_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) { std::cout << "Forward train batch norm verification FAILED 
on saved mean: " @@ -1251,17 +1189,17 @@ int BatchNormDriver::VerifyForward() anError = true; #if(MIO_BN_DEBUG == 1) for(int i = 0; - i < saveMean.size() && i < saveMean_host.size() && i < MIO_BN_MAX_DEBUGLOOP; + i < savedMean.GetVector().size() && i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i]))); + diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sm[" << i << "]: " << saveMean[i]; - std::cout << ", sm_host[" << i << "]: " << saveMean_host[i]; + std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; + std::cout << ", sm_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i - << "]: " << Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i])) + << "]: " << Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1274,7 +1212,7 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << ')' << std::endl; } - auto errorSaveVar = miopen::rms_range(saveInvVariance_host, saveInvVariance); + auto errorSaveVar = miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); if(!std::isfinite(errorSaveVar) || errorSaveVar > maxrms) { std::cout @@ -1282,17 +1220,17 @@ int BatchNormDriver::VerifyForward() << errorSaveVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < saveInvVariance.size() && i < saveInvVariance_host.size() && + for(int i = 0; i < savedVariance.GetVector().size() && i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i]))); + diff = fabs(Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sv[" << i << "]: " << saveInvVariance[i]; - std::cout << ", sv_host[" << i << "]: " 
<< saveInvVariance_host[i]; + std::cout << "sv[" << i << "]: " << savedVariance.GetVector()[i]; + std::cout << ", sv_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i])) + << Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1308,37 +1246,39 @@ int BatchNormDriver::VerifyForward() } // Check output tensor error - out_dev->FromGPU(GetStream(), out.data()); + // out_dev->FromGPU(GetStream(), out.data()); + out.CopyFromDeviceToHost(GetStream()); + maxval = static_cast(0.0); - auto errorOut = miopen::rms_range(out_host, out); + auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) unsigned int count = 0; - for(int i = 0; i < out.size() && i < out_host.size(); i++) + for(int i = 0; i < out.GetVector().size() && i < out_ref.data.size(); i++) { - if(std::isnan(out[i])) + if(std::isnan(out.GetVector()[i])) { - std::cout << "out[" << i << "] produced a nan: " << out[i] << std::endl; + std::cout << "out[" << i << "] produced a nan: " << out.GetVector()[i] << std::endl; } - if(std::isnan(out_host[i])) + if(std::isnan(out_ref.data[i])) { - std::cout << "out_host[" << i << "] produced a nan: " << out_host[i] << std::endl; + std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] << std::endl; } - diff = Tref(fabs(out[i]) - fabs(out_host[i])); + diff = Tref(fabs(out.GetVector()[i]) - fabs(out_ref.data[i])); maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "out[" << i << "]: " << out[i]; - std::cout << ", out_host[" << i << "]: " << out_host[i]; - std::cout << ", diff[" << i << "]: " << Tref(out[i] - out_host[i]) << std::endl; + std::cout << "out[" << i << "]: " << out.GetVector()[i]; + std::cout << ", out_ref.data[" << i << "]: " << out_ref.data[i]; + std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) << std::endl; count++; } } - std::cout << "Number of elements: " << out.size() << std::endl; + std::cout << "Number of elements: " << out.GetVector().size() << std::endl; std::cout << "Number of bad elements: " << count << std::endl; std::cout << "max difference in output: " << maxval << std::endl; #endif @@ -1358,68 +1298,46 @@ int BatchNormDriver::VerifyForward() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardCPU() +template +int BatchNormDriver::RunBackwardCPU() { if(!back) return miopenStatusSuccess; - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alphaDiff = 1, betaDiff = 0; // T alphaParam = 1, betaParam = 0; - Tref epsilon = static_cast(EPSILON); + float alpha = static_cast(1), beta = static_cast(0), gamma = static_cast(1); if(bn_mode == miopenBNPerActivation) - { // 1xCxHxW - miopenBNBwdPerActivationRunHost(/* alphaDiff, betaDiff, alphaParam, - betaParam, */ - batch_sz, - channels, - (isDepthSpecified ? 
depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + { + // 1xCxHxW + batchNormActivSpatialHostBwdTrain(activ_mode, + gamma, + beta, + alpha, + in.GetTensor(), + dy.GetTensor(), + out.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); + } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNBwdSpatialRunHost(/* alphaDiff, betaDiff, alphaParam, betaParam, - */ - batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + scale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else { @@ -1431,8 +1349,8 @@ int BatchNormDriver::RunBackwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyBackward() +template +int BatchNormDriver::VerifyBackward() { if(!back) @@ -1442,34 +1360,35 @@ int BatchNormDriver::VerifyBackward() bool anError = false; RunBackwardCPU(); - - dxout_dev->FromGPU(GetStream(), dxout.data()); - dscale_dev->FromGPU(GetStream(), dscale.data()); - dbias_dev->FromGPU(GetStream(), dbias.data()); + + out.CopyFromDeviceToHost(GetStream()); + dScale.CopyFromDeviceToHost(GetStream()); + dBias.CopyFromDeviceToHost(GetStream()); + #if(MIO_BN_DEBUG == 1) const Tref tolerance = static_cast(1000 * (sizeof(Tgpu) == 4) ? 
ERRTOL_FP32 : ERRTOL_FP16); Tref diff = static_cast(0.0); #endif maxval = static_cast(0.0); - auto errordxout = miopen::rms_range(dxout_host, dxout); + auto errordxout = miopen::rms_range(out_ref.data, out.GetVector()); if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dxout.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tgpu(fabs(dxout[i]) - fabs(dxout_host[i]))); + diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dxout[" << i << "]: " << dxout[i]; - std::cout << "\tdxout_host[" << i << "]: " << dxout_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(dxout[i]) - fabs(dxout_host[i])); + std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; + std::cout << "\tout.GetVector()[" << i << "]: " << out.GetVector()[i]; + std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(dxout[i]) - fabs(dxout_host[i])) / fabs(dxout_host[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / fabs(out.GetVector()[i]) << std::endl; } } @@ -1483,25 +1402,25 @@ int BatchNormDriver::VerifyBackward() } maxval = static_cast(0.0); - auto errordscale = miopen::rms_range(dscale_host, dscale); + auto errordscale = miopen::rms_range(dScale_ref.data, dScale.GetVector()); if(!std::isfinite(errordscale) || errordscale > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dscale: " << errordscale << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dscale.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dScale.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = 
fabs(Tmix(fabs(dscale[i]) - fabs(dscale_host[i]))); + diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dscale[" << i << "]: " << dscale[i]; - std::cout << "\tdscale_host[" << i << "]: " << dscale_host[i]; + std::cout << "dscale[" << i << "]: " << dScale.GetVector()[i]; + std::cout << "\tdscale_host[" << i << "]: " << dScale_ref.data[i]; std::cout << "\tdiff[" << i - << "]: " << Tmix(fabs(dscale[i]) - fabs(dscale_host[i])); + << "]: " << Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dscale[i]) - fabs(dscale_host[i])) / fabs(dscale_host[i]) + << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / fabs(dScale_ref.data[i]) << std::endl; } } @@ -1514,23 +1433,23 @@ int BatchNormDriver::VerifyBackward() << ')' << std::endl; } - auto errordbias = miopen::rms_range(dbias_host, dbias); + auto errordbias = miopen::rms_range(dBias_ref.data, dBias.GetVector()); if(!std::isfinite(errordbias) || errordbias > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dbias: " << errordbias << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dbias.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dbias[i]) - fabs(dbias_host[i]))); + diff = fabs(Tmix(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dbias[" << i << "]: " << dbias[i]; - std::cout << "\tdbias_host[" << i << "]: " << dbias_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tmix(fabs(dbias[i]) - fabs(dbias_host[i])); + std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; + std::cout << "\tdbias_host[" << i << "]: " << dBias_ref.data[i]; + std::cout << "\tdiff[" << i << "]: " << Tmix(fabs( dBias.GetVector()[i]) - 
fabs(dBias_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dbias[i]) - fabs(dbias_host[i])) / fabs(dbias_host[i]) + << fabs(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / fabs(dBias_ref.data[i]) << std::endl; } } diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index c7bab90bb5..4333787e5e 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -29,9 +29,9 @@ static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") - return new BatchNormDriver(); + return new BatchNormDriver(); if(base_arg == "bnormfp16") - return new BatchNormDriver(); + return new BatchNormDriver(); return nullptr; } diff --git a/driver/gemm_driver.hpp b/driver/gemm_driver.hpp index 772104544e..282173101d 100644 --- a/driver/gemm_driver.hpp +++ b/driver/gemm_driver.hpp @@ -148,7 +148,7 @@ class GemmDriver : public Driver std::vector c; std::vector chost; - T alpha, beta; + T alpha, beta, gamma; miopen::GemmDescriptor gemm_desc = { false, false, false, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.0f, 0.0f, miopenFloat, false}; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index c0c49b06b6..2bf2e47c4a 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -171,8 +171,8 @@ void batchNormSpatialHostInference(const tensor& input, template void batchNormPerActivHostInference(const tensor& input, tensor& output, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, const tensor& estimatedMean, const tensor& estimatedVariance) @@ -278,15 +278,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, 
+ const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -334,7 +334,7 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) @@ -347,11 +347,11 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double beta, double alpha, const tensor& x_input, - const tensor& dy_input, + const tensor& dy_input, const tensor& y_input, tensor& dx_out, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, tensor& dscale, tensor& dbias, const tensor& savedMean, @@ -432,8 +432,8 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, template void batchNormPerActHostFwdTrain(const tensor& input, tensor& out, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, double expAvgFactor, tensor& saveMean, diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..16d788a70c 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -120,14 +120,19 @@ struct BNBwdTest : public ::testing::TestWithParam Network1() { // pyt_mlperf_resnet50v1.5 return { - {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 2048, 7, 7, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 
1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; + {4, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 28, 28, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0} + }; } template From 5c8a57b12ac6f0b8da6637d2daef619d6c942f59 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 16:27:42 +0000 Subject: [PATCH 02/27] runs infer, forward train and bwd driver command --- driver/bn_driver.hpp | 595 ++++++++++++++++++++------------------- src/driver_arguments.cpp | 1 + src/tensor.cpp | 1 - test/fusionHost.hpp | 22 +- test/na_train.cpp | 28 +- test/na_train_find2.cpp | 28 +- 6 files changed, 345 insertions(+), 330 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 9c78bfb869..c56fb1a4a3 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -65,7 +65,6 @@ //======================== - template class GpumemTensor { @@ -240,10 +239,8 @@ class BatchNormDriver : public Driver void runGPUFwdTrain(Tref epsilon, Tref eAF, float alpha, float beta); void runGPUBwd(Tref epsilon, float alpha, float beta); - void runCPUFwdInference( - Tref epsilon); - void runCPUFwdTrain( - Tref epsilon, Tref eAF); + void runCPUFwdInference(Tref epsilon); + void 
runCPUFwdTrain(Tref epsilon, Tref eAF); int VerifyBackward() override; int VerifyForward() override; @@ -271,47 +268,21 @@ class BatchNormDriver : public Driver bool isFwdInfer = false; bool isFwdTrain = false; - bool isBwd = false; + bool isBwd = false; InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; + miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t outputTensor; - // // forward - // miopenTensorDescriptor_t scaleTensor; - // miopenTensorDescriptor_t biasTensor; - - // // forward inference - // miopenTensorDescriptor_t estMean; - // miopenTensorDescriptor_t estVariance; - - // // forward training - // miopenTensorDescriptor_t savedMean; - // miopenTensorDescriptor_t savedVariance; - // miopenTensorDescriptor_t runMean; - // miopenTensorDescriptor_t runVariance; - - // // backward - // miopenTensorDescriptor_t bnScale; - // miopenTensorDescriptor_t dy; - - // miopenTensorDescriptor_t dScale; - // miopenTensorDescriptor_t dBias; - // miopenTensorDescriptor_t savedMean; - // miopenTensorDescriptor_t savedInvVar; - // -------------- - - - GpumemTensor in; // done - GpumemTensor out; // done + GpumemTensor in; + GpumemTensor out; tensor out_ref; - // forward - GpumemTensor scale; // done - GpumemTensor bias; // done + GpumemTensor scale; + GpumemTensor bias; // forward inference GpumemTensor estMean; @@ -320,29 +291,26 @@ class BatchNormDriver : public Driver // forward training GpumemTensor savedMean; tensor savedMean_ref; - GpumemTensor savedVariance; tensor savedVariance_ref; - GpumemTensor runMean; tensor runMean_ref; - GpumemTensor runVariance; tensor runVariance_ref; - // backward GpumemTensor bnScale; - GpumemTensor dy; + GpumemTensor dy; GpumemTensor dScale; tensor dScale_ref; GpumemTensor dBias; tensor dBias_ref; - GpumemTensor savedInvVar; - + Tref maxval; + + miopenTensorLayout_t bn_layout; }; template @@ -365,43 +333,42 @@ int BatchNormDriver::GetandSetData() SetBNParametersFromCmdLineArgs(); 
std::vector in_len = GetInputTensorLengthsFromCmdLine(); - - // change this to supoort NHWC too - in.AllocOnHost(tensor{miopenTensorNCHW, in_len}); - out.AllocOnHost(tensor{miopenTensorNCHW, in_len}); + + in.AllocOnHost(tensor{bn_layout, in_len}); + out.AllocOnHost(tensor{bn_layout, in_len}); auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - in.GetTensor().desc, - bn_mode); + miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); if(isFwdInfer || isFwdTrain) { - scale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - bias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } if(isFwdInfer) { - estMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - estVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } - if(isFwdTrain && saveMeanVar) + else if(isFwdTrain) { - savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - savedVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } - if(isFwdTrain && keepRunningMeanVar) + else if(isBwd) { - runMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - runVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dy.AllocOnHost(tensor{bn_layout, in_len}); + + 
dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } - if(isBwd) + else { - bnScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - - dScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - dBias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - savedInvVar.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + std::cout << "\nUnknown batch norm state!\n"; + exit(EXIT_FAILURE); } return miopenStatusSuccess; } @@ -425,6 +392,14 @@ int BatchNormDriver::AddCmdLineArgs() inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); + + inflags.AddInputFlag("layout", + 'L', + "NCHW", + "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", + "string", + true); + inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float"); inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int"); @@ -484,6 +459,22 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() // double bnAlpha = inflags.GetValueDouble("alpha"); // double bnBeta = inflags.GetValueDouble("beta"); + std::string layout = inflags.GetValueStr("layout"); + + if(layout == "NCHW") + { + bn_layout = miopenTensorNCHW; + } + else if(layout == "NHWC") + { + bn_layout = miopenTensorNHWC; + } + else + { + std::cout << "Cannot handle layout : " << layout << "\n"; + exit(EXIT_FAILURE); // NOLINT 
(concurrency-mt-unsafe) + } + // batch norm mode type if(inflags.GetValueInt("mode") == 0) { @@ -557,20 +548,20 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() if(forw == 1) { - isFwdInfer = true; + isFwdTrain = true; } else if(forw == 2) { - isFwdTrain = true; + isFwdInfer = true; } - else{ + else + { isBwd = true; } return miopenStatusSuccess; } - template int BatchNormDriver::AllocateBuffersAndCopy() { @@ -581,6 +572,7 @@ int BatchNormDriver::AllocateBuffersAndCopy() #endif status |= in.AllocOnDeviceAndInit(q, ctx, in.GetTensor().desc.GetElementSpace()); status |= out.AllocOnDeviceAndInit(q, ctx, out.GetTensor().desc.GetElementSpace()); + out_ref = out.GetTensor(); if(isFwdInfer || isFwdTrain) { status |= scale.AllocOnDeviceAndInit(q, ctx, scale.GetTensor().desc.GetElementSpace()); @@ -589,17 +581,23 @@ int BatchNormDriver::AllocateBuffersAndCopy() if(isFwdInfer) { status |= estMean.AllocOnDeviceAndInit(q, ctx, estMean.GetTensor().desc.GetElementSpace()); - status |= estVariance.AllocOnDeviceAndInit(q, ctx, estVariance.GetTensor().desc.GetElementSpace()); + status |= estVariance.AllocOnDeviceAndInit( + q, ctx, estVariance.GetTensor().desc.GetElementSpace()); } - if(isFwdTrain && saveMeanVar) - { - status |= savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); - status |= savedVariance.AllocOnDeviceAndInit(q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); - } - if(isFwdTrain && keepRunningMeanVar) + if(isFwdTrain) { + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedVariance.AllocOnDeviceAndInit( + q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); status |= runMean.AllocOnDeviceAndInit(q, ctx, runMean.GetTensor().desc.GetElementSpace()); - status |= runVariance.AllocOnDeviceAndInit(q, ctx, runVariance.GetTensor().desc.GetElementSpace()); + status |= runVariance.AllocOnDeviceAndInit( + q, ctx, 
runVariance.GetTensor().desc.GetElementSpace()); + + savedMean_ref = savedMean.GetTensor(); + savedVariance_ref = savedVariance.GetTensor(); + runMean_ref = runMean.GetTensor(); + runVariance_ref = runVariance.GetTensor(); } if(isBwd) { @@ -608,8 +606,13 @@ int BatchNormDriver::AllocateBuffersAndCopy() status |= dScale.AllocOnDeviceAndInit(q, ctx, dScale.GetTensor().desc.GetElementSpace()); status |= dBias.AllocOnDeviceAndInit(q, ctx, dBias.GetTensor().desc.GetElementSpace()); - status |= savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); - status |= savedInvVar.AllocOnDeviceAndInit(q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedInvVar.AllocOnDeviceAndInit( + q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); + + dScale_ref = dScale.GetTensor(); + dBias_ref = dBias.GetTensor(); } if(status != STATUS_SUCCESS) @@ -619,7 +622,9 @@ int BatchNormDriver::AllocateBuffersAndCopy() } template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) +void BatchNormDriver::runGPUFwdInference(Tref epsilon, + float alpha, + float beta) { if(keepRunningMeanVar) @@ -662,109 +667,109 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, fl template void BatchNormDriver::runGPUFwdTrain(Tref epsilon, - Tref eAF, - float alpha, - float beta) + Tref eAF, + float alpha, + float beta) { if(saveMeanVar && keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + 
&out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + 
&out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } @@ -869,20 +874,19 @@ int BatchNormDriver::RunForwardGPU() } template -void BatchNormDriver::runCPUFwdInference( - Tref epsilon) +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW // handle 3d case batchNormPerActivHostInference(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - epsilon, - estMean.GetTensor(), - estVariance.GetTensor()); + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 @@ -892,9 +896,7 @@ void BatchNormDriver::runCPUFwdInference( bias.GetTensor(), epsilon, estMean.GetTensor(), - estVariance.GetTensor() - ); - + estVariance.GetTensor()); } else { @@ -906,35 +908,34 @@ void BatchNormDriver::runCPUFwdInference( } template -void BatchNormDriver::runCPUFwdTrain( - Tref epsilon, Tref eAF) +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW batchNormPerActHostFwdTrain(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - static_cast(epsilon), - static_cast(eAF), - savedMean_ref, - savedVariance_ref, - runMean_ref, - 
runVariance_ref); + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 batchNormSpatialHostFwdTrain(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - static_cast(epsilon), - static_cast(eAF), - savedMean_ref, - savedVariance_ref, - runMean_ref, - runVariance_ref); + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else { @@ -956,8 +957,7 @@ int BatchNormDriver::RunForwardCPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { eAF = static_cast(1.0) / (static_cast(i) + static_cast(1.0)); - runCPUFwdTrain( - epsilon, eAF /* alpha, beta,*/); + runCPUFwdTrain(epsilon, eAF /* alpha, beta,*/); } } else if(forw == 2) @@ -992,47 +992,46 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - &in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedInvVar.GetDevicePtr() - ); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr()); } else { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - 
&in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); @@ -1129,8 +1128,8 @@ int BatchNormDriver::VerifyForward() { std::cout << "rm[" << i << "]: " << runMean.GetVector()[i]; std::cout << ", rm_host[" << i << "]: " << runMean_ref.data[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) << std::endl; } } @@ -1149,17 +1148,19 @@ int BatchNormDriver::VerifyForward() << errorRunVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runVariance.GetVector().size() && i < runVariance_ref.data.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < runVariance.GetVector().size() && + i < runVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); + diff = fabs( + Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "rv[" << i << "]: " << runVariance.GetVector()[i]; std::cout << ", rv_host[" << i << "]: " << runVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i])) + << Tmix(fabs(runVariance.GetVector()[i]) - + 
fabs(runVariance_ref.data[i])) << std::endl; } } @@ -1176,7 +1177,7 @@ int BatchNormDriver::VerifyForward() { // copy back for verification // saveMean_dev->FromGPU(GetStream(), savedMean.data()); // saveInvVariance_dev->FromGPU(GetStream(), savedInvVar.data()); - + savedMean.CopyFromDeviceToHost(GetStream()); savedVariance.CopyFromDeviceToHost(GetStream()); @@ -1188,18 +1189,20 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; - i < savedMean.GetVector().size() && i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedMean.GetVector().size() && + i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); + diff = fabs( + Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; std::cout << ", sm_host[" << i << "]: " << savedVariance_ref.data[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i])) + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(savedMean.GetVector()[i]) - + fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1212,7 +1215,8 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << ')' << std::endl; } - auto errorSaveVar = miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); + auto errorSaveVar = + miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); if(!std::isfinite(errorSaveVar) || errorSaveVar > maxrms) { std::cout @@ -1220,17 +1224,19 @@ int BatchNormDriver::VerifyForward() << errorSaveVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < savedVariance.GetVector().size() && i < savedVariance_ref.data.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i 
= 0; i < savedVariance.GetVector().size() && + i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); + diff = fabs( + Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sv[" << i << "]: " << savedVariance.GetVector()[i]; std::cout << ", sv_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i])) + << Tmix(fabs(savedVariance.GetVector()[i]) - + fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1265,7 +1271,8 @@ int BatchNormDriver::VerifyForward() } if(std::isnan(out_ref.data[i])) { - std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] << std::endl; + std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] + << std::endl; } diff = Tref(fabs(out.GetVector()[i]) - fabs(out_ref.data[i])); maxval = maxval < diff ? 
diff : maxval; @@ -1273,7 +1280,8 @@ int BatchNormDriver::VerifyForward() { std::cout << "out[" << i << "]: " << out.GetVector()[i]; std::cout << ", out_ref.data[" << i << "]: " << out_ref.data[i]; - std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) << std::endl; + std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) + << std::endl; count++; } } @@ -1307,37 +1315,37 @@ int BatchNormDriver::RunBackwardCPU() // T alphaDiff = 1, betaDiff = 0; // T alphaParam = 1, betaParam = 0; - float alpha = static_cast(1), beta = static_cast(0), gamma = static_cast(1); + double alpha = static_cast(1), beta = static_cast(0), + gamma = static_cast(1); if(bn_mode == miopenBNPerActivation) - { + { // 1xCxHxW batchNormActivSpatialHostBwdTrain(activ_mode, - gamma, - beta, - alpha, - in.GetTensor(), - dy.GetTensor(), - out.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); - + gamma, + beta, + alpha, + in.GetTensor(), + out.GetTensor(), + out_ref, + bnScale.GetTensor(), + dy.GetTensor(), + dBias.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - batchNormSpatialHostBwdTrain(in.GetTensor(), - dy.GetTensor(), - out_ref, - scale.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); + { // 1xCx1x1 + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else { @@ -1360,11 +1368,11 @@ int BatchNormDriver::VerifyBackward() bool anError = false; RunBackwardCPU(); - + out.CopyFromDeviceToHost(GetStream()); dScale.CopyFromDeviceToHost(GetStream()); dBias.CopyFromDeviceToHost(GetStream()); - + #if(MIO_BN_DEBUG == 1) const Tref tolerance = static_cast(1000 * (sizeof(Tgpu) == 4) ? 
ERRTOL_FP32 : ERRTOL_FP16); @@ -1386,9 +1394,11 @@ int BatchNormDriver::VerifyBackward() { std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; std::cout << "\tout.GetVector()[" << i << "]: " << out.GetVector()[i]; - std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); + std::cout << "\tdiff[" << i + << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / fabs(out.GetVector()[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / + fabs(out.GetVector()[i]) << std::endl; } } @@ -1420,7 +1430,8 @@ int BatchNormDriver::VerifyBackward() std::cout << "\tdiff[" << i << "]: " << Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / fabs(dScale_ref.data[i]) + << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / + fabs(dScale_ref.data[i]) << std::endl; } } @@ -1440,16 +1451,18 @@ int BatchNormDriver::VerifyBackward() << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); + diff = fabs(Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; + std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; std::cout << "\tdbias_host[" << i << "]: " << dBias_ref.data[i]; - std::cout << "\tdiff[" << i << "]: " << Tmix(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); + std::cout << "\tdiff[" << i + << "]: " << Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / 
fabs(dBias_ref.data[i]) + << fabs(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / + fabs(dBias_ref.data[i]) << std::endl; } } diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 82b4fb156f..97fe16d7c4 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -248,6 +248,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, resultRunningVariance, resultSaveMean, resultSaveInvVariance); + ss << " --layout " << miopen::deref(xDesc).GetLayout_str(); } return ss.str(); } diff --git a/src/tensor.cpp b/src/tensor.cpp index 7ec4c4e581..f0fbd86a7b 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -35,7 +35,6 @@ #include #include #include - namespace miopen { namespace { diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 2bf2e47c4a..0bd8df42e8 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -278,11 +278,11 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template +template void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& dy_input, tensor& dx_out, - const tensor& scale, + const tensor& bnScale, tensor& dscale, tensor& dbias, const tensor& savedMean, @@ -333,7 +333,7 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) @@ -347,11 +347,11 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double beta, double alpha, const tensor& x_input, - const tensor& dy_input, const tensor& y_input, tensor& dx_out, - const tensor& scale, - const tensor& bias, + const tensor& bnScale, + const tensor& dy_input, + const tensor& bias, tensor& dscale, tensor& dbias, const tensor& savedMean, 
@@ -387,7 +387,8 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, elemStd = static_cast(x_input(bidx, cidx, row, column)) - mean; // (x_i - mean) xhat[xhat_index] = elemStd * invVar; - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -408,8 +409,9 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, { // via columns for(int bidx = 0; bidx < n_batch; bidx++) { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -421,7 +423,7 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) diff --git a/test/na_train.cpp b/test/na_train.cpp index e776f4414e..f2c1c9af1b 100644 --- a/test/na_train.cpp +++ b/test/na_train.cpp @@ -302,20 +302,20 @@ struct verify_bwd_batchnorm_spatial_activ std::fill(dgamma.begin(), dgamma.end(), 0.); std::fill(dbeta.begin(), dbeta.end(), 0.); - batchNormActivSpatialHostBwdTrain(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - x, - dy, - y, - dx, - bnscale, - bnbias, - dgamma, - dbeta, - savedMean, - savedInvVar); + // batchNormActivSpatialHostBwdTrain(activ_mode, + // activ_gamma, + // activ_beta, + 
// activ_alpha, + // x, + // y, + // dx, + // bnscale, + // dy, + // bnbias, + // dgamma, + // dbeta, + // savedMean, + // savedInvVar); return std::make_tuple(dx, dgamma, dbeta); } diff --git a/test/na_train_find2.cpp b/test/na_train_find2.cpp index 51868ae77b..edb79874b0 100644 --- a/test/na_train_find2.cpp +++ b/test/na_train_find2.cpp @@ -380,20 +380,20 @@ struct verify_bwd_batchnorm_spatial_activ std::fill(dgamma.begin(), dgamma.end(), 0.); std::fill(dbeta.begin(), dbeta.end(), 0.); - batchNormActivSpatialHostBwdTrain(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - x, - dy, - y, - dx, - bnscale, - bnbias, - dgamma, - dbeta, - savedMean, - savedInvVar); + // batchNormActivSpatialHostBwdTrain(activ_mode, + // activ_gamma, + // activ_beta, + // activ_alpha, + // x, + // y, + // dx, + // bnscale, + // dy, + // bnbias, + // dgamma, + // dbeta, + // savedMean, + // savedInvVar); return std::make_tuple(dx, dgamma, dbeta); } From d4ae0e1c46b2f5eef4e468546a9396ec3ea73c86 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 18:09:02 +0000 Subject: [PATCH 03/27] fix build error --- test/fusionHost.hpp | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 0bd8df42e8..4e61d89edb 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -168,14 +168,14 @@ void batchNormSpatialHostInference(const tensor& input, }); } -template +template void batchNormPerActivHostInference(const tensor& input, tensor& output, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance) + const tensor& estimatedMean, + const tensor& estimatedVariance) { int n_batches, channels, height, width; std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); @@ -278,15 +278,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, 
}); } -template +template void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, + const tensor& dy_input, + tensor& dx_out, const tensor& bnScale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -431,17 +431,17 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, }); // for (channel) } -template +template void batchNormPerActHostFwdTrain(const tensor& input, tensor& out, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; From db2a42f34bf810f031c0624b81c9ece5a30c943f Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 18:21:27 +0000 Subject: [PATCH 04/27] undo template change --- driver/bn_driver.hpp | 128 +++++++++++++++++++++---------------------- driver/dm_bnorm.cpp | 4 +- 2 files changed, 65 insertions(+), 67 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index c56fb1a4a3..72db700a3f 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -202,7 +202,7 @@ class GpumemTensor //#define BN_RUNFOR_PROFILER -template +template class BatchNormDriver : public Driver { public: @@ -285,36 +285,36 @@ class BatchNormDriver : public Driver GpumemTensor bias; // forward inference - GpumemTensor estMean; - GpumemTensor estVariance; + GpumemTensor estMean; + GpumemTensor estVariance; // forward training - GpumemTensor savedMean; - tensor savedMean_ref; - GpumemTensor savedVariance; - tensor savedVariance_ref; - GpumemTensor runMean; - tensor runMean_ref; - GpumemTensor runVariance; - tensor runVariance_ref; + 
GpumemTensor savedMean; + tensor savedMean_ref; + GpumemTensor savedVariance; + tensor savedVariance_ref; + GpumemTensor runMean; + tensor runMean_ref; + GpumemTensor runVariance; + tensor runVariance_ref; // backward GpumemTensor bnScale; - GpumemTensor dy; - GpumemTensor dScale; - tensor dScale_ref; - GpumemTensor dBias; - tensor dBias_ref; - GpumemTensor savedInvVar; + GpumemTensor dy; + GpumemTensor dScale; + tensor dScale_ref; + GpumemTensor dBias; + tensor dBias_ref; + GpumemTensor savedInvVar; Tref maxval; miopenTensorLayout_t bn_layout; }; -template -int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +template +int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); @@ -326,8 +326,8 @@ int BatchNormDriver::ParseCmdLineArgs(int argc, char* arg return miopenStatusSuccess; } -template -int BatchNormDriver::GetandSetData() +template +int BatchNormDriver::GetandSetData() { SetBNParametersFromCmdLineArgs(); @@ -345,25 +345,25 @@ int BatchNormDriver::GetandSetData() } if(isFwdInfer) { - estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isFwdTrain) { - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isBwd) { bnScale.AllocOnHost(tensor{bn_layout, 
derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{bn_layout, in_len}); + dy.AllocOnHost(tensor{bn_layout, in_len}); - dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else { @@ -373,8 +373,8 @@ int BatchNormDriver::GetandSetData() return miopenStatusSuccess; } -template -int BatchNormDriver::AddCmdLineArgs() +template +int BatchNormDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", @@ -429,8 +429,8 @@ int BatchNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() +template +std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); @@ -452,8 +452,8 @@ std::vector BatchNormDriver::GetInputTensorLengthsFr } } -template -int BatchNormDriver::SetBNParametersFromCmdLineArgs() +template +int BatchNormDriver::SetBNParametersFromCmdLineArgs() { // double bnAlpha = inflags.GetValueDouble("alpha"); @@ -562,8 +562,8 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() return miopenStatusSuccess; } -template -int BatchNormDriver::AllocateBuffersAndCopy() +template +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); @@ -621,10 +621,8 @@ int BatchNormDriver::AllocateBuffersAndCopy() return miopenStatusSuccess; } -template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, - float alpha, - float beta) +template +void 
BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) { if(keepRunningMeanVar) @@ -665,11 +663,11 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, return; } -template -void BatchNormDriver::runGPUFwdTrain(Tref epsilon, - Tref eAF, - float alpha, - float beta) +template +void BatchNormDriver::runGPUFwdTrain(Tref epsilon, + Tref eAF, + float alpha, + float beta) { if(saveMeanVar && keepRunningMeanVar) { @@ -773,8 +771,8 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, #endif } -template -int BatchNormDriver::RunForwardGPU() +template +int BatchNormDriver::RunForwardGPU() { float alpha = static_cast(1), beta = static_cast(0); @@ -873,8 +871,8 @@ int BatchNormDriver::RunForwardGPU() return miopenStatusSuccess; } -template -void BatchNormDriver::runCPUFwdInference(Tref epsilon) +template +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { if(bn_mode == miopenBNPerActivation) @@ -907,8 +905,8 @@ void BatchNormDriver::runCPUFwdInference(Tref epsilon) return; } -template -void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) +template +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { if(bn_mode == miopenBNPerActivation) @@ -945,8 +943,8 @@ void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref e } } -template -int BatchNormDriver::RunForwardCPU() +template +int BatchNormDriver::RunForwardCPU() { // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); @@ -968,8 +966,8 @@ int BatchNormDriver::RunForwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardGPU() +template +int BatchNormDriver::RunBackwardGPU() { if(!back) @@ -1085,8 +1083,8 @@ int BatchNormDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyForward() +template +int BatchNormDriver::VerifyForward() { // jump out since we are forcing forward off when doing backwards. 
@@ -1306,8 +1304,8 @@ int BatchNormDriver::VerifyForward() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardCPU() +template +int BatchNormDriver::RunBackwardCPU() { if(!back) @@ -1357,8 +1355,8 @@ int BatchNormDriver::RunBackwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyBackward() +template +int BatchNormDriver::VerifyBackward() { if(!back) diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index 4333787e5e..c7bab90bb5 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -29,9 +29,9 @@ static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") - return new BatchNormDriver(); + return new BatchNormDriver(); if(base_arg == "bnormfp16") - return new BatchNormDriver(); + return new BatchNormDriver(); return nullptr; } From f843c8c3101119688be9e639b49219fb66cd8eb4 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 18:57:51 +0000 Subject: [PATCH 05/27] fixed batchNormActivSpatialHostBwdTrain issue --- src/tensor.cpp | 1 + test/fusionHost.hpp | 52 ++++++++++++++++++++++++----------------- test/gtest/bn.hpp | 5 ---- test/na_train.cpp | 28 +++++++++++----------- test/na_train_find2.cpp | 28 +++++++++++----------- 5 files changed, 59 insertions(+), 55 deletions(-) diff --git a/src/tensor.cpp b/src/tensor.cpp index f0fbd86a7b..7ec4c4e581 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -35,6 +35,7 @@ #include #include #include + namespace miopen { namespace { diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 4e61d89edb..3751dde95b 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -278,15 +278,19 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& bnScale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& 
x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -334,28 +338,32 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) }); // for (channel) } -template +template void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, double beta, double alpha, - const tensor& x_input, - const tensor& y_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& dy_input, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) + const tensor& x_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& dy_input, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -424,7 +432,7 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 16d788a70c..f5227217e4 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -120,19 +120,14 @@ struct BNBwdTest : public ::testing::TestWithParam 
Date: Thu, 5 Sep 2024 19:15:11 +0000 Subject: [PATCH 06/27] undo minor changes --- driver/bn_driver.hpp | 4 ++-- driver/gemm_driver.hpp | 2 +- test/fusionHost.hpp | 4 ++-- test/na_train.cpp | 4 ++-- test/na_train_find2.cpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 72db700a3f..e7cf441502 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -1324,10 +1324,10 @@ int BatchNormDriver::RunBackwardCPU() beta, alpha, in.GetTensor(), - out.GetTensor(), + dy.GetTensor(), out_ref, bnScale.GetTensor(), - dy.GetTensor(), + out.GetTensor(), dBias.GetTensor(), dScale_ref, dBias_ref, diff --git a/driver/gemm_driver.hpp b/driver/gemm_driver.hpp index 282173101d..772104544e 100644 --- a/driver/gemm_driver.hpp +++ b/driver/gemm_driver.hpp @@ -148,7 +148,7 @@ class GemmDriver : public Driver std::vector c; std::vector chost; - T alpha, beta, gamma; + T alpha, beta; miopen::GemmDescriptor gemm_desc = { false, false, false, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.0f, 0.0f, miopenFloat, false}; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 3751dde95b..713dc4b567 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -355,10 +355,10 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double beta, double alpha, const tensor& x_input, - const tensor& y_input, + const tensor& dy_input, tensor& dx_out, const tensor& bnScale, - const tensor& dy_input, + const tensor& y_input, const tensor& bias, tensor& dscale, tensor& dbias, diff --git a/test/na_train.cpp b/test/na_train.cpp index 3541245f80..b63a0dce57 100644 --- a/test/na_train.cpp +++ b/test/na_train.cpp @@ -307,10 +307,10 @@ struct verify_bwd_batchnorm_spatial_activ activ_beta, activ_alpha, x, - y, + dy, dx, bnscale, - dy, + y, bnbias, dgamma, dbeta, diff --git a/test/na_train_find2.cpp b/test/na_train_find2.cpp index fd123b324f..bf837905d4 100644 --- a/test/na_train_find2.cpp +++ b/test/na_train_find2.cpp @@ 
-385,10 +385,10 @@ struct verify_bwd_batchnorm_spatial_activ activ_beta, activ_alpha, x, - y, + dy, dx, bnscale, - dy, + y, bnbias, dgamma, dbeta, From bd36353007d645824058aed3ce9a9be8e3b7c12d Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 19:27:33 +0000 Subject: [PATCH 07/27] revert few changes --- driver/bn_driver.hpp | 2 +- test/fusionHost.hpp | 2 +- test/gtest/bn_test_data.hpp | 61 ++++++++++++++++++------------------- test/na_train.cpp | 2 +- test/na_train_find2.cpp | 2 +- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index e7cf441502..5e77c3ba43 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -1325,9 +1325,9 @@ int BatchNormDriver::RunBackwardCPU() alpha, in.GetTensor(), dy.GetTensor(), + out.GetTensor(), out_ref, bnScale.GetTensor(), - out.GetTensor(), dBias.GetTensor(), dScale_ref, dBias_ref, diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 713dc4b567..a2cdad46b7 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -356,9 +356,9 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double alpha, const tensor& x_input, const tensor& dy_input, + const tensor& y_input, tensor& dx_out, const tensor& bnScale, - const tensor& y_input, const tensor& bias, tensor& dscale, tensor& dbias, diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 2eafaf4665..fcf237400b 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -63,37 +63,36 @@ inline std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {4, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 2048, 7, 7, 
miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - 
// {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0} - }; + {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 28, 28, 
miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } template diff --git a/test/na_train.cpp b/test/na_train.cpp index b63a0dce57..e776f4414e 100644 --- a/test/na_train.cpp +++ b/test/na_train.cpp @@ -308,9 +308,9 @@ struct verify_bwd_batchnorm_spatial_activ activ_alpha, x, dy, + y, dx, bnscale, - y, bnbias, dgamma, dbeta, diff --git a/test/na_train_find2.cpp b/test/na_train_find2.cpp index bf837905d4..51868ae77b 100644 --- a/test/na_train_find2.cpp +++ b/test/na_train_find2.cpp @@ -386,9 +386,9 @@ struct verify_bwd_batchnorm_spatial_activ activ_alpha, x, dy, + y, dx, bnscale, - y, bnbias, dgamma, dbeta, From 53a83eaa9a6af7598e4bffbb5d0944d21ef3cc82 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 6 Sep 2024 01:39:53 +0000 Subject: [PATCH 08/27] fix run time error --- driver/bn_driver.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 5e77c3ba43..037950ef74 100644 --- a/driver/bn_driver.hpp 
+++ b/driver/bn_driver.hpp @@ -345,25 +345,25 @@ int BatchNormDriver::GetandSetData() } if(isFwdInfer) { - estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isFwdTrain) { - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isBwd) { bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{bn_layout, in_len}); + dy.AllocOnHost(tensor{bn_layout, in_len}); - dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else { From eb36fcbcf3cf8bc04818dc748f910ea4b9eeb0df Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 6 Sep 2024 01:55:38 +0000 Subject: [PATCH 09/27] moved GpumemTensor to driver.hpp --- driver/bn_driver.hpp | 137 
----------------------------------------- driver/conv_driver.hpp | 129 -------------------------------------- driver/driver.hpp | 137 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 266 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 037950ef74..7179471ddc 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -63,143 +63,6 @@ #define MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D 1 // Resolves issue #1974 -//======================== - -template -class GpumemTensor -{ - std::unique_ptr dev; - tensor host; - bool is_gpualloc = false; - -public: - void SetGpuallocMode(bool v) { is_gpualloc = v; } - tensor& GetTensor() { return host; } - - void AllocOnHost(miopenTensorDescriptor_t t) - { - host = tensor(miopen::deref(t)); - if(is_gpualloc) // We do not need host data. - { - host.data.clear(); - host.data.shrink_to_fit(); // To free host memory. - } - } - template - void AllocOnHost(tensor t) - { - AllocOnHost(&t.desc); - } - - std::vector& GetVector() - { - if(is_gpualloc) - MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " - "'--gpualloc 1' mode"); - return host.data; - } - - Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } - std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } - - void - InitHostData(const size_t sz, // - const bool do_write, // If set to false, then only generate random data. This is - // necessary to reproduce values in input buffers even if some - // directions are skipped. For example, inputs for Backward - // will be the same for both "-F 0" and "-F 2". - std::function generator) - { - if(is_gpualloc) - { - /// In gpualloc mode, we do not care about reproducibility of results, because - /// validation is not used. 
Therefore, we do not have to always generate random value - /// (\ref move_rand) - return; - } - - for(size_t i = 0; i < sz; ++i) - { - /// \anchor move_rand - /// Generate random value, even if buffer is unused. This provides the same - /// initialization of input buffers regardless of which kinds of - /// convolutions are currently selectedfor testing (see the "-F" option). - /// Verification cache would be broken otherwise. - auto val = generator(); - if(do_write) - GetVector()[i] = val; - } - } - - status_t AllocOnDevice(stream, context_t ctx, const size_t sz) - { - dev = std::make_unique(ctx, sz, sizeof(Tgpu)); - return STATUS_SUCCESS; - } - - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) - { - AllocOnDevice(q, ctx, sz); - if(is_gpualloc) - { - /// \anchor gpualloc_random_init - /// In gpualloc mode, we do not want to leave input buffers uninitialized, because - /// there could be NaNs and Infs, which may affect the performance (which we are - /// interested to evaluate in this mode). Initialization with all 0's is not the - /// best choice as well, because GPU HW may optimize out computations with 0's and - /// that could affect performance of kernels too. That is why we are using - /// rocrand to initialize input buffers. - /// - /// However we do not care about precision in gpualloc mode, because validation - /// is not used. Therefore, range (0,1] is fine. 
- return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, GetVectorData()); - } - - template - status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) - { - static_assert(std::is_same::value // - || std::is_same::value, // - "Before enabling more types, check thoroughly."); - dev = std::make_unique(ctx, sz, sizeof(T)); - return STATUS_SUCCESS; - } - - template - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) - { - AllocOnDevice(q, ctx, sz, init); - if(is_gpualloc) - { - /// \ref gpualloc_random_init - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, init.data()); - } - - status_t CopyFromDeviceToHost(stream q) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); - } - - template - status_t CopyFromDeviceToHost(stream q, tensor& t) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); - } - - template - status_t CopyFromDeviceToHost(stream q, std::vector& v) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data()); - } - - auto GetDevicePtr() -> auto { return dev->GetMem(); } -}; -//======================== - //#define BN_RUNFOR_PROFILER template diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 8f9e836345..48658164de 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -180,135 +180,6 @@ static inline miopenDataType_t DataTypeFromShortString(const std::string& type) } } -template -class GpumemTensor -{ - std::unique_ptr dev; - tensor host; - bool is_gpualloc = false; - -public: - void SetGpuallocMode(bool v) { is_gpualloc = v; } - tensor& GetTensor() { return host; } - - void AllocOnHost(miopenTensorDescriptor_t t) - { - host = tensor(miopen::deref(t)); - if(is_gpualloc) // We do not need host data. - { - host.data.clear(); - host.data.shrink_to_fit(); // To free host memory. 
- } - } - - std::vector& GetVector() - { - if(is_gpualloc) - MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " - "'--gpualloc 1' mode"); - return host.data; - } - - Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } - std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } - - void - InitHostData(const size_t sz, // - const bool do_write, // If set to false, then only generate random data. This is - // necessary to reproduce values in input buffers even if some - // directions are skipped. For example, inputs for Backward - // will be the same for both "-F 0" and "-F 2". - std::function generator) - { - if(is_gpualloc) - { - /// In gpualloc mode, we do not care about reproducibility of results, because - /// validation is not used. Therefore, we do not have to always generate random value - /// (\ref move_rand) - return; - } - - for(size_t i = 0; i < sz; ++i) - { - /// \anchor move_rand - /// Generate random value, even if buffer is unused. This provides the same - /// initialization of input buffers regardless of which kinds of - /// convolutions are currently selectedfor testing (see the "-F" option). - /// Verification cache would be broken otherwise. - auto val = generator(); - if(do_write) - GetVector()[i] = val; - } - } - - status_t AllocOnDevice(stream, context_t ctx, const size_t sz) - { - dev = std::make_unique(ctx, sz, sizeof(Tgpu)); - return STATUS_SUCCESS; - } - - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) - { - AllocOnDevice(q, ctx, sz); - if(is_gpualloc) - { - /// \anchor gpualloc_random_init - /// In gpualloc mode, we do not want to leave input buffers uninitialized, because - /// there could be NaNs and Infs, which may affect the performance (which we are - /// interested to evaluate in this mode). 
Initialization with all 0's is not the - /// best choice as well, because GPU HW may optimize out computations with 0's and - /// that could affect performance of kernels too. That is why we are using - /// rocrand to initialize input buffers. - /// - /// However we do not care about precision in gpualloc mode, because validation - /// is not used. Therefore, range (0,1] is fine. - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, GetVectorData()); - } - - template - status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) - { - static_assert(std::is_same::value // - || std::is_same::value, // - "Before enabling more types, check thoroughly."); - dev = std::make_unique(ctx, sz, sizeof(T)); - return STATUS_SUCCESS; - } - - template - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) - { - AllocOnDevice(q, ctx, sz, init); - if(is_gpualloc) - { - /// \ref gpualloc_random_init - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, init.data()); - } - - status_t CopyFromDeviceToHost(stream q) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); - } - - template - status_t CopyFromDeviceToHost(stream q, tensor& t) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); - } - - template - status_t CopyFromDeviceToHost(stream q, std::vector& v) - { - return is_gpualloc ? 
STATUS_SUCCESS : dev->FromGPU(q, v.data()); - } - - auto GetDevicePtr() -> auto { return dev->GetMem(); } -}; - template class GpumemVector { diff --git a/driver/driver.hpp b/driver/driver.hpp index d0c708ff1d..c9decb2185 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -38,6 +38,9 @@ #include #include #include +#include <../test/tensor_holder.hpp> +#include "util_driver.hpp" +#include "rocrand_wrapper.hpp" using half = half_float::half; using hip_bfloat16 = bfloat16; #include @@ -157,6 +160,140 @@ struct GPUMem #endif }; +template +class GpumemTensor +{ + std::unique_ptr dev; + tensor host; + bool is_gpualloc = false; + +public: + void SetGpuallocMode(bool v) { is_gpualloc = v; } + tensor& GetTensor() { return host; } + + void AllocOnHost(miopenTensorDescriptor_t t) + { + host = tensor(miopen::deref(t)); + if(is_gpualloc) // We do not need host data. + { + host.data.clear(); + host.data.shrink_to_fit(); // To free host memory. + } + } + template + void AllocOnHost(tensor t) + { + AllocOnHost(&t.desc); + } + + std::vector& GetVector() + { + if(is_gpualloc) + MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " + "'--gpualloc 1' mode"); + return host.data; + } + + Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } + std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } + + void + InitHostData(const size_t sz, // + const bool do_write, // If set to false, then only generate random data. This is + // necessary to reproduce values in input buffers even if some + // directions are skipped. For example, inputs for Backward + // will be the same for both "-F 0" and "-F 2". + std::function generator) + { + if(is_gpualloc) + { + /// In gpualloc mode, we do not care about reproducibility of results, because + /// validation is not used. 
Therefore, we do not have to always generate random value + /// (\ref move_rand) + return; + } + + for(size_t i = 0; i < sz; ++i) + { + /// \anchor move_rand + /// Generate random value, even if buffer is unused. This provides the same + /// initialization of input buffers regardless of which kinds of + /// convolutions are currently selectedfor testing (see the "-F" option). + /// Verification cache would be broken otherwise. + auto val = generator(); + if(do_write) + GetVector()[i] = val; + } + } + + status_t AllocOnDevice(stream, context_t ctx, const size_t sz) + { + dev = std::make_unique(ctx, sz, sizeof(Tgpu)); + return STATUS_SUCCESS; + } + + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) + { + AllocOnDevice(q, ctx, sz); + if(is_gpualloc) + { + /// \anchor gpualloc_random_init + /// In gpualloc mode, we do not want to leave input buffers uninitialized, because + /// there could be NaNs and Infs, which may affect the performance (which we are + /// interested to evaluate in this mode). Initialization with all 0's is not the + /// best choice as well, because GPU HW may optimize out computations with 0's and + /// that could affect performance of kernels too. That is why we are using + /// rocrand to initialize input buffers. + /// + /// However we do not care about precision in gpualloc mode, because validation + /// is not used. Therefore, range (0,1] is fine. 
+ return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, GetVectorData()); + } + + template + status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) + { + static_assert(std::is_same::value // + || std::is_same::value, // + "Before enabling more types, check thoroughly."); + dev = std::make_unique(ctx, sz, sizeof(T)); + return STATUS_SUCCESS; + } + + template + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) + { + AllocOnDevice(q, ctx, sz, init); + if(is_gpualloc) + { + /// \ref gpualloc_random_init + return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, init.data()); + } + + status_t CopyFromDeviceToHost(stream q) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); + } + + template + status_t CopyFromDeviceToHost(stream q, tensor& t) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); + } + + template + status_t CopyFromDeviceToHost(stream q, std::vector& v) + { + return is_gpualloc ? 
STATUS_SUCCESS : dev->FromGPU(q, v.data()); + } + + auto GetDevicePtr() -> auto { return dev->GetMem(); } +}; + inline void PadBufferSize(size_t& sz, int datatype_sz) { size_t page_sz = (2 * 1024 * 1024) / datatype_sz; From 7496f3b744e0712df81bb4943f8cd141ac93d535 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 6 Sep 2024 19:32:29 +0000 Subject: [PATCH 10/27] fix correctness --- driver/bn_driver.hpp | 53 +++++++++++++++++++++++++++++++++++++++----- test/fusionHost.hpp | 2 +- test/gtest/bn.hpp | 1 - 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 7179471ddc..359deffaa8 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -36,6 +36,7 @@ #include "rocrand_wrapper.hpp" #include "../test/verify.hpp" +#include "../test/random.hpp" #include "../test/fusionHost.hpp" #include @@ -162,7 +163,7 @@ class BatchNormDriver : public Driver tensor runVariance_ref; // backward - GpumemTensor bnScale; + GpumemTensor bnScale; GpumemTensor dy; GpumemTensor dScale; @@ -197,19 +198,38 @@ int BatchNormDriver::GetandSetData() std::vector in_len = GetInputTensorLengthsFromCmdLine(); + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + in.AllocOnHost(tensor{bn_layout, in_len}); + in.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); + out.AllocOnHost(tensor{bn_layout, in_len}); + // out.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); + auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); + if(isFwdInfer || isFwdTrain) { scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_value_scale_bias = [](auto...) 
{ + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + + scale.InitHostData(scale.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); + bias.InitHostData(bias.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); } if(isFwdInfer) { estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_value_emean = [](auto...) { + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); } else if(isFwdTrain) { @@ -217,16 +237,36 @@ int BatchNormDriver::GetandSetData() savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_var = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + runMean.InitHostData(runMean.GetTensor().desc.GetElementSize(), true, gen_var); + runVariance.InitHostData(runVariance.GetTensor().desc.GetElementSize(), true, gen_var); } else if(isBwd) { - bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); + auto gen_value_bwd = [](auto...) 
{ + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + + auto gen_inv_var = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_inv_var); } else { @@ -1112,8 +1152,6 @@ int BatchNormDriver::VerifyForward() } // end if(saveMeanVar) } - // Check output tensor error - // out_dev->FromGPU(GetStream(), out.data()); out.CopyFromDeviceToHost(GetStream()); maxval = static_cast(0.0); @@ -1179,6 +1217,9 @@ int BatchNormDriver::RunBackwardCPU() double alpha = static_cast(1), beta = static_cast(0), gamma = static_cast(1); + // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); + // float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + if(bn_mode == miopenBNPerActivation) { // 1xCxHxW @@ -1282,8 +1323,8 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < dScale.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); - maxval = maxval < diff ? diff : maxval; + auto diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); + maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "dscale[" << i << "]: " << dScale.GetVector()[i]; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index a2cdad46b7..a65832b9de 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -162,6 +162,7 @@ void batchNormSpatialHostInference(const tensor& input, output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // std::cout << output(bidx, cidx, row, column) << ","; } } } @@ -292,7 +293,6 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& savedMean, const tensor& savedInvVar) { - int height, width, n_batch, channels; std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); auto nhw = double(height * width * n_batch); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..5699bea4c0 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,7 +165,6 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); - test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); From 497c47dd87694b0a5ca73be1b6c5dfe430e42398 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 7 Sep 2024 01:56:12 +0000 Subject: [PATCH 11/27] passing bin/MIOpenDriver bnorm --- driver/bn_driver.hpp | 14 +++++++------- src/ocl/batchnormocl.cpp | 8 ++++---- test/gtest/bn.hpp | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 359deffaa8..35972ce38e 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -1083,27 +1083,26 @@ int BatchNormDriver::VerifyForward() savedVariance.CopyFromDeviceToHost(GetStream()); maxval = static_cast(0.0); - auto errorSaveMean 
= miopen::rms_range(savedVariance_ref.data, savedMean.GetVector()); + auto errorSaveMean = miopen::rms_range(savedMean_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on saved mean: " << errorSaveMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < savedMean.GetVector().size() && - i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedMean.GetVector().size() && i < savedMean_ref.data.size() && + i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs( - Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); + diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedMean_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; - std::cout << ", sm_host[" << i << "]: " << savedVariance_ref.data[i]; + std::cout << ", sm_host[" << i << "]: " << savedMean_ref.data[i]; std::cout << ", diff[" << i << "]: " << Tmix(fabs(savedMean.GetVector()[i]) - - fabs(savedVariance_ref.data[i])) + fabs(savedMean_ref.data[i])) << std::endl; } } @@ -1156,6 +1155,7 @@ int BatchNormDriver::VerifyForward() maxval = static_cast(0.0); auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); + if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 40bcd34935..205bae8bc4 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -313,10 +313,10 @@ void BatchNormBackward(Handle& handle, { MIOPEN_THROW(miopenStatusBadParm); } - if(dxDesc.GetType() != dyDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm); - } + // if(dxDesc.GetType() != dyDesc.GetType()) + // { + // MIOPEN_THROW(miopenStatusBadParm); + // } if(xDesc.GetNumDims() < 3) 
{ MIOPEN_THROW(miopenStatusBadParm); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 5699bea4c0..f5227217e4 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,6 +165,7 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); From 22a3384d858a1ea98f9c83db3a61d90d6b072a35 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 24 Sep 2024 21:59:01 +0000 Subject: [PATCH 12/27] now working driver command for float, fp16 and bfp16 for batch norm --- driver/bn_driver.hpp | 151 ++++++++++++++++++++------------- driver/dm_bnorm.cpp | 6 ++ driver/driver.hpp | 20 ++--- src/batch_norm_api.cpp | 1 + src/driver_arguments.cpp | 4 + test/fusionHost.hpp | 69 ++++++++------- test/gtest/bn.hpp | 1 - test/gtest/bn_bwd.cpp | 48 +++++------ test/gtest/test_operations.hpp | 26 ++++++ 9 files changed, 197 insertions(+), 129 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 35972ce38e..a922b97acf 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -142,7 +142,7 @@ class BatchNormDriver : public Driver GpumemTensor in; GpumemTensor out; - tensor out_ref; + tensor out_ref; // forward GpumemTensor scale; @@ -152,25 +152,30 @@ class BatchNormDriver : public Driver GpumemTensor estMean; GpumemTensor estVariance; - // forward training GpumemTensor savedMean; - tensor savedMean_ref; + tensor savedMean_ref; + + // forward training GpumemTensor savedVariance; - tensor savedVariance_ref; GpumemTensor runMean; - tensor runMean_ref; GpumemTensor runVariance; - tensor runVariance_ref; + // ref + tensor savedVariance_ref; + tensor runMean_ref; + tensor runVariance_ref; // backward - GpumemTensor bnScale; + GpumemTensor out_bwd; - GpumemTensor dy; + GpumemTensor bnScale; GpumemTensor dScale; - 
tensor dScale_ref; GpumemTensor dBias; - tensor dBias_ref; + // savedMean declared above as Tmix as well GpumemTensor savedInvVar; + GpumemTensor dy; + + tensor dBias_ref; + tensor dScale_ref; Tref maxval; @@ -203,14 +208,12 @@ int BatchNormDriver::GetandSetData() in.AllocOnHost(tensor{bn_layout, in_len}); in.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); - out.AllocOnHost(tensor{bn_layout, in_len}); - // out.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); - auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); if(isFwdInfer || isFwdTrain) { + out.AllocOnHost(tensor{bn_layout, in_len}); scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); @@ -246,27 +249,33 @@ int BatchNormDriver::GetandSetData() } else if(isBwd) { - bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + out_bwd.AllocOnHost(tensor{bn_layout, in_len}); + + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); - auto gen_value_bwd = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); + auto gen_var_bwd = [](auto...) 
{ + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); }; - - dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_value_bwd); - bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - auto gen_inv_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_inv_var); + if(saveMeanVar && keepRunningMeanVar) + { + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + + auto gen_in_var = [](auto...) 
{ + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + savedInvVar.InitHostData( + savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); + } } else { @@ -473,49 +482,66 @@ int BatchNormDriver::AllocateBuffersAndCopy() #if MIOPEN_BACKEND_OPENCL clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #endif - status |= in.AllocOnDeviceAndInit(q, ctx, in.GetTensor().desc.GetElementSpace()); - status |= out.AllocOnDeviceAndInit(q, ctx, out.GetTensor().desc.GetElementSpace()); - out_ref = out.GetTensor(); + status |= in.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&in.GetTensor().desc)); + if(isFwdInfer || isFwdTrain) { - status |= scale.AllocOnDeviceAndInit(q, ctx, scale.GetTensor().desc.GetElementSpace()); - status |= bias.AllocOnDeviceAndInit(q, ctx, bias.GetTensor().desc.GetElementSpace()); + status |= out.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out.GetTensor().desc)); + out_ref = + tensor{out.GetTensor().desc.GetLayout_t(), out.GetTensor().desc.GetLengths()}; + status |= scale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&scale.GetTensor().desc)); + status |= bias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bias.GetTensor().desc)); } if(isFwdInfer) { - status |= estMean.AllocOnDeviceAndInit(q, ctx, estMean.GetTensor().desc.GetElementSpace()); - status |= estVariance.AllocOnDeviceAndInit( - q, ctx, estVariance.GetTensor().desc.GetElementSpace()); + status |= estMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estMean.GetTensor().desc)); + status |= + estVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estVariance.GetTensor().desc)); } if(isFwdTrain) { status |= - savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); status |= savedVariance.AllocOnDeviceAndInit( - q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); - status |= runMean.AllocOnDeviceAndInit(q, ctx, 
runMean.GetTensor().desc.GetElementSpace()); - status |= runVariance.AllocOnDeviceAndInit( - q, ctx, runVariance.GetTensor().desc.GetElementSpace()); + q, ctx, GetTensorSize(&savedVariance.GetTensor().desc)); + status |= runMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runMean.GetTensor().desc)); + status |= + runVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runVariance.GetTensor().desc)); - savedMean_ref = savedMean.GetTensor(); - savedVariance_ref = savedVariance.GetTensor(); - runMean_ref = runMean.GetTensor(); - runVariance_ref = runVariance.GetTensor(); + savedMean_ref = tensor{savedMean.GetTensor().desc.GetLayout_t(), + savedMean.GetTensor().desc.GetLengths()}; + + savedVariance_ref = tensor{savedVariance.GetTensor().desc.GetLayout_t(), + savedVariance.GetTensor().desc.GetLengths()}; + + runMean_ref = tensor{runMean.GetTensor().desc.GetLayout_t(), + runMean.GetTensor().desc.GetLengths()}; + + runVariance_ref = tensor{runVariance.GetTensor().desc.GetLayout_t(), + runVariance.GetTensor().desc.GetLengths()}; } if(isBwd) { - status |= bnScale.AllocOnDeviceAndInit(q, ctx, bnScale.GetTensor().desc.GetElementSpace()); - status |= dy.AllocOnDeviceAndInit(q, ctx, dy.GetTensor().desc.GetElementSpace()); + status |= out_bwd.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out_bwd.GetTensor().desc)); + + out_ref = tensor{out_bwd.GetTensor().desc.GetLayout_t(), + out_bwd.GetTensor().desc.GetLengths()}; - status |= dScale.AllocOnDeviceAndInit(q, ctx, dScale.GetTensor().desc.GetElementSpace()); - status |= dBias.AllocOnDeviceAndInit(q, ctx, dBias.GetTensor().desc.GetElementSpace()); + status |= bnScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bnScale.GetTensor().desc)); + status |= dy.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dy.GetTensor().desc)); + + status |= dScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dScale.GetTensor().desc)); + status |= dBias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dBias.GetTensor().desc)); + status |= + 
savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); status |= - savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); - status |= savedInvVar.AllocOnDeviceAndInit( - q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); + savedInvVar.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedInvVar.GetTensor().desc)); - dScale_ref = dScale.GetTensor(); - dBias_ref = dBias.GetTensor(); + dScale_ref = tensor{dScale.GetTensor().desc.GetLayout_t(), + dScale.GetTensor().desc.GetLengths()}; + + dBias_ref = + tensor{dBias.GetTensor().desc.GetLayout_t(), dBias.GetTensor().desc.GetLengths()}; } if(status != STATUS_SUCCESS) @@ -902,8 +928,8 @@ int BatchNormDriver::RunBackwardGPU() in.GetDevicePtr(), &dy.GetTensor().desc, dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), &bnScale.GetTensor().desc, bnScale.GetDevicePtr(), dScale.GetDevicePtr(), @@ -924,8 +950,8 @@ int BatchNormDriver::RunBackwardGPU() in.GetDevicePtr(), &dy.GetTensor().desc, dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), &bnScale.GetTensor().desc, bnScale.GetDevicePtr(), dScale.GetDevicePtr(), @@ -1014,6 +1040,7 @@ int BatchNormDriver::VerifyForward() runVariance.CopyFromDeviceToHost(GetStream()); auto errorRunMean = miopen::rms_range(runMean_ref.data, runMean.GetVector()); + if(!std::isfinite(errorRunMean) || errorRunMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on running mean: " @@ -1240,6 +1267,7 @@ int BatchNormDriver::RunBackwardCPU() } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 + batchNormSpatialHostBwdTrain(in.GetTensor(), dy.GetTensor(), out_ref, @@ -1271,7 +1299,7 @@ int BatchNormDriver::VerifyBackward() RunBackwardCPU(); - out.CopyFromDeviceToHost(GetStream()); + out_bwd.CopyFromDeviceToHost(GetStream()); dScale.CopyFromDeviceToHost(GetStream()); 
dBias.CopyFromDeviceToHost(GetStream()); @@ -1281,7 +1309,8 @@ int BatchNormDriver::VerifyBackward() Tref diff = static_cast(0.0); #endif maxval = static_cast(0.0); - auto errordxout = miopen::rms_range(out_ref.data, out.GetVector()); + auto errordxout = miopen::rms_range(out_ref.data, out_bwd.GetVector()); + if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout @@ -1290,17 +1319,17 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i]))); + diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; - std::cout << "\tout.GetVector()[" << i << "]: " << out.GetVector()[i]; + std::cout << "\tout_bwd.GetVector()[" << i << "]: " << out_bwd.GetVector()[i]; std::cout << "\tdiff[" << i - << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); + << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / - fabs(out.GetVector()[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])) / + fabs(out_bwd.GetVector()[i]) << std::endl; } } diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index c7bab90bb5..23340adc94 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -26,12 +26,18 @@ #include "bn_driver.hpp" #include "registry_driver_maker.hpp" +// template + static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") return new BatchNormDriver(); if(base_arg == "bnormfp16") + return new BatchNormDriver(); + if(base_arg == "bnormfp16fp32") return new BatchNormDriver(); + if(base_arg == "bnormbfp16fp32") + return new BatchNormDriver(); 
return nullptr; } diff --git a/driver/driver.hpp b/driver/driver.hpp index c9decb2185..64400cc405 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -330,16 +330,16 @@ inline std::string ParseBaseArg(int argc, char* argv[]) if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && - arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && - arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" && - arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" && - arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" && - arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" && - arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && - arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" && - arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" && - arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" && - arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && + arg != "bnormfp16" && arg != "bnormfp16fp32" && arg != "bnormbfp16fp32" && arg != "rnn" && + arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" && + arg != "gemmfp16" && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" && + arg != "tensorop" && arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && + arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && + arg != "sumfp16" && arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && + arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && + arg != "addlayernorm" && arg != "addlayernormfp16" && arg != 
"addlayernormbfp16" && + arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" && + arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 8f184a9508..3a2de02d01 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -251,6 +251,7 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, savedMean, savedInvVariance, miopen::debug::BatchNormDirection_t::Backward); + // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 97fe16d7c4..57034e5378 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -66,6 +66,10 @@ void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) { ss << "bnormfp16"; } + if(desc.GetType() == miopenBFloat16) + { + ss << "bnormbfp16"; + } else { ss << "bnorm"; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index a65832b9de..ec271ef967 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -134,9 +134,9 @@ void convHostForward(const tensor& input, } } -template +template void batchNormSpatialHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const tensor& bias, double epsilon, @@ -169,9 +169,9 @@ void batchNormSpatialHostInference(const tensor& input, }); } -template +template void batchNormPerActivHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const tensor& bias, double epsilon, @@ -203,17 +203,17 @@ void batchNormPerActivHostInference(const tensor& input, }); } -template +template void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& 
out, + tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -281,15 +281,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, template + typename AccDataType, + typename RefDataType> void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& dy_input, - tensor& dx_out, + tensor& dx_out, const tensor& bnScale, - tensor& dscale, - tensor& dbias, + tensor& dscale, + tensor& dbias, const tensor& savedMean, const tensor& savedInvVar) { @@ -338,7 +338,8 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) @@ -349,7 +350,9 @@ template + typename AccDataType, + typename OutRefDataType, + typename RefDataType> void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, double beta, @@ -357,11 +360,11 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, const tensor& x_input, const tensor& dy_input, const tensor& y_input, - tensor& dx_out, + tensor& dx_out, const tensor& bnScale, const tensor& bias, - tensor& dscale, - tensor& dbias, + tensor& dscale, + tensor& dbias, const tensor& savedMean, const tensor& savedInvVar) { @@ -439,17 +442,17 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, }); // for (channel) } -template +template void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, + tensor& out, const tensor& scale, const tensor& bias, 
double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -493,7 +496,7 @@ void batchNormPerActHostFwdTrain(const tensor& input, elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) inhat = elemStd * elemInvVar; // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( + out(bidx, cidx, row, column) = static_cast( scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); } // end for(n_batch) @@ -506,21 +509,21 @@ void batchNormPerActHostFwdTrain(const tensor& input, runVar(0, cidx, row, column) = (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); } // for (column) } // for (row) }); } -template +template void batchNormPerActHostBwdTrain(const tensor& x_input, const tensor& dy_input, const tensor& scale, - tensor& dscale, - tensor& dbias, - tensor& dx_out, + tensor& dscale, + tensor& dbias, + tensor& dx_out, const tensor& savedMean, const tensor& savedInvVar) { diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..5699bea4c0 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,7 +165,6 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); - test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f2d54e8077..f14b008233 100644 --- a/test/gtest/bn_bwd.cpp +++ 
b/test/gtest/bn_bwd.cpp @@ -31,41 +31,41 @@ struct GPU_BNBwd_FP16 { }; -struct GPU_BNBwd_FP32 : BNBwdTest -{ -}; +// struct GPU_BNBwd_FP32 : BNBwdTest +// { +// }; -struct GPU_BNBwd_BFP16 : BNBwdTest -{ -}; +// struct GPU_BNBwd_BFP16 : BNBwdTest +// { +// }; -struct GPU_BNBwd_FP64 : BNBwdTest -{ -}; +// struct GPU_BNBwd_FP64 : BNBwdTest +// { +// }; TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} -TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +// TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} -TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +// TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} +// TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBwd_FP16, testing::Combine(testing::ValuesIn(Network1()), testing::Values(miopenTensorNHWC))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BNBwd_FP32, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BNBwd_BFP16, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BNBwd_FP64, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 298ac55e3e..2abcb05fe7 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -48,6 +48,32 @@ template void ComputeCPUBNBwd(DLModule& dl_module) { + std::cout << "\n====start dy====\n"; + for(int i = 0; i < 
dl_module.dy.data.size(); ++i) + { + std::cout << dl_module.dy[i] << ","; + } + std::cout << "\n"; + std::cout << "\n====start bnScale====\n"; + for(int i = 0; i < dl_module.bnScale.data.size(); ++i) + { + std::cout << dl_module.bnScale[i] << ","; + } + std::cout << "\n"; + std::cout << "\n====start savedMean====\n"; + for(int i = 0; i < dl_module.savedMean.data.size(); ++i) + { + std::cout << dl_module.savedMean[i] << ","; + } + std::cout << "\n"; + std::cout << "\n====start savedInvVar====\n"; + for(int i = 0; i < dl_module.savedInvVar.data.size(); ++i) + { + std::cout << dl_module.savedInvVar[i] << ","; + } + std::cout << "\n"; + + // todo : need to do based on bn_mode batchNormSpatialHostBwdTrain(dl_module.input, dl_module.dy, dl_module.ref_out, From 0c6957396b7dd446342b930c2e1e061c97f293dc Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 25 Sep 2024 13:49:40 +0000 Subject: [PATCH 13/27] cleanups --- src/batch_norm_api.cpp | 1 - src/ocl/batchnormocl.cpp | 8 +++--- test/gtest/bn.hpp | 1 + test/gtest/bn_bwd.cpp | 48 +++++++++++++++++----------------- test/gtest/test_operations.hpp | 26 ------------------ 5 files changed, 29 insertions(+), 55 deletions(-) diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 3a2de02d01..8f184a9508 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -251,7 +251,6 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, savedMean, savedInvVariance, miopen::debug::BatchNormDirection_t::Backward); - // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 205bae8bc4..40bcd34935 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -313,10 +313,10 @@ void BatchNormBackward(Handle& handle, { MIOPEN_THROW(miopenStatusBadParm); } - // if(dxDesc.GetType() != dyDesc.GetType()) - // { - // MIOPEN_THROW(miopenStatusBadParm); - // } + if(dxDesc.GetType() != dyDesc.GetType()) + { + 
MIOPEN_THROW(miopenStatusBadParm); + } if(xDesc.GetNumDims() < 3) { MIOPEN_THROW(miopenStatusBadParm); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 5699bea4c0..f5227217e4 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,6 +165,7 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f14b008233..f2d54e8077 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -31,41 +31,41 @@ struct GPU_BNBwd_FP16 { }; -// struct GPU_BNBwd_FP32 : BNBwdTest -// { -// }; +struct GPU_BNBwd_FP32 : BNBwdTest +{ +}; -// struct GPU_BNBwd_BFP16 : BNBwdTest -// { -// }; +struct GPU_BNBwd_BFP16 : BNBwdTest +{ +}; -// struct GPU_BNBwd_FP64 : BNBwdTest -// { -// }; +struct GPU_BNBwd_FP64 : BNBwdTest +{ +}; TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} -// TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} -// TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -// TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} +TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBwd_FP16, testing::Combine(testing::ValuesIn(Network1()), testing::Values(miopenTensorNHWC))); -// INSTANTIATE_TEST_SUITE_P(Smoke, -// GPU_BNBwd_FP32, -// testing::Combine(testing::ValuesIn(Network1()), -// testing::Values(miopenTensorNHWC))); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNBwd_FP32, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); -// INSTANTIATE_TEST_SUITE_P(Smoke, -// GPU_BNBwd_BFP16, -// testing::Combine(testing::ValuesIn(Network1()), -// testing::Values(miopenTensorNHWC))); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNBwd_BFP16, + 
testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); -// INSTANTIATE_TEST_SUITE_P(Smoke, -// GPU_BNBwd_FP64, -// testing::Combine(testing::ValuesIn(Network1()), -// testing::Values(miopenTensorNHWC))); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNBwd_FP64, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 2abcb05fe7..298ac55e3e 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -48,32 +48,6 @@ template void ComputeCPUBNBwd(DLModule& dl_module) { - std::cout << "\n====start dy====\n"; - for(int i = 0; i < dl_module.dy.data.size(); ++i) - { - std::cout << dl_module.dy[i] << ","; - } - std::cout << "\n"; - std::cout << "\n====start bnScale====\n"; - for(int i = 0; i < dl_module.bnScale.data.size(); ++i) - { - std::cout << dl_module.bnScale[i] << ","; - } - std::cout << "\n"; - std::cout << "\n====start savedMean====\n"; - for(int i = 0; i < dl_module.savedMean.data.size(); ++i) - { - std::cout << dl_module.savedMean[i] << ","; - } - std::cout << "\n"; - std::cout << "\n====start savedInvVar====\n"; - for(int i = 0; i < dl_module.savedInvVar.data.size(); ++i) - { - std::cout << dl_module.savedInvVar[i] << ","; - } - std::cout << "\n"; - - // todo : need to do based on bn_mode batchNormSpatialHostBwdTrain(dl_module.input, dl_module.dy, dl_module.ref_out, From aeebe811a9f0d30b3a14ec553ccc15912eeba6b6 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 25 Sep 2024 14:00:54 +0000 Subject: [PATCH 14/27] remove old variables --- driver/bn_driver.hpp | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index a922b97acf..22d55665dd 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -70,16 +70,7 @@ template class BatchNormDriver : public Driver { public: - BatchNormDriver() : 
Driver() - { - miopenCreateTensorDescriptor(&inputTensor); - miopenCreateTensorDescriptor(&outputTensor); - // miopenCreateTensorDescriptor(&biasScaleTensor); - // miopenCreateTensorDescriptor(&dxOutputTensor); - // miopenCreateTensorDescriptor(&dyInputTensor); - - data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; - } + BatchNormDriver() : Driver() { data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; } int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; @@ -109,14 +100,7 @@ class BatchNormDriver : public Driver int VerifyBackward() override; int VerifyForward() override; - ~BatchNormDriver() override - { - miopenDestroyTensorDescriptor(outputTensor); - miopenDestroyTensorDescriptor(inputTensor); - // miopenDestroyTensorDescriptor(biasScaleTensor); - // miopenDestroyTensorDescriptor(dxOutputTensor); - // miopenDestroyTensorDescriptor(dyInputTensor); - } + ~BatchNormDriver() override {} private: miopenBatchNormMode_t bn_mode; @@ -137,9 +121,6 @@ class BatchNormDriver : public Driver InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; - miopenTensorDescriptor_t outputTensor; - GpumemTensor in; GpumemTensor out; tensor out_ref; @@ -777,9 +758,9 @@ int BatchNormDriver::RunForwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); float rdCnt = -1.0; float wrCnt = 1.0; if(forw == 1) @@ -980,13 +961,11 @@ int BatchNormDriver::RunBackwardGPU() avgtime += time; int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = - 
miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); - size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = - (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); - float rdCnt = 2.0; - float wrCnt = 1.0; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); + size_t M = in_n * in_c * in_h * in_w; + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); + float rdCnt = 2.0; + float wrCnt = 1.0; // layer, flopCnt, reads, writes, GFLOPS, GB/s, timeMs printf("stats: bnormb, 0, %zu, %zu, 0, %f, %f\n", dataSz, From 5379f94a3fa869682ccefc25add693208f75744f Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 25 Sep 2024 14:13:26 +0000 Subject: [PATCH 15/27] remove dead code --- driver/dm_bnorm.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index 23340adc94..24e986fa1d 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -26,8 +26,6 @@ #include "bn_driver.hpp" #include "registry_driver_maker.hpp" -// template - static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") From a6ec4f029e5775e9559195088137c0e050748868 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 26 Sep 2024 20:28:39 +0000 Subject: [PATCH 16/27] add 3d --- driver/bn_driver.hpp | 132 ++++++++++++++++-- src/batch_norm.cpp | 3 +- src/solver/batchnorm/backward_ck.cpp | 3 +- .../batchnorm/backward_per_activation.cpp | 2 + .../batchnorm/backward_spatial_multiple.cpp | 4 + .../batchnorm/backward_spatial_single.cpp | 2 + 6 files changed, 133 insertions(+), 13 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 22d55665dd..396ad6b218 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -100,6 +100,13 @@ class BatchNormDriver : public Driver int VerifyBackward() override; int VerifyForward() override; + // Helper function to check the Layout type short names + int ChkLayout_ShortName(); + // 
function to validate the Layout type parameters. + // layout parameter value to std (NCHW/NHWC/NCDHW/NDHWC) values, + // defined in MIOpen lib. + void ValidateLayoutInputParameters(std::string layout_type); + ~BatchNormDriver() override {} private: @@ -145,7 +152,7 @@ class BatchNormDriver : public Driver tensor runMean_ref; tensor runVariance_ref; - // backward + // backward needed different type for bwd. GpumemTensor out_bwd; GpumemTensor bnScale; @@ -180,9 +187,8 @@ template int BatchNormDriver::GetandSetData() { - SetBNParametersFromCmdLineArgs(); - std::vector in_len = GetInputTensorLengthsFromCmdLine(); + SetBNParametersFromCmdLineArgs(); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; @@ -286,12 +292,8 @@ int BatchNormDriver::AddCmdLineArgs() inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); - inflags.AddInputFlag("layout", - 'L', - "NCHW", - "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", - "string", - true); + inflags.AddInputFlag( + "layout", 'L', "", "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", "string", true); inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float"); @@ -345,6 +347,46 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } } +template +int BatchNormDriver::ChkLayout_ShortName() +{ + // check for short name of layout type + if(inflags.FindShortName("layout") == 'I') + { + // do noting + // found valid short names + return 0; + } + else + { + std::cerr << "Error:Invalid Short Name for layout!" 
<< std::endl; + exit(EXIT_FAILURE); + } +} + +template +void BatchNormDriver::ValidateLayoutInputParameters(std::string layout_value) +{ + if((ChkLayout_ShortName())) + { + std::cerr << " Invalid Layout Short Name = " << ChkLayout_ShortName() << std::endl; + exit(EXIT_FAILURE); + } + else + { + if((layout_value.compare("NCHW") == 0) || (layout_value.compare("NHWC") == 0) || + (layout_value.compare("NCDHW") == 0) || (layout_value.compare("NDHWC") == 0)) + { + // do nothing,Values are matching as defined in Lib. + } + else + { + std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; + exit(EXIT_FAILURE); + } + } +} + template int BatchNormDriver::SetBNParametersFromCmdLineArgs() { @@ -352,6 +394,21 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() // double bnAlpha = inflags.GetValueDouble("alpha"); // double bnBeta = inflags.GetValueDouble("beta"); + const std::string default_layout = isDepthSpecified ? "NCDHW" : "NCHW"; + + // inflags value is empty, default value is used + // if it is supplied via cmd line, check the value. 
+ if(inflags.GetValueStr("layout").empty()) + { + inflags.SetValue("layout", default_layout); + } + else + { + std::string layoutValue = inflags.GetValueStr("layout"); + ValidateLayoutInputParameters(layoutValue); + inflags.SetValue("layout", layoutValue); + } + std::string layout = inflags.GetValueStr("layout"); if(layout == "NCHW") @@ -362,6 +419,14 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() { bn_layout = miopenTensorNHWC; } + else if(layout == "NCDHW") + { + bn_layout = miopenTensorNCDHW; + } + else if(layout == "NDHWC") + { + bn_layout = miopenTensorNDHWC; + } else { std::cout << "Cannot handle layout : " << layout << "\n"; @@ -784,6 +849,20 @@ int BatchNormDriver::RunForwardGPU() template void BatchNormDriver::runCPUFwdInference(Tref epsilon) { + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + estMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estMean.GetTensor().desc); + estVariance.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estVariance.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW @@ -798,6 +877,7 @@ void BatchNormDriver::runCPUFwdInference(Tref epsilon) } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 + batchNormSpatialHostInference(in.GetTensor(), out_ref, scale.GetTensor(), @@ -818,7 +898,19 @@ void BatchNormDriver::runCPUFwdInference(Tref epsilon) template void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { - + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = 
miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + savedMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedMean_ref.desc); + savedVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedVariance_ref.desc); + runMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runMean_ref.desc); + runVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runVariance_ref.desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW batchNormPerActHostFwdTrain(in.GetTensor(), @@ -879,7 +971,6 @@ int BatchNormDriver::RunForwardCPU() template int BatchNormDriver::RunBackwardGPU() { - if(!back) return miopenStatusSuccess; @@ -1225,6 +1316,25 @@ int BatchNormDriver::RunBackwardCPU() // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); // float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + dy.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dy.GetTensor().desc); + out_bwd.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(out_bwd.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + bnScale.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(bnScale.GetTensor().desc); + dBias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dBias.GetTensor().desc); + dScale_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dScale_ref.desc); + dBias_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dBias_ref.desc); + savedMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedMean.GetTensor().desc); + 
savedInvVar.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedInvVar.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) { diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index a3c5f93e36..1b8f4ce640 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ -67,6 +67,7 @@ void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc, TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& tDesc) { auto dataType = tDesc.GetType(); + auto layout = tDesc.GetLayout_t(); std::vector dims(tDesc.GetLengths()); // NxCxDxHxW -> NxCx(D*H)xW @@ -74,7 +75,7 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& dims[3] = dims[4]; dims.pop_back(); - return {dataType, dims}; + return {dataType, layout, dims}; } void profileSequence(const Handle& handle, unsigned char select, float* ctime) diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 7769e4d563..bca7afc3a5 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -201,7 +201,8 @@ bool BnCKBwdBackward::IsApplicable( return false; if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward) return false; - + if(!bn_problem.Is2D()) + return false; switch(bn_problem.GetXDesc().GetType()) { case miopenFloat: return CheckCKApplicability(bn_problem); diff --git a/src/solver/batchnorm/backward_per_activation.cpp b/src/solver/batchnorm/backward_per_activation.cpp index 93cf670194..af52fbc339 100644 --- a/src/solver/batchnorm/backward_per_activation.cpp +++ b/src/solver/batchnorm/backward_per_activation.cpp @@ -41,6 +41,8 @@ namespace batchnorm { bool BnBwdTrainingPerActivation::IsApplicable( const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const { + if(!problem.Is2D()) + return false; return problem.GetDirection() == miopen::batchnorm::Direction::Backward && problem.GetMode() == miopenBNPerActivation; } diff --git 
a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 29bbd5dba9..7fa9c0f89a 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -44,6 +44,10 @@ bool BnBwdTrainingSpatialMultiple::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + { + return false; + } #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 30b0c0495f..86fa5a68c7 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -45,6 +45,8 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + return false; #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && From ebd014bceae4a4543b9fa99a821f52dbc39dccee Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 26 Sep 2024 21:12:04 +0000 Subject: [PATCH 17/27] fix minor layout issue in bn --- driver/bn_driver.hpp | 2 +- src/batch_norm.cpp | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 396ad6b218..dc42190314 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -351,7 +351,7 @@ template int BatchNormDriver::ChkLayout_ShortName() { // check for short name of layout type - if(inflags.FindShortName("layout") == 'I') + if(inflags.FindShortName("layout") == 'L') { // do noting // found valid short names diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index 1b8f4ce640..938809d81c 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ 
-68,6 +68,19 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& { auto dataType = tDesc.GetType(); auto layout = tDesc.GetLayout_t(); + if(layout == miopenTensorNCDHW) + { + layout = miopenTensorNCHW; + } + else if(layout == miopenTensorNDHWC) + { + layout = miopenTensorNHWC; + } + else + { + std::cout << "Cannot handle layout : " << layout << "\n"; + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) + } std::vector dims(tDesc.GetLengths()); // NxCxDxHxW -> NxCx(D*H)xW From 35cdca1fc918b6d565bfa5e86aa46d4aabd10884 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 2 Oct 2024 05:36:57 +0000 Subject: [PATCH 18/27] fix run variance issue --- driver/bn_driver.hpp | 113 +++++++++++++++++++++++++++++-------------- test/fusionHost.hpp | 65 ++++++++++++++++++++----- 2 files changed, 130 insertions(+), 48 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index dc42190314..e8ae9ff216 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -253,22 +253,19 @@ int BatchNormDriver::GetandSetData() bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - if(saveMeanVar && keepRunningMeanVar) - { - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); - auto gen_in_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedInvVar.InitHostData( - savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); - } + auto gen_in_var = [](auto...) 
{ + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); } else { std::cout << "\nUnknown batch norm state!\n"; exit(EXIT_FAILURE); } + return miopenStatusSuccess; } @@ -590,6 +587,16 @@ int BatchNormDriver::AllocateBuffersAndCopy() tensor{dBias.GetTensor().desc.GetLayout_t(), dBias.GetTensor().desc.GetLengths()}; } + for(size_t i = 0; i < runMean.GetVector().size(); ++i) + { + runMean_ref.data[i] = static_cast(runMean.GetVector()[i]); + } + + for(size_t i = 0; i < runVariance.GetVector().size(); ++i) + { + runVariance_ref.data[i] = static_cast(runVariance.GetVector()[i]); + } + if(status != STATUS_SUCCESS) printf("Fatal: Error copying data to GPU\nExiting...\n\n"); @@ -913,6 +920,7 @@ void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW + batchNormPerActHostFwdTrain(in.GetTensor(), out_ref, scale.GetTensor(), @@ -926,16 +934,34 @@ void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - batchNormSpatialHostFwdTrain(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - static_cast(epsilon), - static_cast(eAF), - savedMean_ref, - savedVariance_ref, - runMean_ref, - runVariance_ref); + + if(forw == 2 && !keepRunningMeanVar) + { + tensor empty_tensor; + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + empty_tensor, // savedMean_ref + empty_tensor, // savedVariance_ref + empty_tensor, // runMean_ref + empty_tensor); // runVariance_ref + } + else + { + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); + } } else { @@ -952,7 +978,7 @@ int BatchNormDriver::RunForwardCPU() Tref epsilon = 
static_cast(EPSILON); Tref eAF = static_cast(1.0); - if(forw == 1) + if(forw == 1 || (forw == 2 && !keepRunningMeanVar)) { // training only for(int i = 0; i < inflags.GetValueInt("iter"); i++) { @@ -960,10 +986,16 @@ int BatchNormDriver::RunForwardCPU() runCPUFwdTrain(epsilon, eAF /* alpha, beta,*/); } } - else if(forw == 2) - { // inference only + else if(forw == 2 && keepRunningMeanVar) + { + // inference only runCPUFwdInference(epsilon); } + else + { + printf("Unsupported forward cpu run state.\nExiting...\n\n"); + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) + } return miopenStatusSuccess; } @@ -1173,12 +1205,8 @@ int BatchNormDriver::VerifyForward() if(saveMeanVar) { // copy back for verification - // saveMean_dev->FromGPU(GetStream(), savedMean.data()); - // saveInvVariance_dev->FromGPU(GetStream(), savedInvVar.data()); - savedMean.CopyFromDeviceToHost(GetStream()); savedVariance.CopyFromDeviceToHost(GetStream()); - maxval = static_cast(0.0); auto errorSaveMean = miopen::rms_range(savedMean_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) @@ -1252,7 +1280,6 @@ int BatchNormDriver::VerifyForward() maxval = static_cast(0.0); auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); - if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; @@ -1356,15 +1383,30 @@ int BatchNormDriver::RunBackwardCPU() } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 + if(saveMeanVar) + { - batchNormSpatialHostBwdTrain(in.GetTensor(), - dy.GetTensor(), - out_ref, - bnScale.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); + } + else + { + tensor empty_tensor; + batchNormSpatialHostBwdTrain(in.GetTensor(), + 
dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + empty_tensor, + empty_tensor); + } } else { @@ -1399,7 +1441,6 @@ int BatchNormDriver::VerifyBackward() #endif maxval = static_cast(0.0); auto errordxout = miopen::rms_range(out_ref.data, out_bwd.GetVector()); - if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index ec271ef967..d525b79cf6 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -266,16 +266,21 @@ void batchNormSpatialHostFwdTrain(const tensor& input, } // for (column) } // for (row) } // end for(n_batchs) - - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = - (n_batch * height * width == 1) ? variance_accum : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? 
variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } }); } @@ -301,14 +306,50 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, par_for(channels, 1, [&](int cidx) { double elemStd = 0.; unsigned int xhat_index; - double mean = savedMean(0, cidx, 0, 0); // HxW elements - double invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + double mean = 0.0; + double invVar = 0.0; double dyelem = 0.; std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); // process the batch per channel dscale(0, cidx, 0, 0) = 0.; dbias(0, cidx, 0, 0) = 0.; + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } for(int row = 0; row < height; row++) { // via rows for(int column = 0; column < width; column++) From 5893793e1949410a24cd23d7470e4adb96a226e8 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 3 Oct 2024 14:02:11 +0000 Subject: [PATCH 19/27] fixed review comments --- src/driver_arguments.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 
1bd82b71b4..ce6a7593b5 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -66,7 +66,7 @@ void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) { ss << "bnormfp16"; } - if(desc.GetType() == miopenBFloat16) + else if(desc.GetType() == miopenBFloat16) { ss << "bnormbfp16"; } From 6463c480587ed502f3f08232defee583f72c4451 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 9 Oct 2024 15:18:26 +0000 Subject: [PATCH 20/27] create new API for batch norm --- include/miopen/miopen.h | 65 +++++ src/batch_norm_api.cpp | 246 ++++++++++++++---- src/driver_arguments.cpp | 23 +- src/fusion.cpp | 1 + src/include/miopen/batch_norm.hpp | 77 +++--- .../miopen/batchnorm/problem_description.hpp | 40 ++- src/include/miopen/driver_arguments.hpp | 1 + .../miopen/fusion/problem_description.hpp | 15 +- src/ocl/batchnormocl.cpp | 97 ++++--- test/bn_3d_peract_test.cpp | 15 ++ test/bn_3d_spatial_test.cpp | 15 ++ test/bn_peract_test.cpp | 15 ++ test/bn_spatial_test.cpp | 15 ++ 13 files changed, 494 insertions(+), 131 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 74be683ca3..4244b5d63f 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2738,6 +2738,28 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, void* resultSaveMean, void* resultSaveInvVariance); +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasVarDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance); 
+ /*! @brief Execute forward inference layer for batch normalization * * Batch normalization pass for forward inference pass. @@ -2783,6 +2805,25 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedVariance, double epsilon); +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon); + /*! @brief Execute backwards propagation layer for batch normalization * * Batch normalization pass for backwards propagation training pass. @@ -2838,6 +2879,30 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance); +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationBackward_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance); + /** @} */ // CLOSEOUT BATCHNORM DOXYGEN GROUP diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 8f184a9508..72e6a64554 100644 --- a/src/batch_norm_api.cpp 
+++ b/src/batch_norm_api.cpp @@ -50,6 +50,7 @@ namespace miopen { namespace debug { void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -60,6 +61,7 @@ void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, if(miopen::IsLoggingCmd()) { const std::string& str = BnormArgsForMIOpenDriver(xDesc, + sMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -88,6 +90,130 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedMean, void* estimatedVariance, double epsilon) +{ + return miopenBatchNormalizationForwardInference_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + nullptr, + nullptr, + nullptr, + bnScale, + bnBias, + estimatedMean, + estimatedVariance, + epsilon); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardTraining(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) +{ + return miopenBatchNormalizationForwardTraining_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + nullptr, + nullptr, + nullptr, + bnScale, + bnBias, + expAvgFactor, + resultRunningMean, + resultRunningVariance, + epsilon, + resultSaveMean, + resultSaveInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationBackward(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t 
xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t bnScaleBiasDiffDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance) +{ + return miopenBatchNormalizationBackward_V2(handle, + bn_mode, + alphaDataDiff, + betaDataDiff, + alphaParamDiff, + betaParamDiff, + xDesc, + x, + dyDesc, + dy, + dxDesc, + dx, + bnScaleBiasDiffDesc, + nullptr, + nullptr, + nullptr, + bnScale, + resultBnScaleDiff, + resultBnBiasDiff, + epsilon, + savedMean, + savedInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon) { MIOPEN_LOG_FUNCTION(handle, bn_mode, @@ -95,7 +221,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + (BiasDesc == nullptr) ? scaleDesc : BiasDesc, + (estMeanDesc == nullptr) ? scaleDesc : estMeanDesc, + (estVarianceDesc == nullptr) ? scaleDesc : estVarianceDesc, bnScale, bnBias, estimatedMean, @@ -103,12 +232,14 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, epsilon); miopen::debug::LogCmdBNorm(xDesc, + (estMeanDesc == nullptr) ? 
scaleDesc : estMeanDesc, bn_mode, estimatedMean, estimatedVariance, nullptr, nullptr, miopen::debug::BatchNormDirection_t::ForwardInference); + // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); @@ -124,9 +255,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc)) - : miopen::deref(bnScaleBiasMeanVarDesc), + miopen::deref(scaleDesc), + miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), + miopen::deref((estMeanDesc == nullptr) ? scaleDesc : estMeanDesc), + miopen::deref((estVarianceDesc == nullptr) ? scaleDesc : estVarianceDesc), DataCast(bnScale), DataCast(bnBias), DataCast(estimatedMean), @@ -136,32 +268,37 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, } extern "C" miopenStatus_t -miopenBatchNormalizationForwardTraining(miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - void* alpha, - void* beta, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t yDesc, - void* y, - const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, - void* bnScale, - void* bnBias, - double expAvgFactor, - void* resultRunningMean, - void* resultRunningVariance, - double epsilon, - void* resultSaveMean, - void* resultSaveInvVariance) +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarianceDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + 
double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) { - MIOPEN_LOG_FUNCTION(handle, bn_mode, xDesc, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + (BiasDesc == nullptr) ? scaleDesc : BiasDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, + (savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc, bnScale, bnBias, expAvgFactor, @@ -172,6 +309,7 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, resultSaveInvVariance); miopen::debug::LogCmdBNorm(xDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -193,9 +331,10 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc)) - : miopen::deref(bnScaleBiasMeanVarDesc), + miopen::deref(scaleDesc), + miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), + miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), + miopen::deref((savedVarianceDesc == nullptr) ? 
scaleDesc : savedVarianceDesc), DataCast(bnScale), DataCast(bnBias), expAvgFactor, @@ -208,27 +347,29 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, } extern "C" miopenStatus_t -miopenBatchNormalizationBackward(miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - const void* alphaDataDiff, - const void* betaDataDiff, - const void* alphaParamDiff, - const void* betaParamDiff, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t dyDesc, - const void* dy, - const miopenTensorDescriptor_t dxDesc, - void* dx, - const miopenTensorDescriptor_t bnScaleBiasDiffDesc, - const void* bnScale, - void* resultBnScaleDiff, - void* resultBnBiasDiff, - double epsilon, - const void* savedMean, - const void* savedInvVariance) +miopenBatchNormalizationBackward_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarianceDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance) { - MIOPEN_LOG_FUNCTION(handle, bn_mode, xDesc, @@ -237,7 +378,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, dy, dxDesc, dx, - bnScaleBiasDiffDesc, + scaleDesc, + (BiasDesc == nullptr) ? scaleDesc : BiasDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, + (savedVarianceDesc == nullptr) ? 
scaleDesc : savedVarianceDesc, bnScale, resultBnScaleDiff, resultBnBiasDiff, @@ -245,6 +389,7 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, savedMean, savedInvVariance); miopen::debug::LogCmdBNorm(xDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, bn_mode, nullptr, nullptr, @@ -271,9 +416,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(dxDesc)) : miopen::deref(dxDesc), DataCast(dx), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasDiffDesc)) - : miopen::deref(bnScaleBiasDiffDesc), + miopen::deref(scaleDesc), + miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), + miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), + miopen::deref((savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc), DataCast(bnScale), DataCast(resultBnScaleDiff), DataCast(resultBnBiasDiff), diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index ce6a7593b5..971977afa9 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -60,16 +60,28 @@ void ConvDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) } } -void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) +// test based on the input tensor and scaleMean. +// We choose scaleMean because its a accumulator type. 
+void BnDataType(std::stringstream& ss,
+                const miopen::TensorDescriptor& xDesc,
+                const miopen::TensorDescriptor& sMeanDesc)
 {
-    if(desc.GetType() == miopenHalf)
+    if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenHalf)
     {
         ss << "bnormfp16";
     }
-    else if(desc.GetType() == miopenBFloat16)
+    else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenBFloat16)
     {
         ss << "bnormbfp16";
     }
+    else if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat)
+    {
+        ss << "bnormfp16fp32";
+    }
+    else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenFloat)
+    {
+        ss << "bnormbfp16fp32";
+    }
     else
     {
         ss << "bnorm";
@@ -215,7 +227,8 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc,
     return ss.str();
 }
 
-std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc,
+std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc,
+                                     const miopenTensorDescriptor_t sMeanDesc,
                                      miopenBatchNormMode_t bn_mode,
                                      const void* resultRunningMean,
                                      const void* resultRunningVariance,
@@ -228,7 +241,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc,
     miopenGetTensorDescriptorSize(xDesc, &size);
     std::stringstream ss;
     if(print_for_bn_driver)
-        BnDataType(ss, miopen::deref(xDesc));
+        BnDataType(ss, miopen::deref(xDesc), miopen::deref(sMeanDesc));
 
     ss << " -n " << miopen::deref(xDesc).GetLengths()[0] // clang-format off
        << " -c " << miopen::deref(xDesc).GetLengths()[1];
diff --git a/src/fusion.cpp b/src/fusion.cpp
index a9ef5e27a9..e536f6a1a1 100644
--- a/src/fusion.cpp
+++ b/src/fusion.cpp
@@ -390,6 +390,7 @@ std::string LogCmdBnormFusion(const miopenFusionPlanDescriptor_t fusePlanDesc, i
     if(bn_op != nullptr)
     {
         str += BnormArgsForMIOpenDriver(&bn_op->input_desc,
+                                        &bn_op->base_desc,
                                         bn_op->mode,
                                         nullptr,
                                         nullptr,
diff --git a/src/include/miopen/batch_norm.hpp b/src/include/miopen/batch_norm.hpp
index 50c309550c..92444f039b 100644
--- a/src/include/miopen/batch_norm.hpp
+++ 
b/src/include/miopen/batch_norm.hpp @@ -163,40 +163,44 @@ void bnFwdTrainSelectMulti(const Handle& handle, void profileSequence(const Handle& handle, unsigned char select, float* ctime); -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardInference(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - ConstData_t estimatedMean, - ConstData_t estimatedVariance, - double epsilon); - -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardTraining(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, /* these don't seem to be used in conv */ - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - double expAvgFactor, - Data_t resultRunningMean, - Data_t resultRunningVariance, - double epsilon, - Data_t resultSaveMean, - Data_t resultSaveInvVariance); +MIOPEN_INTERNALS_EXPORT void BatchNormForwardInference(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + ConstData_t estimatedMean, + ConstData_t estimatedVariance, + double epsilon); + +MIOPEN_INTERNALS_EXPORT void BatchNormForwardTraining(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const 
TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + double expAvgFactor, + Data_t resultRunningMean, + Data_t resultRunningVariance, + double epsilon, + Data_t resultSaveMean, + Data_t resultSaveInvVariance); MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, @@ -210,7 +214,10 @@ MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index b87494b725..d28e91adfd 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -58,7 +58,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double expAvgFactor_, double epsilon_, bool resultsave_, @@ -67,7 +70,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), expAvgFactor(expAvgFactor_), epsilon(epsilon_), resultsave(resultsave_), @@ -82,13 +88,19 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : 
ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_) : direction(Direction::ForwardInference), bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_) { SetSpatialDims(); @@ -101,7 +113,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& xDesc_, const TensorDescriptor& dyDesc_, const TensorDescriptor& dxDesc_, - const TensorDescriptor& bnScaleBiasDiffDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_, bool useSaved_) : direction(Direction::Backward), @@ -109,7 +124,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob xDesc(xDesc_), yOrDyDesc(dyDesc_), dxDesc(dxDesc_), - scaleBiasDesc(bnScaleBiasDiffDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_), useSaved(useSaved_) { @@ -153,13 +171,13 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& GetBnScaleBiasMeanVarDesc() const { assert(direction == Direction::ForwardTraining || direction == Direction::ForwardInference); - return scaleBiasDesc; + return scaleDesc; } const TensorDescriptor& GetScaleBiasDiffDesc() const { assert(direction == Direction::Backward); - return scaleBiasDesc; + return scaleDesc; } bool GetResultSave() const @@ -215,7 +233,11 @@ struct MIOPEN_INTERNALS_EXPORT 
ProblemDescription : ProblemDescriptionBase, Prob TensorDescriptor xDesc; // input TensorDescriptor yOrDyDesc; // output TensorDescriptor dxDesc; - TensorDescriptor scaleBiasDesc; + + TensorDescriptor scaleDesc; // scale + TensorDescriptor biasDesc; // bias (shift) + TensorDescriptor sMeanDesc; + TensorDescriptor sVarianceDesc; #ifdef __clang__ #pragma clang diagnostic push diff --git a/src/include/miopen/driver_arguments.hpp b/src/include/miopen/driver_arguments.hpp index da4064b7f0..a964e7fe27 100644 --- a/src/include/miopen/driver_arguments.hpp +++ b/src/include/miopen/driver_arguments.hpp @@ -67,6 +67,7 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, bool print_for_conv_driver = true); std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, + miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, diff --git a/src/include/miopen/fusion/problem_description.hpp b/src/include/miopen/fusion/problem_description.hpp index bcb37878d9..b3d1669cee 100644 --- a/src/include/miopen/fusion/problem_description.hpp +++ b/src/include/miopen/fusion/problem_description.hpp @@ -128,7 +128,14 @@ struct FusionDescription : ProblemDescriptionBase dynamic_cast(*fusion_plan_desc->op_map[idx]); miopen::TensorDescriptor out_desc; bn_op.GetOutputDesc(out_desc); - return {bn_op.mode, bn_op.input_desc, out_desc, bn_op.base_desc, not_used}; + return {bn_op.mode, + bn_op.input_desc, + out_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + not_used}; } else if(dir == miopen::batchnorm::Direction::ForwardTraining) { @@ -140,6 +147,9 @@ struct FusionDescription : ProblemDescriptionBase bn_op.input_desc, out_desc, bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, not_used, // expAvgFactor filler not_used, true /* resultSave*/, @@ -156,6 +166,9 @@ struct FusionDescription : ProblemDescriptionBase out_desc, 
bn_op.input_desc, {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, not_used, bn_op.useBatchStats /*useSaved*/}; } diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 40bcd34935..f33c5ac5db 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -55,6 +55,8 @@ miopen::PerformanceDb GetDb(const miopen::ExecutionContext& ctx, } } // namespace batchnorm +//============ BEGIN FORWARD TRAINING =============== + void BatchNormForwardTraining(Handle& handle, miopenBatchNormMode_t bn_mode, const void* alpha, @@ -63,7 +65,10 @@ void BatchNormForwardTraining(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, double expAvgFactor, @@ -73,13 +78,14 @@ void BatchNormForwardTraining(Handle& handle, Data_t resultSaveMean, Data_t resultSaveInvVariance) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + if(xDesc.GetNumDims() != yDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -105,9 +111,9 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); if(bnScale != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); if(bnBias != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnBias); + 
miopen::checkNumericsInput(handle, biasDesc, bnBias); } const auto resultsave = resultSaveMean != nullptr && resultSaveInvVariance != nullptr; @@ -116,7 +122,10 @@ void BatchNormForwardTraining(Handle& handle, const auto problem = batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + savedMeanDesc, + savedVarianceDesc, expAvgFactor, epsilon, resultsave, @@ -153,15 +162,16 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsOutput(handle, yDesc, y); if(resultRunningMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultRunningMean); if(resultRunningVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultRunningVariance); if(resultSaveMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultSaveMean); if(resultSaveInvVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveInvVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultSaveInvVariance); } } + //================== END FWD TRAIN =================== //============ BEGIN FORWARD INFERENCE =============== @@ -173,31 +183,37 @@ void BatchNormForwardInference(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, ConstData_t estimatedMean, ConstData_t estimatedVariance, double epsilon) { + if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsInput(handle, xDesc, x); - miopen::checkNumericsInput(handle, 
bnScaleBiasMeanVarDesc, bnScale); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnBias); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedMean); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedVariance); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnBias); + miopen::checkNumericsInput(handle, estMeanDesc, estimatedMean); + miopen::checkNumericsInput(handle, estVarianceDesc, estimatedVariance); } if(estimatedMean != nullptr && estimatedVariance != nullptr) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != estMeanDesc.GetNumDims() || + xDesc.GetNumDims() != estVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -216,8 +232,8 @@ void BatchNormForwardInference(Handle& handle, MIOPEN_THROW(miopenStatusBadParm); } - const auto problem = - batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, bnScaleBiasMeanVarDesc, epsilon}; + const auto problem = batchnorm::ProblemDescription{ + bn_mode, xDesc, yDesc, scaleDesc, biasDesc, estMeanDesc, estVarianceDesc, epsilon}; const auto invoke_params = [&]() { auto tmp = batchnorm::InfInvokeParams{}; @@ -250,7 +266,10 @@ void BatchNormForwardInference(Handle& handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, 0, @@ -265,9 +284,11 @@ void BatchNormForwardInference(Handle& handle, miopen::checkNumericsOutput(handle, yDesc, y); } } + //================= END FORWARD INFERENCE ==================== //=============== BEGIN BACKWARDS PROPAGATION ================ + void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, const void* 
alphaDataDiff, @@ -280,7 +301,10 @@ void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, @@ -296,20 +320,23 @@ void BatchNormBackward(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); miopen::checkNumericsInput(handle, dyDesc, dy); - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnScale); if(savedMean != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedMean); + miopen::checkNumericsInput(handle, savedMeanDesc, savedMean); if(savedInvVariance != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedInvVariance); + miopen::checkNumericsInput(handle, savedVarianceDesc, savedInvVariance); } if(x == nullptr || dy == nullptr || bnScale == nullptr || dx == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != dyDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasDiffDesc.GetNumDims()) + if(xDesc.GetNumDims() != dyDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -336,8 +363,16 @@ void BatchNormBackward(Handle& handle, const auto useSaved = savedMean != nullptr && savedInvVariance != nullptr; - const auto problem = batchnorm::ProblemDescription{ - bn_mode, xDesc, dyDesc, dxDesc, bnScaleBiasDiffDesc, epsilon, useSaved}; + const auto problem = batchnorm::ProblemDescription{bn_mode, + xDesc, + dyDesc, + dxDesc, + scaleDesc, + biasDesc, + 
savedMeanDesc, + savedVarianceDesc, + epsilon, + useSaved}; const auto algo = bn_mode == miopenBNSpatial ? AlgorithmName{"miopenBatchNormBackwardPropSpatial"} @@ -368,8 +403,8 @@ void BatchNormBackward(Handle& handle, if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsOutput(handle, dxDesc, dx); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnScaleDiff); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnBiasDiff); + miopen::checkNumericsOutput(handle, scaleDesc, resultBnScaleDiff); + miopen::checkNumericsOutput(handle, biasDesc, resultBnBiasDiff); } } } // namespace miopen diff --git a/test/bn_3d_peract_test.cpp b/test/bn_3d_peract_test.cpp index 19fd15e7ce..c5f96ff9ba 100644 --- a/test/bn_3d_peract_test.cpp +++ b/test/bn_3d_peract_test.cpp @@ -281,6 +281,9 @@ struct verify_forward_train_3d_bn_per_activation BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -450,6 +453,9 @@ struct verify_forward_infer_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -573,6 +579,9 @@ struct verify_forward_infer_3d_bn_per_activation_use_est BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -747,6 +756,9 @@ struct verify_backward_3d_bn_per_activation_use_saved 
BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -948,6 +960,9 @@ struct verify_backward_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_3d_spatial_test.cpp b/test/bn_3d_spatial_test.cpp index 8d428fca2b..08bfdb5a57 100644 --- a/test/bn_3d_spatial_test.cpp +++ b/test/bn_3d_spatial_test.cpp @@ -327,6 +327,9 @@ struct verify_forward_train_3d_bn_spatial miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -516,6 +519,9 @@ struct verify_forward_infer_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -632,6 +638,9 @@ struct verify_forward_infer_3d_bn_spatial_use_est miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + 
miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -913,6 +922,9 @@ struct verify_backward_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1138,6 +1150,9 @@ struct verify_backward_3d_bn_spatial_use_saved miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_peract_test.cpp b/test/bn_peract_test.cpp index 6622230666..4d83e05df7 100644 --- a/test/bn_peract_test.cpp +++ b/test/bn_peract_test.cpp @@ -271,6 +271,9 @@ struct verify_forward_train_bn_per_activation out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -433,6 +436,9 @@ struct verify_forward_infer_bn_per_activation_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -550,6 +556,9 @@ struct verify_forward_infer_bn_per_activation_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -716,6 +725,9 @@ struct verify_backward_bn_per_activation_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -909,6 +921,9 @@ struct verify_backward_bn_per_activation_recalc dx_out.desc, dx_out_dev.get(), 
scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_spatial_test.cpp b/test/bn_spatial_test.cpp index 82d1cc271b..95a8ee099a 100644 --- a/test/bn_spatial_test.cpp +++ b/test/bn_spatial_test.cpp @@ -308,6 +308,9 @@ struct verify_forward_train_bn_spatial out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -484,6 +487,9 @@ struct verify_forward_infer_bn_spatial_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -596,6 +602,9 @@ struct verify_forward_infer_bn_spatial_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -853,6 +862,9 @@ struct verify_backward_bn_spatial_recalc dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1065,6 +1077,9 @@ struct verify_backward_bn_spatial_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), From 978bba1a7f6e7d1d6ee90827890c28e4688dc15f Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 15 Oct 2024 15:26:30 +0000 Subject: [PATCH 21/27] add test for V2 api --- include/miopen/miopen.h | 112 +++++++++++++++++- test/gtest/bn.hpp | 225 ++++++++++++++++++++++++++---------- test/gtest/bn_bwd.cpp | 117 +++++++++++++++---- test/gtest/bn_fwd_train.cpp | 113 ++++++++++++++---- test/gtest/bn_infer.cpp | 126 ++++++++++++++------ test/gtest/bn_test_data.hpp | 21 +++- 6 files changed, 578 insertions(+), 136 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 4244b5d63f..63c3335b1a 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2737,7 +2737,46 @@ 
miopenBatchNormalizationForwardTraining(miopenHandle_t handle,
                                         double epsilon,
                                         void* resultSaveMean,
                                         void* resultSaveInvVariance);
-
+/*! @brief Execute forward training layer for batch normalization
+ *
+ * Batch normalization pass for forward training pass.
+ * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and
+ * bnScale with their descriptors.
+ *
+ * If either resultSaveMean, or resultSaveInvVariance are null pointers then the values for the mean
+ * and inverse variance will not be used.
+ *
+ * Likewise, if either resultRunningMean, or resultRunningVariance are null pointers then the values
+ * for the running mean and variance will not be saved.
+ * Running averages and variances are scaled using an exponential averaging factor: \f[
+ * \mu_{old} = \mu_{new}*factor + \mu_{old}*(1-factor)
+ * \f]
+ * where \f[
+ * factor=1/(1+iteration)
+ * \f]
+ *
+ * @param handle                    MIOpen handle (input)
+ * @param bn_mode                   Batch normalization mode (input)
+ * @param alpha                     Floating point scaling factor, allocated on the host (input)
+ * @param beta                      Floating point shift factor, allocated on the host (input)
+ * @param xDesc                     Tensor descriptor for data input tensor x (input)
+ * @param x                         Data tensor x (input)
+ * @param yDesc                     Tensor descriptor for output data tensor y (input)
+ * @param y                         Data tensor y (output)
+ * @param scaleDesc                 Tensor descriptor for BN scaling (input)
+ * @param BiasDesc                  Tensor descriptor for BN bias (input)
+ * @param savedMeanDesc             Tensor descriptor for BN saved Mean (input)
+ * @param savedVarianceDesc         Tensor descriptor for BN saved Variance (input)
+ * @param bnScale                   Batch norm scaling, gamma, tensor (input)
+ * @param bnBias                    Batch norm bias, beta, tensor (input)
+ * @param expAvgFactor              Exponential averaging factor (input)
+ * @param resultRunningMean         Running average saved for inference (output)
+ * @param resultRunningVariance     Running variance saved for inference (output)
+ * @param epsilon                   Value to stabilize inverse variance calculation (input)
+ * 
@param resultSaveMean Saved mini-batch mean for backwards pass (output) + * @param resultSaveInvVariance Saved mini-batch inverse variance for backwards pass (output) + * @return miopenStatus_t + */ MIOPEN_EXPORT miopenStatus_t miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, miopenBatchNormMode_t bn_mode, @@ -2805,6 +2844,37 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedVariance, double epsilon); +/*! @brief Execute forward inference layer for batch normalization + * + * Batch normalization pass for forward inference pass. + * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale + * with their descriptor. + * + * If either estimatedMean, or estimatedVariance are null pointers then the values for the mean and + * variance will be calculated from input data and this calculated mean and variance will be used + * to update input values. + * If variance is zero and epsilon is also zero, this function outputs NAN values. Input espilon + * value should always be non zero positive value. 
+ * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alpha Floating point scaling factor, allocated on the host (input) + * @param beta Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param ScaleDesc Tensor descriptor for BN scaling + * @param biasVarDesc Tensor descriptor for BN bias + * @param estMeanDesc Tensor descriptor for BN estimated Mean + * @param estVarianceDesc Tensor descriptor for BN estimated Variance + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param bnBias Batch norm bias, beta, tensor (input) + * @param estimatedMean Running average saved during forward training (input) + * @param estimatedVariance Running variance saved during forward training (input) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @return miopenStatus_t + */ MIOPEN_EXPORT miopenStatus_t miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, miopenBatchNormMode_t bn_mode, @@ -2815,7 +2885,7 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, const miopenTensorDescriptor_t yDesc, void* y, const miopenTensorDescriptor_t scaleDesc, - const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t biasDesc, const miopenTensorDescriptor_t estMeanDesc, const miopenTensorDescriptor_t estVarianceDesc, void* bnScale, @@ -2879,6 +2949,44 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance); +/*! @brief Execute backwards propagation layer for batch normalization + * + * Batch normalization pass for backwards propagation training pass. + * The method for backwards propagation batch normalization. 
+ * + * Takes in batch normalization mode bn_mode and input tensor data x, input activation tensor dy, + * output tensor dx, the learned tensors resultBNBiasDiff and resultBNScaleDiff with their + * descriptor. + * + * If BOTH savedMean, and savedVariance are not null pointers then the method will use the saved + * mean and variance calculated by the forward training phase. + * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alphaDataDiff Floating point scaling factor, allocated on the host (input) + * @param betaDataDiff Floating point shift factor, allocated on the host (input) + * @param alphaParamDiff Floating point scaling factor, allocated on the host (input) + * @param betaParamDiff Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param dyDesc Tensor descriptor for output data tensor y (input) + * @param dy Data tensor y (input) + * @param dxDesc Tensor descriptor for output data tensor dx (input) + * @param dx Data delta tensor dx (output) + * @param scaleDesc Tensor descriptor for scaling descriptor (input) + * @param biasDesc Tensor descriptor for bias/shift descriptor (input) + * @param savedMeanDesc Tensor descriptor for saved Mean descriptor (input) + * @param savedVarDesc Tensor descriptor for saved Variance descriptor (input) + * , shifting, saved variance and + * mean (input) + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param resultBnScaleDiff Tensor for dscale (output) + * @param resultBnBiasDiff Tensor for dbias (output) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @param savedMean Saved mini-batch mean for backwards pass (input) + * @param savedInvVariance Saved mini-bathc inverse variance for backwards pass (input) + * @return miopenStatus_t + */ MIOPEN_EXPORT miopenStatus_t 
miopenBatchNormalizationBackward_V2(miopenHandle_t handle, miopenBatchNormMode_t bn_mode, diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..fdff351f79 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -32,17 +32,25 @@ #include "bn_test_data.hpp" #include "test_operations.hpp" +// Define an enum to identify which version of BN api to call +enum BNApiType +{ + testBNAPIV1, + testBNAPIV2, +}; + template -struct BNInferTest : public ::testing::TestWithParam> +struct BNInferTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_infer_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); @@ -51,21 +59,47 @@ struct BNInferTest : public ::testing::TestWithParam bn_infer_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; template -struct BNBwdTest : public ::testing::TestWithParam> +struct BNBwdTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = miopenBatchNormalizationBackward(&handle, - bn_config.mode, - &bn_bwd_test_data.alphaDataDiff, - &bn_bwd_test_data.betaDataDiff, - &bn_bwd_test_data.alphaParamDiff, - &bn_bwd_test_data.betaParamDiff, - &bn_bwd_test_data.input.desc, - bn_bwd_test_data.in_dev.get(), - &bn_bwd_test_data.dy.desc, - bn_bwd_test_data.dy_dev.get(), - &bn_bwd_test_data.output.desc, - bn_bwd_test_data.out_dev.get(), - &bn_bwd_test_data.bnScale.desc, - bn_bwd_test_data.bnScale_dev.get(), - bn_bwd_test_data.dScale_dev.get(), - bn_bwd_test_data.dBias_dev.get(), - bn_bwd_test_data.epsilon, - bn_bwd_test_data.savedMean_dev.get(), - bn_bwd_test_data.savedInvVar_dev.get()); + 
if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationBackward(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationBackward_V2(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + &bn_bwd_test_data.dBias.desc, + &bn_bwd_test_data.savedMean.desc, + &bn_bwd_test_data.savedInvVar.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationBackward failed"; @@ -183,6 +255,7 @@ struct BNBwdTest : public ::testing::TestWithParam bn_bwd_test_data; miopenTensorLayout_t 
tensor_layout; + BNApiType api_type; }; template struct BNFwdTrainTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = - miopenBatchNormalizationForwardTraining(&handle, - bn_config.mode, - &bn_fwd_train_test_data.alpha, - &bn_fwd_train_test_data.beta, - &bn_fwd_train_test_data.input.desc, - bn_fwd_train_test_data.in_dev.get(), - &bn_fwd_train_test_data.output.desc, - bn_fwd_train_test_data.out_dev.get(), - &bn_fwd_train_test_data.scale.desc, - bn_fwd_train_test_data.scale_dev.get(), - bn_fwd_train_test_data.shift_dev.get(), - bn_fwd_train_test_data.averageFactor, - bn_fwd_train_test_data.runMean_dev.get(), - bn_fwd_train_test_data.runVariance_dev.get(), - bn_fwd_train_test_data.epsilon, - bn_fwd_train_test_data.saveMean_dev.get(), - bn_fwd_train_test_data.saveVariance_dev.get()); + if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationForwardTraining( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + 
bn_fwd_train_test_data.saveVariance_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationForwardTraining_V2( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + &bn_fwd_train_test_data.shift.desc, + &bn_fwd_train_test_data.saveMean.desc, + &bn_fwd_train_test_data.saveVariance.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationForwardTraining failed"; @@ -275,4 +383,5 @@ struct BNFwdTrainTest BNFwdTrainTestData bn_fwd_train_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f2d54e8077..df093a4710 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -26,46 +26,123 @@ #include "bn.hpp" -struct GPU_BNBwd_FP16 +// https://github.com/ROCm/MIOpen/issues/1549 +// NCHW solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +// struct GPU_BN_V1_BwdNCHW_FP16 : BNBwdTest +// { +// }; + +// NHWC solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +struct GPU_BN_V2_BwdNHWC_FP16 : BNBwdTest { }; -struct 
GPU_BNBwd_FP32 : BNBwdTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V2_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V1_Bwd_FP32 : BNBwdTest +{ +}; + +struct GPU_BN_V2_Bwd_FP32 : BNBwdTest { }; -struct GPU_BNBwd_BFP16 : BNBwdTest +struct GPU_BN_V1_BwdNHWC_FP64 : BNBwdTest { }; -struct GPU_BNBwd_FP64 : BNBwdTest +struct GPU_BN_V2_BwdNHWC_FP64 : BNBwdTest { }; -TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} +// fp16 +// TEST_P(GPU_BN_V1_BwdNCHW_FP16, BnV1BwdHalf) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP16, BnV2BwdCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Bwd_FP32, BnV1BwdFloat) {} +TEST_P(GPU_BN_V2_Bwd_FP32, BnV2BwdFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_BFP16, BnV1BwdCKBfloat) {} +TEST_P(GPU_BN_V2_BwdNHWC_BFP16, BnV2BwdCKBfloat) {} -TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +// double is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_FP64, BnV1BwdCKDouble) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP64, BnV2BwdCKDouble) {} -TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +// // fp16 +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BN_V1_BwdNCHW_FP16, +// testing::Combine(testing::ValuesIn(NetworkSmall()), +// testing::Values(miopenTensorNCHW), +// testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_BwdNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_Bwd_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_Bwd_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + 
testing::ValuesIn({testBNAPIV2}))); +// bfp16 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_BwdNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_BwdNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_BwdNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_BwdNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index b5dbed4705..ed25631175 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -26,46 +26,119 @@ #include "bn.hpp" -struct GPU_BNFwdTrain_FP16 +// ** OCL kernel for fwd training are failing gtest ** +// ** Hence, this gtest only tests CK solvers ** + +// NHWC solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +struct GPU_BN_V1_FwdTrainNHWC_FP16 + : BNFwdTrainTest +{ +}; + +struct GPU_BN_V2_FwdTrainNHWC_FP16 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_FP32 : BNFwdTrainTest +// bf16 NHWC 
solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_FwdTrainNHWC_BFP16 : BNFwdTrainTest +{ +}; + +struct GPU_BN_V2_FwdTrainNHWC_BFP16 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_FP64 : BNFwdTrainTest +struct GPU_BN_V1_FwdTrainNHWC_FP32 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_BFP16 : BNFwdTrainTest +struct GPU_BN_V2_FwdTrainNHWC_FP32 : BNFwdTrainTest { }; -TEST_P(GPU_BNFwdTrain_FP16, BnFwdTrainCKHalf) {} +struct GPU_BN_V1_FwdTrainNHWC_FP64 : BNFwdTrainTest +{ +}; -TEST_P(GPU_BNFwdTrain_FP32, BnFwdTrainCKFloat) {} +struct GPU_BN_V2_FwdTrainNHWC_FP64 : BNFwdTrainTest +{ +}; -TEST_P(GPU_BNFwdTrain_FP64, BnFwdTrainCKDouble) {} -TEST_P(GPU_BNFwdTrain_BFP16, BnFwdTrainCKBFloat16) {} +// fp16 +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP16, BnV1FwdTrainHalf) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP16, BnV2FwdTrainCKHalf) {} + +// float +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP32, BnV1FwdTrainFloat) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP32, BnV2FwdTrainFloat) {} + +// bfp16 +TEST_P(GPU_BN_V1_FwdTrainNHWC_BFP16, BnV1FwdTrainCKBfloat) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_BFP16, BnV2FwdTrainCKBfloat) {} + +// double +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP64, BnV1FwdTrainCKDouble) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP64, BnV2FwdTrainCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + 
GPU_BN_V2_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 081d583213..aea15c097e 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -27,66 +27,124 @@ #include "bn.hpp" // NCHW solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : float -// BiasDataType : float -// MeanVarDataType : float -struct GPU_BNInferNCHW_FP16 : BNInferTest +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +struct GPU_BN_V1_InferNCHW_FP16 : 
BNInferTest { }; // NHWC solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : half -// BiasDataType : half +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half // MeanVarDataType : float -struct GPU_BNInferNHWC_FP16 +struct GPU_BN_V2_InferNHWC_FP16 : BNInferTest { }; -struct GPU_BNInfer_FP32 : BNInferTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_InferNHWC_BFP16 : BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_BFP16 : BNInferTest { }; -struct GPU_BNInfer_FP64 : BNInferTest +struct GPU_BN_V1_Infer_FP32 : BNInferTest { }; -struct GPU_BNInfer_BFP16 : BNInferTest +struct GPU_BN_V2_Infer_FP32 : BNInferTest { }; -TEST_P(GPU_BNInferNCHW_FP16, BnInferCKHalf) {} -TEST_P(GPU_BNInferNHWC_FP16, BnInferCKHalf) {} +struct GPU_BN_V1_InferNHWC_FP64 : BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_FP64 : BNInferTest +{ +}; -TEST_P(GPU_BNInfer_FP32, BnInferCKFloat) {} -TEST_P(GPU_BNInfer_FP64, BnInferCKDouble) {} -TEST_P(GPU_BNInfer_BFP16, BnInferCKBFloat16) {} +// fp16 +TEST_P(GPU_BN_V1_InferNCHW_FP16, BnV1InferHalf) {} +TEST_P(GPU_BN_V2_InferNHWC_FP16, BnV2InferCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Infer_FP32, BnV1InferFloat) {} +TEST_P(GPU_BN_V2_Infer_FP32, BnV2InferFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_BFP16, BnV1InferCKBfloat) {} +TEST_P(GPU_BN_V2_InferNHWC_BFP16, BnV2InferCKBfloat) {} + +// double is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_FP64, BnV1InferCKDouble) {} +TEST_P(GPU_BN_V2_InferNHWC_FP64, BnV2InferCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_InferNCHW_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + 
GPU_BN_V2_InferNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNCHW_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNCHW))); + GPU_BN_V2_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNHWC_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC, miopenTensorNCHW}))); + GPU_BN_V2_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V1_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V2_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_test_data.hpp 
b/test/gtest/bn_test_data.hpp index fcf237400b..2d8ab5c5bf 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -56,10 +56,13 @@ struct BNTestCase }; template -std::vector Network1(); +std::vector NetworkSmall(); + +template +std::vector NetworkLarge(); template <> -inline std::vector Network1() +inline std::vector NetworkLarge() { // pyt_mlperf_resnet50v1.5 return { @@ -95,6 +98,20 @@ inline std::vector Network1() {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } +template <> +inline std::vector NetworkSmall() +{ + // pyt_mlperf_resnet50v1.5 + return { + {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + + }; +} + template struct BNTestData { From d5212dbbb6eee023afbb10c1147dd0551147b1ad Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 15 Oct 2024 15:50:04 +0000 Subject: [PATCH 22/27] typo --- test/gtest/bn_test_data.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 2d8ab5c5bf..9afa8ea4ed 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -105,9 +105,8 @@ inline std::vector NetworkSmall() return { {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 2048, 7, 7, miopenBNSpatial, 
miopen::batchnorm::Direction::Backward, 0, 1}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, }; } From 7cef7665e1e4c1ade816b9873ff20c6188b82621 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 16 Oct 2024 20:01:42 +0000 Subject: [PATCH 23/27] address review comments --- driver/bn_driver.hpp | 25 +++++++------------ src/batch_norm_api.cpp | 54 +++++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index e8ae9ff216..da1be3a066 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -101,7 +101,7 @@ class BatchNormDriver : public Driver int VerifyForward() override; // Helper function to check the Layout type short names - int ChkLayout_ShortName(); + bool ChkLayout_ShortName(); // function to validate the Layout type parameters. // layout parameter value to std (NCHW/NHWC/NCDHW/NDHWC) values, // defined in MIOpen lib. 
@@ -345,14 +345,14 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } template -int BatchNormDriver::ChkLayout_ShortName() +bool BatchNormDriver::ChkLayout_ShortName() { // check for short name of layout type if(inflags.FindShortName("layout") == 'L') { // do noting // found valid short names - return 0; + return true; } else { @@ -364,23 +364,16 @@ int BatchNormDriver::ChkLayout_ShortName() template void BatchNormDriver::ValidateLayoutInputParameters(std::string layout_value) { - if((ChkLayout_ShortName())) + if(!ChkLayout_ShortName()) { - std::cerr << " Invalid Layout Short Name = " << ChkLayout_ShortName() << std::endl; + std::cerr << "Invalid Layout Short Name = " << inflags.FindShortName("layout") << std::endl; exit(EXIT_FAILURE); } - else + if((layout_value.compare("NCHW") != 0) && (layout_value.compare("NHWC") != 0) && + (layout_value.compare("NCDHW") != 0) && (layout_value.compare("NDHWC") != 0)) { - if((layout_value.compare("NCHW") == 0) || (layout_value.compare("NHWC") == 0) || - (layout_value.compare("NCDHW") == 0) || (layout_value.compare("NDHWC") == 0)) - { - // do nothing,Values are matching as defined in Lib. 
- } - else - { - std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; - exit(EXIT_FAILURE); - } + std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; + exit(EXIT_FAILURE); } } diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 72e6a64554..56d6fbb5e8 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -138,9 +138,9 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, yDesc, y, bnScaleBiasMeanVarDesc, - nullptr, - nullptr, - nullptr, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, bnScale, bnBias, expAvgFactor, @@ -185,9 +185,9 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, dxDesc, dx, bnScaleBiasDiffDesc, - nullptr, - nullptr, - nullptr, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, bnScale, resultBnScaleDiff, resultBnBiasDiff, @@ -222,9 +222,9 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, yDesc, y, scaleDesc, - (BiasDesc == nullptr) ? scaleDesc : BiasDesc, - (estMeanDesc == nullptr) ? scaleDesc : estMeanDesc, - (estVarianceDesc == nullptr) ? scaleDesc : estVarianceDesc, + BiasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, estimatedMean, @@ -232,7 +232,7 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, epsilon); miopen::debug::LogCmdBNorm(xDesc, - (estMeanDesc == nullptr) ? scaleDesc : estMeanDesc, + estMeanDesc, bn_mode, estimatedMean, estimatedVariance, @@ -256,9 +256,9 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, : miopen::deref(yDesc), DataCast(y), miopen::deref(scaleDesc), - miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), - miopen::deref((estMeanDesc == nullptr) ? scaleDesc : estMeanDesc), - miopen::deref((estVarianceDesc == nullptr) ? 
scaleDesc : estVarianceDesc), + miopen::deref(BiasDesc), + miopen::deref(estMeanDesc), + miopen::deref(estVarianceDesc), DataCast(bnScale), DataCast(bnBias), DataCast(estimatedMean), @@ -296,9 +296,9 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, yDesc, y, scaleDesc, - (BiasDesc == nullptr) ? scaleDesc : BiasDesc, - (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, - (savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc, + BiasDesc, + savedMeanDesc, + savedVarianceDesc, bnScale, bnBias, expAvgFactor, @@ -309,7 +309,7 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, resultSaveInvVariance); miopen::debug::LogCmdBNorm(xDesc, - (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, + savedMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -332,9 +332,9 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, : miopen::deref(yDesc), DataCast(y), miopen::deref(scaleDesc), - miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), - miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), - miopen::deref((savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc), + miopen::deref(BiasDesc), + miopen::deref(savedMeanDesc), + miopen::deref(savedVarianceDesc), DataCast(bnScale), DataCast(bnBias), expAvgFactor, @@ -379,9 +379,9 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, dxDesc, dx, scaleDesc, - (BiasDesc == nullptr) ? scaleDesc : BiasDesc, - (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, - (savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc, + BiasDesc, + savedMeanDesc, + savedVarianceDesc, bnScale, resultBnScaleDiff, resultBnBiasDiff, @@ -389,7 +389,7 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, savedMean, savedInvVariance); miopen::debug::LogCmdBNorm(xDesc, - (savedMeanDesc == nullptr) ? 
scaleDesc : savedMeanDesc, + savedMeanDesc, bn_mode, nullptr, nullptr, @@ -417,9 +417,9 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, : miopen::deref(dxDesc), DataCast(dx), miopen::deref(scaleDesc), - miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), - miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), - miopen::deref((savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc), + miopen::deref(BiasDesc), + miopen::deref(savedMeanDesc), + miopen::deref(savedVarianceDesc), DataCast(bnScale), DataCast(resultBnScaleDiff), DataCast(resultBnBiasDiff), From 379a72020907d09bf8e3bc9a190192f0ea49da11 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 16 Oct 2024 20:22:25 +0000 Subject: [PATCH 24/27] driver to use V2 bath norm API --- driver/bn_driver.hpp | 329 +++++++++++++++++++++------------------ src/driver_arguments.cpp | 2 +- 2 files changed, 179 insertions(+), 152 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index da1be3a066..238b4ea1e6 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -602,37 +602,43 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a if(keepRunningMeanVar) { // use precalculated mean and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - estMean.GetDevicePtr(), - estVariance.GetDevicePtr(), - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + estMean.GetDevicePtr(), + estVariance.GetDevicePtr(), + epsilon); } else { // recalculate mean 
and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - nullptr, - nullptr, - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + nullptr, + nullptr, + epsilon); } return; @@ -646,103 +652,118 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, { if(saveMeanVar && keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - 
savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + 
eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } @@ -1015,47 +1036,53 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - &in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out_bwd.GetTensor().desc, - out_bwd.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedInvVar.GetDevicePtr()); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr()); } else { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - 
&alphaParamDiff, - &betaParamDiff, - &in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out_bwd.GetTensor().desc, - out_bwd.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 971977afa9..e75ec31902 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -76,7 +76,7 @@ void BnDataType(std::stringstream& ss, } else if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat) { - ss << "bnormbfp16fp32"; + ss << "bnormfp16fp32"; } else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenFloat) { From c746d3dfacf7a1f7424dfee0f54b0631a5ef7994 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 17 Oct 2024 00:14:09 +0000 Subject: [PATCH 25/27] Update fin to develop branch --- fin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fin b/fin index 8c40a3c3b4..344cf42f6c 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 8c40a3c3b41a7d2fb31a8e747155fde4223919b9 +Subproject commit 344cf42f6c18f309f3d1dd08af1cd7b73dd38e46 From d2b851fa7e3af20738e35a9919b040abe38d7284 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 17 Oct 2024 01:51:33 +0000 Subject: [PATCH 26/27] fix hip tidy --- test/gtest/bn.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index fdff351f79..e1f192c37d 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -59,7 +59,7 @@ struct BNInferTest test_skipped = true; GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; } - miopenStatus_t res; + miopenStatus_t res = miopenStatusUnknownError; if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationForwardInference(&handle, @@ -155,7 +155,7 @@ struct BNBwdTest test_skipped = true; GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; } - miopenStatus_t res; + miopenStatus_t res = miopenStatusUnknownError; if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationBackward(&handle, @@ -278,7 +278,7 @@ struct BNFwdTrainTest test_skipped = true; GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; } - miopenStatus_t res; + miopenStatus_t res = miopenStatusUnknownError; if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationForwardTraining( From cf62a8f194274657c663a9fadf6a548a24654e91 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 17 Oct 2024 03:48:05 +0000 Subject: [PATCH 27/27] fix CI --- src/batch_norm_api.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 56d6fbb5e8..d3b824cee0 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -100,9 +100,9 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, yDesc, y, bnScaleBiasMeanVarDesc, - nullptr, - nullptr, - nullptr, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,