Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable NCHW/NHWC and NCDHW/NDHWC layout in batch norm driver command #3234

Open
wants to merge 19 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,174 changes: 563 additions & 611 deletions driver/bn_driver.hpp

Large diffs are not rendered by default.

129 changes: 0 additions & 129 deletions driver/conv_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,135 +180,6 @@ static inline miopenDataType_t DataTypeFromShortString(const std::string& type)
}
}

/// Pairs a host-side tensor with its GPU mirror buffer for MIOpenDriver runs
/// and keeps the two in sync. In "--gpualloc 1" mode the host copy is dropped
/// entirely and device buffers are filled via rocrand instead; validation is
/// not possible in that mode, only performance measurement.
template <typename Tgpu>
class GpumemTensor
{
    std::unique_ptr<GPUMem> dev; // device-side buffer (owned)
    tensor<Tgpu> host;           // host-side shadow copy (empty in gpualloc mode)
    bool is_gpualloc = false;    // true => "--gpualloc 1": no host data kept

public:
    void SetGpuallocMode(bool v) { is_gpualloc = v; }
    tensor<Tgpu>& GetTensor() { return host; }

    /// Allocates the host tensor with the shape described by \p t.
    /// In gpualloc mode the freshly created data vector is released at once,
    /// keeping only the descriptor.
    void AllocOnHost(miopenTensorDescriptor_t t)
    {
        host = tensor<Tgpu>(miopen::deref(t));
        if(is_gpualloc) // We do not need host data.
        {
            host.data.clear();
            host.data.shrink_to_fit(); // To free host memory.
        }
    }

    /// Host data accessor; must not be used in gpualloc mode because no host
    /// buffer exists there.
    std::vector<Tgpu>& GetVector()
    {
        if(is_gpualloc)
            MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in "
                         "'--gpualloc 1' mode");
        return host.data;
    }

    // Null/zero in gpualloc mode so callers can pass the results straight to
    // APIs that tolerate empty buffers.
    Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); }
    std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); }

    /// Fills the host buffer with \p sz values drawn from \p generator.
    void
    InitHostData(const size_t sz, //
                 const bool do_write, // If set to false, then only generate random data. This is
                                      // necessary to reproduce values in input buffers even if some
                                      // directions are skipped. For example, inputs for Backward
                                      // will be the same for both "-F 0" and "-F 2".
                 std::function<Tgpu()> generator)
    {
        if(is_gpualloc)
        {
            /// In gpualloc mode, we do not care about reproducibility of results, because
            /// validation is not used. Therefore, we do not have to always generate random value
            /// (\ref move_rand)
            return;
        }

        for(size_t i = 0; i < sz; ++i)
        {
            /// \anchor move_rand
            /// Generate random value, even if buffer is unused. This provides the same
            /// initialization of input buffers regardless of which kinds of
            /// convolutions are currently selected for testing (see the "-F" option).
            /// Verification cache would be broken otherwise.
            auto val = generator();
            if(do_write)
                GetVector()[i] = val;
        }
    }

    /// Allocates \p sz elements of Tgpu on the device. The stream parameter is
    /// unused (kept for signature symmetry with AllocOnDeviceAndInit).
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz)
    {
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(Tgpu));
        return STATUS_SUCCESS;
    }

    /// Allocates the device buffer and initializes it: from the host copy in
    /// normal mode, or with rocrand data in gpualloc mode.
    /// NOTE(review): the status returned by AllocOnDevice is discarded; today
    /// it can only be STATUS_SUCCESS, but propagate it if that ever changes.
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz)
    {
        AllocOnDevice(q, ctx, sz);
        if(is_gpualloc)
        {
            /// \anchor gpualloc_random_init
            /// In gpualloc mode, we do not want to leave input buffers uninitialized, because
            /// there could be NaNs and Infs, which may affect the performance (which we are
            /// interested to evaluate in this mode). Initialization with all 0's is not the
            /// best choice as well, because GPU HW may optimize out computations with 0's and
            /// that could affect performance of kernels too. That is why we are using
            /// rocrand to initialize input buffers.
            ///
            /// However we do not care about precision in gpualloc mode, because validation
            /// is not used. Therefore, range (0,1] is fine.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, GetVectorData());
    }

    /// Overload allocating the device buffer with element type T instead of
    /// Tgpu (e.g. float/int32 workspace next to a half tensor).
    template <typename T>
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector<T>&)
    {
        static_assert(std::is_same<T, float>::value //
                          || std::is_same<T, int32_t>::value, //
                      "Before enabling more types, check thoroughly.");
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(T));
        return STATUS_SUCCESS;
    }

    template <typename T>
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector<T>& init)
    {
        AllocOnDevice(q, ctx, sz, init);
        if(is_gpualloc)
        {
            /// \ref gpualloc_random_init
            // NOTE(review): the buffer above was sized with sizeof(T) but is
            // filled here as sz elements of Tgpu; if T != Tgpu the element
            // width mismatches — confirm this path is only hit when the
            // sizes agree. TODO verify.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, init.data());
    }

    /// Device-to-host copies below are no-ops in gpualloc mode (there is no
    /// host buffer to copy into).
    status_t CopyFromDeviceToHost(stream q)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, tensor<T>& t)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, std::vector<T>& v)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data());
    }

    auto GetDevicePtr() -> auto { return dev->GetMem(); }
};

template <typename Tgpu>
class GpumemVector
{
Expand Down
4 changes: 4 additions & 0 deletions driver/dm_bnorm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ static Driver* makeDriver(const std::string& base_arg)
if(base_arg == "bnorm")
return new BatchNormDriver<float, double>();
if(base_arg == "bnormfp16")
return new BatchNormDriver<float16, double, float16>();
if(base_arg == "bnormfp16fp32")
return new BatchNormDriver<float16, double, float>();
if(base_arg == "bnormbfp16fp32")
return new BatchNormDriver<bfloat16, double, float>();
return nullptr;
}

Expand Down
157 changes: 147 additions & 10 deletions driver/driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
#include <miopen/logger.hpp>
#include <miopen/miopen.h>
#include <miopen/bfloat16.hpp>
#include <../test/tensor_holder.hpp>
#include "util_driver.hpp"
#include "rocrand_wrapper.hpp"
using half = half_float::half;
using hip_bfloat16 = bfloat16;
#include <hip_float8.hpp>
Expand Down Expand Up @@ -157,6 +160,140 @@ struct GPUMem
#endif
};

/// Pairs a host-side tensor with its GPU mirror buffer for MIOpenDriver runs
/// and keeps the two in sync. In "--gpualloc 1" mode the host copy is dropped
/// entirely and device buffers are filled via rocrand instead; validation is
/// not possible in that mode, only performance measurement.
template <typename Tgpu>
class GpumemTensor
{
    std::unique_ptr<GPUMem> dev; // device-side buffer (owned)
    tensor<Tgpu> host;           // host-side shadow copy (empty in gpualloc mode)
    bool is_gpualloc = false;    // true => "--gpualloc 1": no host data kept

public:
    void SetGpuallocMode(bool v) { is_gpualloc = v; }
    tensor<Tgpu>& GetTensor() { return host; }

    /// Allocates the host tensor with the shape described by \p t.
    /// In gpualloc mode the freshly created data vector is released at once,
    /// keeping only the descriptor.
    void AllocOnHost(miopenTensorDescriptor_t t)
    {
        host = tensor<Tgpu>(miopen::deref(t));
        if(is_gpualloc) // We do not need host data.
        {
            host.data.clear();
            host.data.shrink_to_fit(); // To free host memory.
        }
    }
    /// Convenience overload: allocate using another tensor's descriptor.
    /// NOTE(review): \p t is taken by value, which deep-copies t.data just to
    /// read its descriptor — consider passing by reference. TODO confirm no
    /// caller relies on the copy.
    template <typename T>
    void AllocOnHost(tensor<T> t)
    {
        AllocOnHost(&t.desc);
    }

    /// Host data accessor; must not be used in gpualloc mode because no host
    /// buffer exists there.
    std::vector<Tgpu>& GetVector()
    {
        if(is_gpualloc)
            MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in "
                         "'--gpualloc 1' mode");
        return host.data;
    }

    // Null/zero in gpualloc mode so callers can pass the results straight to
    // APIs that tolerate empty buffers.
    Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); }
    std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); }

    /// Fills the host buffer with \p sz values drawn from \p generator.
    void
    InitHostData(const size_t sz, //
                 const bool do_write, // If set to false, then only generate random data. This is
                                      // necessary to reproduce values in input buffers even if some
                                      // directions are skipped. For example, inputs for Backward
                                      // will be the same for both "-F 0" and "-F 2".
                 std::function<Tgpu()> generator)
    {
        if(is_gpualloc)
        {
            /// In gpualloc mode, we do not care about reproducibility of results, because
            /// validation is not used. Therefore, we do not have to always generate random value
            /// (\ref move_rand)
            return;
        }

        for(size_t i = 0; i < sz; ++i)
        {
            /// \anchor move_rand
            /// Generate random value, even if buffer is unused. This provides the same
            /// initialization of input buffers regardless of which kinds of
            /// convolutions are currently selected for testing (see the "-F" option).
            /// Verification cache would be broken otherwise.
            auto val = generator();
            if(do_write)
                GetVector()[i] = val;
        }
    }

    /// Allocates \p sz elements of Tgpu on the device. The stream parameter is
    /// unused (kept for signature symmetry with AllocOnDeviceAndInit).
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz)
    {
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(Tgpu));
        return STATUS_SUCCESS;
    }

    /// Allocates the device buffer and initializes it: from the host copy in
    /// normal mode, or with rocrand data in gpualloc mode.
    /// NOTE(review): the status returned by AllocOnDevice is discarded; today
    /// it can only be STATUS_SUCCESS, but propagate it if that ever changes.
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz)
    {
        AllocOnDevice(q, ctx, sz);
        if(is_gpualloc)
        {
            /// \anchor gpualloc_random_init
            /// In gpualloc mode, we do not want to leave input buffers uninitialized, because
            /// there could be NaNs and Infs, which may affect the performance (which we are
            /// interested to evaluate in this mode). Initialization with all 0's is not the
            /// best choice as well, because GPU HW may optimize out computations with 0's and
            /// that could affect performance of kernels too. That is why we are using
            /// rocrand to initialize input buffers.
            ///
            /// However we do not care about precision in gpualloc mode, because validation
            /// is not used. Therefore, range (0,1] is fine.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, GetVectorData());
    }

    /// Overload allocating the device buffer with element type T instead of
    /// Tgpu (e.g. float/int32 workspace next to a half tensor).
    template <typename T>
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector<T>&)
    {
        static_assert(std::is_same<T, float>::value //
                          || std::is_same<T, int32_t>::value, //
                      "Before enabling more types, check thoroughly.");
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(T));
        return STATUS_SUCCESS;
    }

    template <typename T>
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector<T>& init)
    {
        AllocOnDevice(q, ctx, sz, init);
        if(is_gpualloc)
        {
            /// \ref gpualloc_random_init
            // NOTE(review): the buffer above was sized with sizeof(T) but is
            // filled here as sz elements of Tgpu; if T != Tgpu the element
            // width mismatches — confirm this path is only hit when the
            // sizes agree. TODO verify.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, init.data());
    }

    /// Device-to-host copies below are no-ops in gpualloc mode (there is no
    /// host buffer to copy into).
    status_t CopyFromDeviceToHost(stream q)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, tensor<T>& t)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, std::vector<T>& v)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data());
    }

    auto GetDevicePtr() -> auto { return dev->GetMem(); }
};

inline void PadBufferSize(size_t& sz, int datatype_sz)
{
size_t page_sz = (2 * 1024 * 1024) / datatype_sz;
Expand Down Expand Up @@ -193,16 +330,16 @@ inline std::string ParseBaseArg(int argc, char* argv[])
if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" &&
arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" &&
arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" &&
arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" &&
arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" &&
arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" &&
arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" &&
arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" &&
arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" &&
arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" &&
arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" &&
arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" &&
arg != "bnormfp16" && arg != "bnormfp16fp32" && arg != "bnormbfp16fp32" && arg != "rnn" &&
arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" &&
arg != "gemmfp16" && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" &&
arg != "tensorop" && arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" &&
arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" &&
arg != "sumfp16" && arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" &&
arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" &&
arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" &&
arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" &&
arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" &&
arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" &&
arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" &&
arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" &&
Expand Down
16 changes: 15 additions & 1 deletion src/batch_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,28 @@ void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc,
/// Collapses a 5D (NCDHW/NDHWC) tensor descriptor into its equivalent 4D
/// (NCHW/NHWC) form by folding depth into height: NxCxDxHxW -> NxCx(D*H)xW.
/// Used so batch-norm paths that only understand 4D layouts can handle 3D
/// problems.
///
/// @param tDesc 5D descriptor in NCDHW or NDHWC layout.
/// @return 4D descriptor with the same element type and the matching 4D layout.
/// @throws miopen::Exception (miopenStatusBadParm) for any other layout.
TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& tDesc)
{
    auto dataType = tDesc.GetType();
    auto layout   = tDesc.GetLayout_t();

    // Map the 5D layout onto its 4D counterpart. Anything else is a caller
    // error; report it through the library's exception mechanism rather than
    // printing to stdout and terminating the whole process (a library must
    // not call exit()).
    if(layout == miopenTensorNCDHW)
    {
        layout = miopenTensorNCHW;
    }
    else if(layout == miopenTensorNDHWC)
    {
        layout = miopenTensorNHWC;
    }
    else
    {
        MIOPEN_THROW(miopenStatusBadParm,
                     "BuildReshaped4DTensorDescriptor: cannot handle layout");
    }

    std::vector<size_t> dims(tDesc.GetLengths());

    // NxCxDxHxW -> NxCx(D*H)xW
    dims[2] *= dims[3];
    dims[3] = dims[4];
    dims.pop_back();

    return {dataType, layout, dims};
}

void profileSequence(const Handle& handle, unsigned char select, float* ctime)
Expand Down
5 changes: 5 additions & 0 deletions src/driver_arguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc)
{
ss << "bnormfp16";
}
if(desc.GetType() == miopenBFloat16)
{
ss << "bnormbfp16";
}
else
{
ss << "bnorm";
Expand Down Expand Up @@ -248,6 +252,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc,
resultRunningVariance,
resultSaveMean,
resultSaveInvVariance);
ss << " --layout " << miopen::deref(xDesc).GetLayout_str();
}
return ss.str();
}
Expand Down
3 changes: 2 additions & 1 deletion src/solver/batchnorm/backward_ck.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ bool BnCKBwdBackward::IsApplicable(
return false;
if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward)
return false;

if(!bn_problem.Is2D())
return false;
switch(bn_problem.GetXDesc().GetType())
{
case miopenFloat: return CheckCKApplicability<F32, F32, F32, F32, F32, F32, F32>(bn_problem);
Expand Down
2 changes: 2 additions & 0 deletions src/solver/batchnorm/backward_per_activation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ namespace batchnorm {
/// This solver only handles 2D per-activation batch-norm backward problems.
bool BnBwdTrainingPerActivation::IsApplicable(
    const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const
{
    const bool is_backward = problem.GetDirection() == miopen::batchnorm::Direction::Backward;
    const bool is_per_activation = problem.GetMode() == miopenBNPerActivation;
    // Is2D() is checked first, mirroring the other batch-norm solvers.
    return problem.Is2D() && is_backward && is_per_activation;
}
Expand Down
Loading