From 9873c364d52c170f7f0a4e8871b3e04117c4783b Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 4 Sep 2024 14:12:26 +0000 Subject: [PATCH 01/27] Initial attempt to enable NHWC layout for batch norm driver command --- driver/bn_driver.hpp | 1261 ++++++++++++++++------------------- driver/dm_bnorm.cpp | 4 +- driver/gemm_driver.hpp | 2 +- test/fusionHost.hpp | 34 +- test/gtest/bn.hpp | 5 + test/gtest/bn_test_data.hpp | 61 +- 6 files changed, 646 insertions(+), 721 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 4b94ac42d8..9c78bfb869 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -33,12 +33,15 @@ #include "tensor_driver.hpp" #include "timer.hpp" #include "util_driver.hpp" +#include "rocrand_wrapper.hpp" #include "../test/verify.hpp" +#include "../test/fusionHost.hpp" #include #include #include +#include "miopen/batch_norm.hpp" #include #include @@ -60,9 +63,147 @@ #define MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D 1 // Resolves issue #1974 +//======================== + + +template +class GpumemTensor +{ + std::unique_ptr dev; + tensor host; + bool is_gpualloc = false; + +public: + void SetGpuallocMode(bool v) { is_gpualloc = v; } + tensor& GetTensor() { return host; } + + void AllocOnHost(miopenTensorDescriptor_t t) + { + host = tensor(miopen::deref(t)); + if(is_gpualloc) // We do not need host data. + { + host.data.clear(); + host.data.shrink_to_fit(); // To free host memory. + } + } + template + void AllocOnHost(tensor t) + { + AllocOnHost(&t.desc); + } + + std::vector& GetVector() + { + if(is_gpualloc) + MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " + "'--gpualloc 1' mode"); + return host.data; + } + + Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } + std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } + + void + InitHostData(const size_t sz, // + const bool do_write, // If set to false, then only generate random data. 
This is + // necessary to reproduce values in input buffers even if some + // directions are skipped. For example, inputs for Backward + // will be the same for both "-F 0" and "-F 2". + std::function generator) + { + if(is_gpualloc) + { + /// In gpualloc mode, we do not care about reproducibility of results, because + /// validation is not used. Therefore, we do not have to always generate random value + /// (\ref move_rand) + return; + } + + for(size_t i = 0; i < sz; ++i) + { + /// \anchor move_rand + /// Generate random value, even if buffer is unused. This provides the same + /// initialization of input buffers regardless of which kinds of + /// convolutions are currently selectedfor testing (see the "-F" option). + /// Verification cache would be broken otherwise. + auto val = generator(); + if(do_write) + GetVector()[i] = val; + } + } + + status_t AllocOnDevice(stream, context_t ctx, const size_t sz) + { + dev = std::make_unique(ctx, sz, sizeof(Tgpu)); + return STATUS_SUCCESS; + } + + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) + { + AllocOnDevice(q, ctx, sz); + if(is_gpualloc) + { + /// \anchor gpualloc_random_init + /// In gpualloc mode, we do not want to leave input buffers uninitialized, because + /// there could be NaNs and Infs, which may affect the performance (which we are + /// interested to evaluate in this mode). Initialization with all 0's is not the + /// best choice as well, because GPU HW may optimize out computations with 0's and + /// that could affect performance of kernels too. That is why we are using + /// rocrand to initialize input buffers. + /// + /// However we do not care about precision in gpualloc mode, because validation + /// is not used. Therefore, range (0,1] is fine. 
+ return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, GetVectorData()); + } + + template + status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) + { + static_assert(std::is_same::value // + || std::is_same::value, // + "Before enabling more types, check thoroughly."); + dev = std::make_unique(ctx, sz, sizeof(T)); + return STATUS_SUCCESS; + } + + template + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) + { + AllocOnDevice(q, ctx, sz, init); + if(is_gpualloc) + { + /// \ref gpualloc_random_init + return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, init.data()); + } + + status_t CopyFromDeviceToHost(stream q) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); + } + + template + status_t CopyFromDeviceToHost(stream q, tensor& t) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); + } + + template + status_t CopyFromDeviceToHost(stream q, std::vector& v) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data()); + } + + auto GetDevicePtr() -> auto { return dev->GetMem(); } +}; +//======================== + //#define BN_RUNFOR_PROFILER -template +template class BatchNormDriver : public Driver { public: @@ -70,9 +211,9 @@ class BatchNormDriver : public Driver { miopenCreateTensorDescriptor(&inputTensor); miopenCreateTensorDescriptor(&outputTensor); - miopenCreateTensorDescriptor(&biasScaleTensor); - miopenCreateTensorDescriptor(&dxOutputTensor); - miopenCreateTensorDescriptor(&dyInputTensor); + // miopenCreateTensorDescriptor(&biasScaleTensor); + // miopenCreateTensorDescriptor(&dxOutputTensor); + // miopenCreateTensorDescriptor(&dyInputTensor); data_type = (sizeof(Tgpu) == 4) ? 
miopenFloat : miopenHalf; } @@ -100,9 +241,9 @@ class BatchNormDriver : public Driver void runGPUBwd(Tref epsilon, float alpha, float beta); void runCPUFwdInference( - Tref epsilon, int batch_sz, int channels, int height, int width, int depth = 0); + Tref epsilon); void runCPUFwdTrain( - Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth = 0); + Tref epsilon, Tref eAF); int VerifyBackward() override; int VerifyForward() override; @@ -111,13 +252,15 @@ class BatchNormDriver : public Driver { miopenDestroyTensorDescriptor(outputTensor); miopenDestroyTensorDescriptor(inputTensor); - miopenDestroyTensorDescriptor(biasScaleTensor); - miopenDestroyTensorDescriptor(dxOutputTensor); - miopenDestroyTensorDescriptor(dyInputTensor); + // miopenDestroyTensorDescriptor(biasScaleTensor); + // miopenDestroyTensorDescriptor(dxOutputTensor); + // miopenDestroyTensorDescriptor(dyInputTensor); } private: miopenBatchNormMode_t bn_mode; + miopenActivationMode_t activ_mode = miopenActivationRELU; + bool saveMeanVar; bool bsaveMeanVar; bool keepRunningMeanVar; @@ -126,67 +269,84 @@ class BatchNormDriver : public Driver int forw; int back; + bool isFwdInfer = false; + bool isFwdTrain = false; + bool isBwd = false; + InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; - miopenTensorDescriptor_t biasScaleTensor; + miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t outputTensor; + // // forward + // miopenTensorDescriptor_t scaleTensor; + // miopenTensorDescriptor_t biasTensor; + + // // forward inference + // miopenTensorDescriptor_t estMean; + // miopenTensorDescriptor_t estVariance; - // Backwards - miopenTensorDescriptor_t dyInputTensor; - miopenTensorDescriptor_t dxOutputTensor; - - std::unique_ptr dyin_dev; // this is the output of fwd - std::unique_ptr in_dev; - std::unique_ptr out_dev; - std::unique_ptr scale_dev; - std::unique_ptr bias_dev; - - std::unique_ptr dxout_dev; - std::unique_ptr 
dscale_dev; - std::unique_ptr dbias_dev; - - std::unique_ptr runningMean_dev; - std::unique_ptr runningVariance_dev; - std::unique_ptr saveMean_dev; - std::unique_ptr saveInvVariance_dev; - - std::vector dyin; // output of forward - std::vector in; - std::vector out; - std::vector out_host; - std::vector dxout; - std::vector dxout_host; - - std::vector scale; - std::vector scale_host; - std::vector bias; - std::vector bias_host; - - std::vector dscale; - std::vector dscale_host; - std::vector dbias; - std::vector dbias_host; - - std::vector runningMean; - std::vector runningVariance; - std::vector runningMean_host; - std::vector runningVariance_host; - - std::vector saveMean; - std::vector saveInvVariance; - - std::vector saveMean_host; - std::vector saveInvVariance_host; - - int createSaveBuffers(); - int createRunningBuffers(); + // // forward training + // miopenTensorDescriptor_t savedMean; + // miopenTensorDescriptor_t savedVariance; + // miopenTensorDescriptor_t runMean; + // miopenTensorDescriptor_t runVariance; + + // // backward + // miopenTensorDescriptor_t bnScale; + // miopenTensorDescriptor_t dy; + + // miopenTensorDescriptor_t dScale; + // miopenTensorDescriptor_t dBias; + // miopenTensorDescriptor_t savedMean; + // miopenTensorDescriptor_t savedInvVar; + + // -------------- + + + GpumemTensor in; // done + GpumemTensor out; // done + tensor out_ref; + + + // forward + GpumemTensor scale; // done + GpumemTensor bias; // done + + // forward inference + GpumemTensor estMean; + GpumemTensor estVariance; + + // forward training + GpumemTensor savedMean; + tensor savedMean_ref; + + GpumemTensor savedVariance; + tensor savedVariance_ref; + + GpumemTensor runMean; + tensor runMean_ref; + + GpumemTensor runVariance; + tensor runVariance_ref; + + + // backward + GpumemTensor bnScale; + GpumemTensor dy; + + GpumemTensor dScale; + tensor dScale_ref; + GpumemTensor dBias; + tensor dBias_ref; + + GpumemTensor savedInvVar; + Tref maxval; }; -template -int 
BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +template +int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); @@ -198,56 +358,56 @@ int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) return miopenStatusSuccess; } -template -int BatchNormDriver::GetandSetData() +template +int BatchNormDriver::GetandSetData() { SetBNParametersFromCmdLineArgs(); std::vector in_len = GetInputTensorLengthsFromCmdLine(); - - std::vector sb_len; - if(bn_mode == miopenBNPerActivation) + + // change this to supoort NHWC too + in.AllocOnHost(tensor{miopenTensorNCHW, in_len}); + out.AllocOnHost(tensor{miopenTensorNCHW, in_len}); + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, + in.GetTensor().desc, + bn_mode); + if(isFwdInfer || isFwdTrain) { - // 1xCxHxW | in_len.size = 4 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(in_len[2]); - sb_len.push_back(in_len[3]); - - // 1xCxDxHxW | in_len.size = 5 - if(in_len.size() == 5) - { - sb_len.push_back(in_len[4]); - } + scale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); } - else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(1); - sb_len.push_back(1); - - // 1xCx1x1x1 - if(in_len.size() == 5) - { - sb_len.push_back(1); - } + if(isFwdInfer) + { + estMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); } + if(isFwdTrain && saveMeanVar) + { + savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + } + if(isFwdTrain && keepRunningMeanVar) + { + runMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + 
runVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + } + if(isBwd) + { + bnScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + dy.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - SetTensorNd(inputTensor, in_len, data_type); - SetTensorNd(biasScaleTensor, sb_len, ((sizeof(Tmix) == 4) ? miopenFloat : miopenHalf)); - SetTensorNd(outputTensor, in_len, data_type); - - // backwards - SetTensorNd(dyInputTensor, in_len, data_type); - SetTensorNd(dxOutputTensor, in_len, data_type); - + dScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + } return miopenStatusSuccess; } -template -int BatchNormDriver::AddCmdLineArgs() +template +int BatchNormDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", @@ -294,8 +454,8 @@ int BatchNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() +template +std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); @@ -317,8 +477,8 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } } -template -int BatchNormDriver::SetBNParametersFromCmdLineArgs() +template +int BatchNormDriver::SetBNParametersFromCmdLineArgs() { // double bnAlpha = inflags.GetValueDouble("alpha"); @@ -395,242 +555,61 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() forw = 1; } - return miopenStatusSuccess; -} - -template -int BatchNormDriver::createSaveBuffers() -{ - - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t sb_sz = 
GetTensorSize(biasScaleTensor); - - if(saveMeanVar) + if(forw == 1) { - // GPU allocation - saveMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - saveInvVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(back == 1) - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - saveMean[i] = prng::gen_canonical(); - saveMean_host[i] = static_cast(saveMean[i]); - saveInvVariance[i] = prng::gen_canonical(); - saveInvVariance_host[i] = static_cast(saveInvVariance[i]); - } - } - else - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - } - // GPU data transfer - status |= saveMean_dev->ToGPU(q, saveMean.data()); - status |= saveInvVariance_dev->ToGPU(q, saveInvVariance.data()); + isFwdInfer = true; } - else + else if(forw == 2) { - saveMean_dev = nullptr; - saveInvVariance_dev = nullptr; + isFwdTrain = true; + } + else{ + isBwd = true; } - - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); return miopenStatusSuccess; } -template -int BatchNormDriver::createRunningBuffers() + +template +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); #if MIOPEN_BACKEND_OPENCL clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #endif - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(keepRunningMeanVar) + status |= in.AllocOnDeviceAndInit(q, ctx, in.GetTensor().desc.GetElementSpace()); + status |= out.AllocOnDeviceAndInit(q, ctx, 
out.GetTensor().desc.GetElementSpace()); + if(isFwdInfer || isFwdTrain) { - // GPU allocation - runningMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - runningVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(forw == 2) - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - runningMean[i] = prng::gen_canonical(); - runningMean_host[i] = static_cast(runningMean[i]); - runningVariance[i] = prng::gen_canonical(); - runningVariance_host[i] = static_cast(runningVariance[i]); - } - } - else - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, static_cast(0)); - } - - // GPU data transfer - status |= runningMean_dev->ToGPU(q, runningMean.data()); - status |= runningVariance_dev->ToGPU(q, runningVariance.data()); + status |= scale.AllocOnDeviceAndInit(q, ctx, scale.GetTensor().desc.GetElementSpace()); + status |= bias.AllocOnDeviceAndInit(q, ctx, bias.GetTensor().desc.GetElementSpace()); } - else + if(isFwdInfer) { - runningMean_dev = nullptr; - runningVariance_dev = nullptr; + status |= estMean.AllocOnDeviceAndInit(q, ctx, estMean.GetTensor().desc.GetElementSpace()); + status |= estVariance.AllocOnDeviceAndInit(q, ctx, estVariance.GetTensor().desc.GetElementSpace()); } - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); - - return miopenStatusSuccess; -} - -template -int BatchNormDriver::AllocateBuffersAndCopy() -{ - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, 
CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t in_sz = GetTensorSize(inputTensor); - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(forw) + if(isFwdTrain && saveMeanVar) { - - size_t out_sz = GetTensorSize(outputTensor); - - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - bias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - out = std::vector(out_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); - bias = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - out_host = std::vector(out_sz, static_cast(0)); - scale_host = std::vector(sb_sz, static_cast(0)); - bias_host = std::vector(sb_sz, static_cast(0)); - - // Data initialization - for(int i = 0; i < in_sz; i++) - { - in[i] = prng::gen_canonical(); - } - status |= in_dev->ToGPU(q, in.data()); - - // Using random beta and gamma - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - scale_host[i] = static_cast(scale[i]); - bias[i] = prng::gen_canonical(); - bias_host[i] = static_cast(bias[i]); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= bias_dev->ToGPU(q, bias.data()); - status |= out_dev->ToGPU(q, out.data()); - - if(forw == 1) - { // training - status |= createRunningBuffers(); - status |= createSaveBuffers(); - } - else if(forw == 2) - { // inference - status |= createRunningBuffers(); - } - } // end forward - - if(back == 1) + status |= savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedVariance.AllocOnDeviceAndInit(q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); + } + if(isFwdTrain && keepRunningMeanVar) { + status |= runMean.AllocOnDeviceAndInit(q, ctx, runMean.GetTensor().desc.GetElementSpace()); + 
status |= runVariance.AllocOnDeviceAndInit(q, ctx, runVariance.GetTensor().desc.GetElementSpace()); + } + if(isBwd) + { + status |= bnScale.AllocOnDeviceAndInit(q, ctx, bnScale.GetTensor().desc.GetElementSpace()); + status |= dy.AllocOnDeviceAndInit(q, ctx, dy.GetTensor().desc.GetElementSpace()); - size_t out_sz = GetTensorSize(dxOutputTensor); - - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dyin_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dxout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - dscale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - dbias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - dyin = std::vector(in_sz, static_cast(0)); - dxout = std::vector(out_sz, static_cast(0)); - dscale = std::vector(sb_sz, static_cast(0)); - dbias = std::vector(sb_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - dxout_host = std::vector(out_sz, static_cast(0)); - dscale_host = std::vector(sb_sz, static_cast(0)); - dbias_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= dscale_dev->ToGPU(q, dscale.data()); - status |= dbias_dev->ToGPU(q, dbias.data()); - - for(int i = 0; i < in_sz; i++) - { - dyin[i] = prng::gen_canonical(); - in[i] = prng::gen_canonical(); - } - status |= dyin_dev->ToGPU(q, dyin.data()); - status |= in_dev->ToGPU(q, in.data()); - status |= dxout_dev->ToGPU(q, dxout.data()); - - status |= createSaveBuffers(); + status |= dScale.AllocOnDeviceAndInit(q, ctx, dScale.GetTensor().desc.GetElementSpace()); + status |= dBias.AllocOnDeviceAndInit(q, ctx, dBias.GetTensor().desc.GetElementSpace()); + status |= 
savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedInvVar.AllocOnDeviceAndInit(q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); } if(status != STATUS_SUCCESS) @@ -639,8 +618,8 @@ int BatchNormDriver::AllocateBuffersAndCopy() return miopenStatusSuccess; } -template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) +template +void BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) { if(keepRunningMeanVar) @@ -649,15 +628,15 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a bn_mode, &alpha, &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + estMean.GetDevicePtr(), + estVariance.GetDevicePtr(), epsilon); } else @@ -666,13 +645,13 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a bn_mode, &alpha, &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), nullptr, nullptr, epsilon); @@ -681,8 +660,8 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a return; } -template -void BatchNormDriver::runGPUFwdTrain(Tref epsilon, +template +void BatchNormDriver::runGPUFwdTrain(Tref epsilon, Tref eAF, float alpha, float beta) @@ -690,107 +669,107 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, if(saveMeanVar && keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - 
outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - 
outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } -template -int BatchNormDriver::RunForwardGPU() +template +int BatchNormDriver::RunForwardGPU() { float alpha = static_cast(1), beta = static_cast(0); @@ -889,42 +868,33 @@ int BatchNormDriver::RunForwardGPU() return miopenStatusSuccess; } -template -void BatchNormDriver::runCPUFwdInference( - Tref epsilon, int batch_sz, int channels, int height, int width, int depth) +template +void BatchNormDriver::runCPUFwdInference( + Tref epsilon) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdInferPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? 
depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + // handle 3d case + batchNormPerActivHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdInferSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + batchNormSpatialHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor() + ); + } else { @@ -935,60 +905,36 @@ void BatchNormDriver::runCPUFwdInference( return; } -template -void BatchNormDriver::runCPUFwdTrain( - Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth) +template +void BatchNormDriver::runCPUFwdTrain( + Tref epsilon, Tref eAF) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdTrainPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? 
depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + batchNormPerActHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdTrainSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else { @@ -998,22 +944,9 @@ void BatchNormDriver::runCPUFwdTrain( } } -template -int BatchNormDriver::RunForwardCPU() +template +int BatchNormDriver::RunForwardCPU() { - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); Tref eAF = static_cast(1.0); @@ -1024,19 +957,19 @@ int BatchNormDriver::RunForwardCPU() { eAF = static_cast(1.0) / (static_cast(i) + static_cast(1.0)); runCPUFwdTrain( - 
epsilon, eAF, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + epsilon, eAF /* alpha, beta,*/); } } else if(forw == 2) { // inference only - runCPUFwdInference(epsilon, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + runCPUFwdInference(epsilon); } return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardGPU() +template +int BatchNormDriver::RunBackwardGPU() { if(!back) @@ -1059,46 +992,47 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr() + ); } else { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + 
epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); @@ -1152,8 +1086,8 @@ int BatchNormDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyForward() +template +int BatchNormDriver::VerifyForward() { // jump out since we are forcing forward off when doing backwards. @@ -1176,27 +1110,27 @@ int BatchNormDriver::VerifyForward() if(keepRunningMeanVar) { // copy back for verification - runningMean_dev->FromGPU(GetStream(), runningMean.data()); - runningVariance_dev->FromGPU(GetStream(), runningVariance.data()); + runMean.CopyFromDeviceToHost(GetStream()); + runVariance.CopyFromDeviceToHost(GetStream()); - auto errorRunMean = miopen::rms_range(runningMean_host, runningMean); + auto errorRunMean = miopen::rms_range(runMean_ref.data, runMean.GetVector()); if(!std::isfinite(errorRunMean) || errorRunMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on running mean: " << errorRunMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runningMean.size() && i < runningMean_host.size() && + for(int i = 0; i < runMean.GetVector().size() && i < runMean_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i]))); + diff = fabs(Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rm[" << i << "]: " << runningMean[i]; - std::cout << ", rm_host[" << i << "]: " << runningMean_host[i]; + std::cout << "rm[" << i << "]: " << runMean.GetVector()[i]; + std::cout << ", rm_host[" << i << "]: " << runMean_ref.data[i]; std::cout << ", diff[" << i - << "]: " << Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i])) + << "]: " << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) << std::endl; } } @@ -1208,24 +1142,24 @@ int BatchNormDriver::VerifyForward() << errorRunMean << ')' << std::endl; } - auto errorRunVar = 
miopen::rms_range(runningVariance_host, runningVariance); + auto errorRunVar = miopen::rms_range(runVariance_ref.data, runVariance.GetVector()); if(!std::isfinite(errorRunVar) || errorRunVar > maxrms) { std::cout << "Forward train batch norm verification FAILED on running variance: " << errorRunVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runningVariance.size() && i < runningVariance_host.size() && + for(int i = 0; i < runVariance.GetVector().size() && i < runVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i]))); + diff = fabs(Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rv[" << i << "]: " << runningVariance[i]; - std::cout << ", rv_host[" << i << "]: " << runningVariance_host[i]; + std::cout << "rv[" << i << "]: " << runVariance.GetVector()[i]; + std::cout << ", rv_host[" << i << "]: " << runVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i])) + << Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i])) << std::endl; } } @@ -1240,10 +1174,14 @@ int BatchNormDriver::VerifyForward() if(saveMeanVar) { // copy back for verification - saveMean_dev->FromGPU(GetStream(), saveMean.data()); - saveInvVariance_dev->FromGPU(GetStream(), saveInvVariance.data()); + // saveMean_dev->FromGPU(GetStream(), savedMean.data()); + // saveInvVariance_dev->FromGPU(GetStream(), savedInvVar.data()); + + savedMean.CopyFromDeviceToHost(GetStream()); + savedVariance.CopyFromDeviceToHost(GetStream()); + maxval = static_cast(0.0); - auto errorSaveMean = miopen::rms_range(saveMean_host, saveMean); + auto errorSaveMean = miopen::rms_range(savedVariance_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) { std::cout << "Forward train batch norm verification FAILED 
on saved mean: " @@ -1251,17 +1189,17 @@ int BatchNormDriver::VerifyForward() anError = true; #if(MIO_BN_DEBUG == 1) for(int i = 0; - i < saveMean.size() && i < saveMean_host.size() && i < MIO_BN_MAX_DEBUGLOOP; + i < savedMean.GetVector().size() && i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i]))); + diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sm[" << i << "]: " << saveMean[i]; - std::cout << ", sm_host[" << i << "]: " << saveMean_host[i]; + std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; + std::cout << ", sm_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i - << "]: " << Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i])) + << "]: " << Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1274,7 +1212,7 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << ')' << std::endl; } - auto errorSaveVar = miopen::rms_range(saveInvVariance_host, saveInvVariance); + auto errorSaveVar = miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); if(!std::isfinite(errorSaveVar) || errorSaveVar > maxrms) { std::cout @@ -1282,17 +1220,17 @@ int BatchNormDriver::VerifyForward() << errorSaveVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < saveInvVariance.size() && i < saveInvVariance_host.size() && + for(int i = 0; i < savedVariance.GetVector().size() && i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i]))); + diff = fabs(Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sv[" << i << "]: " << saveInvVariance[i]; - std::cout << ", sv_host[" << i << "]: " 
<< saveInvVariance_host[i]; + std::cout << "sv[" << i << "]: " << savedVariance.GetVector()[i]; + std::cout << ", sv_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i])) + << Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1308,37 +1246,39 @@ int BatchNormDriver::VerifyForward() } // Check output tensor error - out_dev->FromGPU(GetStream(), out.data()); + // out_dev->FromGPU(GetStream(), out.data()); + out.CopyFromDeviceToHost(GetStream()); + maxval = static_cast(0.0); - auto errorOut = miopen::rms_range(out_host, out); + auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) unsigned int count = 0; - for(int i = 0; i < out.size() && i < out_host.size(); i++) + for(int i = 0; i < out.GetVector().size() && i < out_ref.data.size(); i++) { - if(std::isnan(out[i])) + if(std::isnan(out.GetVector()[i])) { - std::cout << "out[" << i << "] produced a nan: " << out[i] << std::endl; + std::cout << "out[" << i << "] produced a nan: " << out.GetVector()[i] << std::endl; } - if(std::isnan(out_host[i])) + if(std::isnan(out_ref.data[i])) { - std::cout << "out_host[" << i << "] produced a nan: " << out_host[i] << std::endl; + std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] << std::endl; } - diff = Tref(fabs(out[i]) - fabs(out_host[i])); + diff = Tref(fabs(out.GetVector()[i]) - fabs(out_ref.data[i])); maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "out[" << i << "]: " << out[i]; - std::cout << ", out_host[" << i << "]: " << out_host[i]; - std::cout << ", diff[" << i << "]: " << Tref(out[i] - out_host[i]) << std::endl; + std::cout << "out[" << i << "]: " << out.GetVector()[i]; + std::cout << ", out_ref.data[" << i << "]: " << out_ref.data[i]; + std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) << std::endl; count++; } } - std::cout << "Number of elements: " << out.size() << std::endl; + std::cout << "Number of elements: " << out.GetVector().size() << std::endl; std::cout << "Number of bad elements: " << count << std::endl; std::cout << "max difference in output: " << maxval << std::endl; #endif @@ -1358,68 +1298,46 @@ int BatchNormDriver::VerifyForward() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardCPU() +template +int BatchNormDriver::RunBackwardCPU() { if(!back) return miopenStatusSuccess; - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alphaDiff = 1, betaDiff = 0; // T alphaParam = 1, betaParam = 0; - Tref epsilon = static_cast(EPSILON); + float alpha = static_cast(1), beta = static_cast(0), gamma = static_cast(1); if(bn_mode == miopenBNPerActivation) - { // 1xCxHxW - miopenBNBwdPerActivationRunHost(/* alphaDiff, betaDiff, alphaParam, - betaParam, */ - batch_sz, - channels, - (isDepthSpecified ? 
depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + { + // 1xCxHxW + batchNormActivSpatialHostBwdTrain(activ_mode, + gamma, + beta, + alpha, + in.GetTensor(), + dy.GetTensor(), + out.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); + } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNBwdSpatialRunHost(/* alphaDiff, betaDiff, alphaParam, betaParam, - */ - batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + scale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else { @@ -1431,8 +1349,8 @@ int BatchNormDriver::RunBackwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyBackward() +template +int BatchNormDriver::VerifyBackward() { if(!back) @@ -1442,34 +1360,35 @@ int BatchNormDriver::VerifyBackward() bool anError = false; RunBackwardCPU(); - - dxout_dev->FromGPU(GetStream(), dxout.data()); - dscale_dev->FromGPU(GetStream(), dscale.data()); - dbias_dev->FromGPU(GetStream(), dbias.data()); + + out.CopyFromDeviceToHost(GetStream()); + dScale.CopyFromDeviceToHost(GetStream()); + dBias.CopyFromDeviceToHost(GetStream()); + #if(MIO_BN_DEBUG == 1) const Tref tolerance = static_cast(1000 * (sizeof(Tgpu) == 4) ? 
ERRTOL_FP32 : ERRTOL_FP16); Tref diff = static_cast(0.0); #endif maxval = static_cast(0.0); - auto errordxout = miopen::rms_range(dxout_host, dxout); + auto errordxout = miopen::rms_range(out_ref.data, out.GetVector()); if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dxout.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tgpu(fabs(dxout[i]) - fabs(dxout_host[i]))); + diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dxout[" << i << "]: " << dxout[i]; - std::cout << "\tdxout_host[" << i << "]: " << dxout_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(dxout[i]) - fabs(dxout_host[i])); + std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; + std::cout << "\tout.GetVector()[" << i << "]: " << out.GetVector()[i]; + std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(dxout[i]) - fabs(dxout_host[i])) / fabs(dxout_host[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / fabs(out.GetVector()[i]) << std::endl; } } @@ -1483,25 +1402,25 @@ int BatchNormDriver::VerifyBackward() } maxval = static_cast(0.0); - auto errordscale = miopen::rms_range(dscale_host, dscale); + auto errordscale = miopen::rms_range(dScale_ref.data, dScale.GetVector()); if(!std::isfinite(errordscale) || errordscale > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dscale: " << errordscale << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dscale.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dScale.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = 
fabs(Tmix(fabs(dscale[i]) - fabs(dscale_host[i]))); + diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dscale[" << i << "]: " << dscale[i]; - std::cout << "\tdscale_host[" << i << "]: " << dscale_host[i]; + std::cout << "dscale[" << i << "]: " << dScale.GetVector()[i]; + std::cout << "\tdscale_host[" << i << "]: " << dScale_ref.data[i]; std::cout << "\tdiff[" << i - << "]: " << Tmix(fabs(dscale[i]) - fabs(dscale_host[i])); + << "]: " << Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dscale[i]) - fabs(dscale_host[i])) / fabs(dscale_host[i]) + << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / fabs(dScale_ref.data[i]) << std::endl; } } @@ -1514,23 +1433,23 @@ int BatchNormDriver::VerifyBackward() << ')' << std::endl; } - auto errordbias = miopen::rms_range(dbias_host, dbias); + auto errordbias = miopen::rms_range(dBias_ref.data, dBias.GetVector()); if(!std::isfinite(errordbias) || errordbias > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dbias: " << errordbias << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dbias.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dbias[i]) - fabs(dbias_host[i]))); + diff = fabs(Tmix(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dbias[" << i << "]: " << dbias[i]; - std::cout << "\tdbias_host[" << i << "]: " << dbias_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tmix(fabs(dbias[i]) - fabs(dbias_host[i])); + std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; + std::cout << "\tdbias_host[" << i << "]: " << dBias_ref.data[i]; + std::cout << "\tdiff[" << i << "]: " << Tmix(fabs( dBias.GetVector()[i]) - 
fabs(dBias_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dbias[i]) - fabs(dbias_host[i])) / fabs(dbias_host[i]) + << fabs(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / fabs(dBias_ref.data[i]) << std::endl; } } diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index c7bab90bb5..4333787e5e 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -29,9 +29,9 @@ static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") - return new BatchNormDriver(); + return new BatchNormDriver(); if(base_arg == "bnormfp16") - return new BatchNormDriver(); + return new BatchNormDriver(); return nullptr; } diff --git a/driver/gemm_driver.hpp b/driver/gemm_driver.hpp index 772104544e..282173101d 100644 --- a/driver/gemm_driver.hpp +++ b/driver/gemm_driver.hpp @@ -148,7 +148,7 @@ class GemmDriver : public Driver std::vector c; std::vector chost; - T alpha, beta; + T alpha, beta, gamma; miopen::GemmDescriptor gemm_desc = { false, false, false, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.0f, 0.0f, miopenFloat, false}; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index c0c49b06b6..2bf2e47c4a 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -171,8 +171,8 @@ void batchNormSpatialHostInference(const tensor& input, template void batchNormPerActivHostInference(const tensor& input, tensor& output, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, const tensor& estimatedMean, const tensor& estimatedVariance) @@ -278,15 +278,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, 
+ const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -334,7 +334,7 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) @@ -347,11 +347,11 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double beta, double alpha, const tensor& x_input, - const tensor& dy_input, + const tensor& dy_input, const tensor& y_input, tensor& dx_out, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, tensor& dscale, tensor& dbias, const tensor& savedMean, @@ -432,8 +432,8 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, template void batchNormPerActHostFwdTrain(const tensor& input, tensor& out, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, double expAvgFactor, tensor& saveMean, diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..16d788a70c 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -120,14 +120,19 @@ struct BNBwdTest : public ::testing::TestWithParam Network1() { // pyt_mlperf_resnet50v1.5 return { - {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 2048, 7, 7, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 
1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; + {4, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 28, 28, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0} + }; } template From 5c8a57b12ac6f0b8da6637d2daef619d6c942f59 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 16:27:42 +0000 Subject: [PATCH 02/27] runs infer, forward train and bwd driver command --- driver/bn_driver.hpp | 595 ++++++++++++++++++++------------------- src/driver_arguments.cpp | 1 + src/tensor.cpp | 1 - test/fusionHost.hpp | 22 +- test/na_train.cpp | 28 +- test/na_train_find2.cpp | 28 +- 6 files changed, 345 insertions(+), 330 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 9c78bfb869..c56fb1a4a3 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -65,7 +65,6 @@ //======================== - template class GpumemTensor { @@ -240,10 +239,8 @@ class BatchNormDriver : public Driver void runGPUFwdTrain(Tref epsilon, Tref eAF, float alpha, float beta); void runGPUBwd(Tref epsilon, float alpha, float beta); - void runCPUFwdInference( - Tref epsilon); - void runCPUFwdTrain( - Tref epsilon, Tref eAF); + void runCPUFwdInference(Tref epsilon); + void 
runCPUFwdTrain(Tref epsilon, Tref eAF); int VerifyBackward() override; int VerifyForward() override; @@ -271,47 +268,21 @@ class BatchNormDriver : public Driver bool isFwdInfer = false; bool isFwdTrain = false; - bool isBwd = false; + bool isBwd = false; InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; + miopenTensorDescriptor_t inputTensor; miopenTensorDescriptor_t outputTensor; - // // forward - // miopenTensorDescriptor_t scaleTensor; - // miopenTensorDescriptor_t biasTensor; - - // // forward inference - // miopenTensorDescriptor_t estMean; - // miopenTensorDescriptor_t estVariance; - - // // forward training - // miopenTensorDescriptor_t savedMean; - // miopenTensorDescriptor_t savedVariance; - // miopenTensorDescriptor_t runMean; - // miopenTensorDescriptor_t runVariance; - - // // backward - // miopenTensorDescriptor_t bnScale; - // miopenTensorDescriptor_t dy; - - // miopenTensorDescriptor_t dScale; - // miopenTensorDescriptor_t dBias; - // miopenTensorDescriptor_t savedMean; - // miopenTensorDescriptor_t savedInvVar; - // -------------- - - - GpumemTensor in; // done - GpumemTensor out; // done + GpumemTensor in; + GpumemTensor out; tensor out_ref; - // forward - GpumemTensor scale; // done - GpumemTensor bias; // done + GpumemTensor scale; + GpumemTensor bias; // forward inference GpumemTensor estMean; @@ -320,29 +291,26 @@ class BatchNormDriver : public Driver // forward training GpumemTensor savedMean; tensor savedMean_ref; - GpumemTensor savedVariance; tensor savedVariance_ref; - GpumemTensor runMean; tensor runMean_ref; - GpumemTensor runVariance; tensor runVariance_ref; - // backward GpumemTensor bnScale; - GpumemTensor dy; + GpumemTensor dy; GpumemTensor dScale; tensor dScale_ref; GpumemTensor dBias; tensor dBias_ref; - GpumemTensor savedInvVar; - + Tref maxval; + + miopenTensorLayout_t bn_layout; }; template @@ -365,43 +333,42 @@ int BatchNormDriver::GetandSetData() SetBNParametersFromCmdLineArgs(); 
std::vector in_len = GetInputTensorLengthsFromCmdLine(); - - // change this to supoort NHWC too - in.AllocOnHost(tensor{miopenTensorNCHW, in_len}); - out.AllocOnHost(tensor{miopenTensorNCHW, in_len}); + + in.AllocOnHost(tensor{bn_layout, in_len}); + out.AllocOnHost(tensor{bn_layout, in_len}); auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - in.GetTensor().desc, - bn_mode); + miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); if(isFwdInfer || isFwdTrain) { - scale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - bias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } if(isFwdInfer) { - estMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - estVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } - if(isFwdTrain && saveMeanVar) + else if(isFwdTrain) { - savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - savedVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } - if(isFwdTrain && keepRunningMeanVar) + else if(isBwd) { - runMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - runVariance.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dy.AllocOnHost(tensor{bn_layout, in_len}); + + 
dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } - if(isBwd) + else { - bnScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - - dScale.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - dBias.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - savedMean.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); - savedInvVar.AllocOnHost(tensor{miopenTensorNCHW, derivedBnDesc.GetLengths()}); + std::cout << "\nUnknown batch norm state!\n"; + exit(EXIT_FAILURE); } return miopenStatusSuccess; } @@ -425,6 +392,14 @@ int BatchNormDriver::AddCmdLineArgs() inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); + + inflags.AddInputFlag("layout", + 'L', + "NCHW", + "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", + "string", + true); + inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float"); inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int"); @@ -484,6 +459,22 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() // double bnAlpha = inflags.GetValueDouble("alpha"); // double bnBeta = inflags.GetValueDouble("beta"); + std::string layout = inflags.GetValueStr("layout"); + + if(layout == "NCHW") + { + bn_layout = miopenTensorNCHW; + } + else if(layout == "NHWC") + { + bn_layout = miopenTensorNHWC; + } + else + { + std::cout << "Cannot handle layout : " << layout << "\n"; + exit(EXIT_FAILURE); // NOLINT 
(concurrency-mt-unsafe) + } + // batch norm mode type if(inflags.GetValueInt("mode") == 0) { @@ -557,20 +548,20 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() if(forw == 1) { - isFwdInfer = true; + isFwdTrain = true; } else if(forw == 2) { - isFwdTrain = true; + isFwdInfer = true; } - else{ + else + { isBwd = true; } return miopenStatusSuccess; } - template int BatchNormDriver::AllocateBuffersAndCopy() { @@ -581,6 +572,7 @@ int BatchNormDriver::AllocateBuffersAndCopy() #endif status |= in.AllocOnDeviceAndInit(q, ctx, in.GetTensor().desc.GetElementSpace()); status |= out.AllocOnDeviceAndInit(q, ctx, out.GetTensor().desc.GetElementSpace()); + out_ref = out.GetTensor(); if(isFwdInfer || isFwdTrain) { status |= scale.AllocOnDeviceAndInit(q, ctx, scale.GetTensor().desc.GetElementSpace()); @@ -589,17 +581,23 @@ int BatchNormDriver::AllocateBuffersAndCopy() if(isFwdInfer) { status |= estMean.AllocOnDeviceAndInit(q, ctx, estMean.GetTensor().desc.GetElementSpace()); - status |= estVariance.AllocOnDeviceAndInit(q, ctx, estVariance.GetTensor().desc.GetElementSpace()); + status |= estVariance.AllocOnDeviceAndInit( + q, ctx, estVariance.GetTensor().desc.GetElementSpace()); } - if(isFwdTrain && saveMeanVar) - { - status |= savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); - status |= savedVariance.AllocOnDeviceAndInit(q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); - } - if(isFwdTrain && keepRunningMeanVar) + if(isFwdTrain) { + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedVariance.AllocOnDeviceAndInit( + q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); status |= runMean.AllocOnDeviceAndInit(q, ctx, runMean.GetTensor().desc.GetElementSpace()); - status |= runVariance.AllocOnDeviceAndInit(q, ctx, runVariance.GetTensor().desc.GetElementSpace()); + status |= runVariance.AllocOnDeviceAndInit( + q, ctx, 
runVariance.GetTensor().desc.GetElementSpace()); + + savedMean_ref = savedMean.GetTensor(); + savedVariance_ref = savedVariance.GetTensor(); + runMean_ref = runMean.GetTensor(); + runVariance_ref = runVariance.GetTensor(); } if(isBwd) { @@ -608,8 +606,13 @@ int BatchNormDriver::AllocateBuffersAndCopy() status |= dScale.AllocOnDeviceAndInit(q, ctx, dScale.GetTensor().desc.GetElementSpace()); status |= dBias.AllocOnDeviceAndInit(q, ctx, dBias.GetTensor().desc.GetElementSpace()); - status |= savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); - status |= savedInvVar.AllocOnDeviceAndInit(q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + status |= savedInvVar.AllocOnDeviceAndInit( + q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); + + dScale_ref = dScale.GetTensor(); + dBias_ref = dBias.GetTensor(); } if(status != STATUS_SUCCESS) @@ -619,7 +622,9 @@ int BatchNormDriver::AllocateBuffersAndCopy() } template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) +void BatchNormDriver::runGPUFwdInference(Tref epsilon, + float alpha, + float beta) { if(keepRunningMeanVar) @@ -662,109 +667,109 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, fl template void BatchNormDriver::runGPUFwdTrain(Tref epsilon, - Tref eAF, - float alpha, - float beta) + Tref eAF, + float alpha, + float beta) { if(saveMeanVar && keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + 
&out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + 
&out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } @@ -869,20 +874,19 @@ int BatchNormDriver::RunForwardGPU() } template -void BatchNormDriver::runCPUFwdInference( - Tref epsilon) +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW // handle 3d case batchNormPerActivHostInference(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - epsilon, - estMean.GetTensor(), - estVariance.GetTensor()); + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 @@ -892,9 +896,7 @@ void BatchNormDriver::runCPUFwdInference( bias.GetTensor(), epsilon, estMean.GetTensor(), - estVariance.GetTensor() - ); - + estVariance.GetTensor()); } else { @@ -906,35 +908,34 @@ void BatchNormDriver::runCPUFwdInference( } template -void BatchNormDriver::runCPUFwdTrain( - Tref epsilon, Tref eAF) +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { if(bn_mode == miopenBNPerActivation) { // 1xCxHxW batchNormPerActHostFwdTrain(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - static_cast(epsilon), - static_cast(eAF), - savedMean_ref, - savedVariance_ref, - runMean_ref, - 
runVariance_ref); + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 batchNormSpatialHostFwdTrain(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - static_cast(epsilon), - static_cast(eAF), - savedMean_ref, - savedVariance_ref, - runMean_ref, - runVariance_ref); + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else { @@ -956,8 +957,7 @@ int BatchNormDriver::RunForwardCPU() for(int i = 0; i < inflags.GetValueInt("iter"); i++) { eAF = static_cast(1.0) / (static_cast(i) + static_cast(1.0)); - runCPUFwdTrain( - epsilon, eAF /* alpha, beta,*/); + runCPUFwdTrain(epsilon, eAF /* alpha, beta,*/); } } else if(forw == 2) @@ -992,47 +992,46 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - &in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedInvVar.GetDevicePtr() - ); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr()); } else { miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - 
&in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &bnScale.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); @@ -1129,8 +1128,8 @@ int BatchNormDriver::VerifyForward() { std::cout << "rm[" << i << "]: " << runMean.GetVector()[i]; std::cout << ", rm_host[" << i << "]: " << runMean_ref.data[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) << std::endl; } } @@ -1149,17 +1148,19 @@ int BatchNormDriver::VerifyForward() << errorRunVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runVariance.GetVector().size() && i < runVariance_ref.data.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < runVariance.GetVector().size() && + i < runVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); + diff = fabs( + Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "rv[" << i << "]: " << runVariance.GetVector()[i]; std::cout << ", rv_host[" << i << "]: " << runVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i])) + << Tmix(fabs(runVariance.GetVector()[i]) - + 
fabs(runVariance_ref.data[i])) << std::endl; } } @@ -1176,7 +1177,7 @@ int BatchNormDriver::VerifyForward() { // copy back for verification // saveMean_dev->FromGPU(GetStream(), savedMean.data()); // saveInvVariance_dev->FromGPU(GetStream(), savedInvVar.data()); - + savedMean.CopyFromDeviceToHost(GetStream()); savedVariance.CopyFromDeviceToHost(GetStream()); @@ -1188,18 +1189,20 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; - i < savedMean.GetVector().size() && i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedMean.GetVector().size() && + i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); + diff = fabs( + Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; std::cout << ", sm_host[" << i << "]: " << savedVariance_ref.data[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i])) + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(savedMean.GetVector()[i]) - + fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1212,7 +1215,8 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << ')' << std::endl; } - auto errorSaveVar = miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); + auto errorSaveVar = + miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); if(!std::isfinite(errorSaveVar) || errorSaveVar > maxrms) { std::cout @@ -1220,17 +1224,19 @@ int BatchNormDriver::VerifyForward() << errorSaveVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < savedVariance.GetVector().size() && i < savedVariance_ref.data.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i 
= 0; i < savedVariance.GetVector().size() && + i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); + diff = fabs( + Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sv[" << i << "]: " << savedVariance.GetVector()[i]; std::cout << ", sv_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i])) + << Tmix(fabs(savedVariance.GetVector()[i]) - + fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1265,7 +1271,8 @@ int BatchNormDriver::VerifyForward() } if(std::isnan(out_ref.data[i])) { - std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] << std::endl; + std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] + << std::endl; } diff = Tref(fabs(out.GetVector()[i]) - fabs(out_ref.data[i])); maxval = maxval < diff ? 
diff : maxval; @@ -1273,7 +1280,8 @@ int BatchNormDriver::VerifyForward() { std::cout << "out[" << i << "]: " << out.GetVector()[i]; std::cout << ", out_ref.data[" << i << "]: " << out_ref.data[i]; - std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) << std::endl; + std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) + << std::endl; count++; } } @@ -1307,37 +1315,37 @@ int BatchNormDriver::RunBackwardCPU() // T alphaDiff = 1, betaDiff = 0; // T alphaParam = 1, betaParam = 0; - float alpha = static_cast(1), beta = static_cast(0), gamma = static_cast(1); + double alpha = static_cast(1), beta = static_cast(0), + gamma = static_cast(1); if(bn_mode == miopenBNPerActivation) - { + { // 1xCxHxW batchNormActivSpatialHostBwdTrain(activ_mode, - gamma, - beta, - alpha, - in.GetTensor(), - dy.GetTensor(), - out.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); - + gamma, + beta, + alpha, + in.GetTensor(), + out.GetTensor(), + out_ref, + bnScale.GetTensor(), + dy.GetTensor(), + dBias.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - batchNormSpatialHostBwdTrain(in.GetTensor(), - dy.GetTensor(), - out_ref, - scale.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); + { // 1xCx1x1 + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else { @@ -1360,11 +1368,11 @@ int BatchNormDriver::VerifyBackward() bool anError = false; RunBackwardCPU(); - + out.CopyFromDeviceToHost(GetStream()); dScale.CopyFromDeviceToHost(GetStream()); dBias.CopyFromDeviceToHost(GetStream()); - + #if(MIO_BN_DEBUG == 1) const Tref tolerance = static_cast(1000 * (sizeof(Tgpu) == 4) ? 
ERRTOL_FP32 : ERRTOL_FP16); @@ -1386,9 +1394,11 @@ int BatchNormDriver::VerifyBackward() { std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; std::cout << "\tout.GetVector()[" << i << "]: " << out.GetVector()[i]; - std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); + std::cout << "\tdiff[" << i + << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / fabs(out.GetVector()[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / + fabs(out.GetVector()[i]) << std::endl; } } @@ -1420,7 +1430,8 @@ int BatchNormDriver::VerifyBackward() std::cout << "\tdiff[" << i << "]: " << Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / fabs(dScale_ref.data[i]) + << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / + fabs(dScale_ref.data[i]) << std::endl; } } @@ -1440,16 +1451,18 @@ int BatchNormDriver::VerifyBackward() << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); + diff = fabs(Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; + std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; std::cout << "\tdbias_host[" << i << "]: " << dBias_ref.data[i]; - std::cout << "\tdiff[" << i << "]: " << Tmix(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); + std::cout << "\tdiff[" << i + << "]: " << Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs( dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / 
fabs(dBias_ref.data[i]) + << fabs(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / + fabs(dBias_ref.data[i]) << std::endl; } } diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 82b4fb156f..97fe16d7c4 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -248,6 +248,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, resultRunningVariance, resultSaveMean, resultSaveInvVariance); + ss << " --layout " << miopen::deref(xDesc).GetLayout_str(); } return ss.str(); } diff --git a/src/tensor.cpp b/src/tensor.cpp index 7ec4c4e581..f0fbd86a7b 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -35,7 +35,6 @@ #include #include #include - namespace miopen { namespace { diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 2bf2e47c4a..0bd8df42e8 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -278,11 +278,11 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template +template void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& dy_input, tensor& dx_out, - const tensor& scale, + const tensor& bnScale, tensor& dscale, tensor& dbias, const tensor& savedMean, @@ -333,7 +333,7 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) @@ -347,11 +347,11 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double beta, double alpha, const tensor& x_input, - const tensor& dy_input, const tensor& y_input, tensor& dx_out, - const tensor& scale, - const tensor& bias, + const tensor& bnScale, + const tensor& dy_input, + const tensor& bias, tensor& dscale, tensor& dbias, const tensor& savedMean, 
@@ -387,7 +387,8 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, elemStd = static_cast(x_input(bidx, cidx, row, column)) - mean; // (x_i - mean) xhat[xhat_index] = elemStd * invVar; - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -408,8 +409,9 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, { // via columns for(int bidx = 0; bidx < n_batch; bidx++) { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -421,7 +423,7 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) diff --git a/test/na_train.cpp b/test/na_train.cpp index e776f4414e..f2c1c9af1b 100644 --- a/test/na_train.cpp +++ b/test/na_train.cpp @@ -302,20 +302,20 @@ struct verify_bwd_batchnorm_spatial_activ std::fill(dgamma.begin(), dgamma.end(), 0.); std::fill(dbeta.begin(), dbeta.end(), 0.); - batchNormActivSpatialHostBwdTrain(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - x, - dy, - y, - dx, - bnscale, - bnbias, - dgamma, - dbeta, - savedMean, - savedInvVar); + // batchNormActivSpatialHostBwdTrain(activ_mode, + // activ_gamma, + // activ_beta, + 
// activ_alpha, + // x, + // y, + // dx, + // bnscale, + // dy, + // bnbias, + // dgamma, + // dbeta, + // savedMean, + // savedInvVar); return std::make_tuple(dx, dgamma, dbeta); } diff --git a/test/na_train_find2.cpp b/test/na_train_find2.cpp index 51868ae77b..edb79874b0 100644 --- a/test/na_train_find2.cpp +++ b/test/na_train_find2.cpp @@ -380,20 +380,20 @@ struct verify_bwd_batchnorm_spatial_activ std::fill(dgamma.begin(), dgamma.end(), 0.); std::fill(dbeta.begin(), dbeta.end(), 0.); - batchNormActivSpatialHostBwdTrain(activ_mode, - activ_gamma, - activ_beta, - activ_alpha, - x, - dy, - y, - dx, - bnscale, - bnbias, - dgamma, - dbeta, - savedMean, - savedInvVar); + // batchNormActivSpatialHostBwdTrain(activ_mode, + // activ_gamma, + // activ_beta, + // activ_alpha, + // x, + // y, + // dx, + // bnscale, + // dy, + // bnbias, + // dgamma, + // dbeta, + // savedMean, + // savedInvVar); return std::make_tuple(dx, dgamma, dbeta); } From d4ae0e1c46b2f5eef4e468546a9396ec3ea73c86 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 18:09:02 +0000 Subject: [PATCH 03/27] fix build error --- test/fusionHost.hpp | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 0bd8df42e8..4e61d89edb 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -168,14 +168,14 @@ void batchNormSpatialHostInference(const tensor& input, }); } -template +template void batchNormPerActivHostInference(const tensor& input, tensor& output, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance) + const tensor& estimatedMean, + const tensor& estimatedVariance) { int n_batches, channels, height, width; std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); @@ -278,15 +278,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, 
}); } -template +template void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, + const tensor& dy_input, + tensor& dx_out, const tensor& bnScale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -431,17 +431,17 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, }); // for (channel) } -template +template void batchNormPerActHostFwdTrain(const tensor& input, tensor& out, - const tensor& scale, - const tensor& bias, + const tensor& scale, + const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; From db2a42f34bf810f031c0624b81c9ece5a30c943f Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 18:21:27 +0000 Subject: [PATCH 04/27] undo template change --- driver/bn_driver.hpp | 128 +++++++++++++++++++++---------------------- driver/dm_bnorm.cpp | 4 +- 2 files changed, 65 insertions(+), 67 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index c56fb1a4a3..72db700a3f 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -202,7 +202,7 @@ class GpumemTensor //#define BN_RUNFOR_PROFILER -template +template class BatchNormDriver : public Driver { public: @@ -285,36 +285,36 @@ class BatchNormDriver : public Driver GpumemTensor bias; // forward inference - GpumemTensor estMean; - GpumemTensor estVariance; + GpumemTensor estMean; + GpumemTensor estVariance; // forward training - GpumemTensor savedMean; - tensor savedMean_ref; - GpumemTensor savedVariance; - tensor savedVariance_ref; - GpumemTensor runMean; - tensor runMean_ref; - GpumemTensor runVariance; - tensor runVariance_ref; + 
GpumemTensor savedMean; + tensor savedMean_ref; + GpumemTensor savedVariance; + tensor savedVariance_ref; + GpumemTensor runMean; + tensor runMean_ref; + GpumemTensor runVariance; + tensor runVariance_ref; // backward GpumemTensor bnScale; - GpumemTensor dy; - GpumemTensor dScale; - tensor dScale_ref; - GpumemTensor dBias; - tensor dBias_ref; - GpumemTensor savedInvVar; + GpumemTensor dy; + GpumemTensor dScale; + tensor dScale_ref; + GpumemTensor dBias; + tensor dBias_ref; + GpumemTensor savedInvVar; Tref maxval; miopenTensorLayout_t bn_layout; }; -template -int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) +template +int BatchNormDriver::ParseCmdLineArgs(int argc, char* argv[]) { inflags.Parse(argc, argv); @@ -326,8 +326,8 @@ int BatchNormDriver::ParseCmdLineArgs(int argc, char* arg return miopenStatusSuccess; } -template -int BatchNormDriver::GetandSetData() +template +int BatchNormDriver::GetandSetData() { SetBNParametersFromCmdLineArgs(); @@ -345,25 +345,25 @@ int BatchNormDriver::GetandSetData() } if(isFwdInfer) { - estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isFwdTrain) { - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isBwd) { bnScale.AllocOnHost(tensor{bn_layout, 
derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{bn_layout, in_len}); + dy.AllocOnHost(tensor{bn_layout, in_len}); - dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else { @@ -373,8 +373,8 @@ int BatchNormDriver::GetandSetData() return miopenStatusSuccess; } -template -int BatchNormDriver::AddCmdLineArgs() +template +int BatchNormDriver::AddCmdLineArgs() { inflags.AddInputFlag( "forw", @@ -429,8 +429,8 @@ int BatchNormDriver::AddCmdLineArgs() return miopenStatusSuccess; } -template -std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() +template +std::vector BatchNormDriver::GetInputTensorLengthsFromCmdLine() { int in_n = inflags.GetValueInt("batchsize"); int in_c = inflags.GetValueInt("in_channels"); @@ -452,8 +452,8 @@ std::vector BatchNormDriver::GetInputTensorLengthsFr } } -template -int BatchNormDriver::SetBNParametersFromCmdLineArgs() +template +int BatchNormDriver::SetBNParametersFromCmdLineArgs() { // double bnAlpha = inflags.GetValueDouble("alpha"); @@ -562,8 +562,8 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() return miopenStatusSuccess; } -template -int BatchNormDriver::AllocateBuffersAndCopy() +template +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); @@ -621,10 +621,8 @@ int BatchNormDriver::AllocateBuffersAndCopy() return miopenStatusSuccess; } -template -void BatchNormDriver::runGPUFwdInference(Tref epsilon, - float alpha, - float beta) +template +void 
BatchNormDriver::runGPUFwdInference(Tref epsilon, float alpha, float beta) { if(keepRunningMeanVar) @@ -665,11 +663,11 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, return; } -template -void BatchNormDriver::runGPUFwdTrain(Tref epsilon, - Tref eAF, - float alpha, - float beta) +template +void BatchNormDriver::runGPUFwdTrain(Tref epsilon, + Tref eAF, + float alpha, + float beta) { if(saveMeanVar && keepRunningMeanVar) { @@ -773,8 +771,8 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, #endif } -template -int BatchNormDriver::RunForwardGPU() +template +int BatchNormDriver::RunForwardGPU() { float alpha = static_cast(1), beta = static_cast(0); @@ -873,8 +871,8 @@ int BatchNormDriver::RunForwardGPU() return miopenStatusSuccess; } -template -void BatchNormDriver::runCPUFwdInference(Tref epsilon) +template +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { if(bn_mode == miopenBNPerActivation) @@ -907,8 +905,8 @@ void BatchNormDriver::runCPUFwdInference(Tref epsilon) return; } -template -void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) +template +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { if(bn_mode == miopenBNPerActivation) @@ -945,8 +943,8 @@ void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref e } } -template -int BatchNormDriver::RunForwardCPU() +template +int BatchNormDriver::RunForwardCPU() { // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); @@ -968,8 +966,8 @@ int BatchNormDriver::RunForwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardGPU() +template +int BatchNormDriver::RunBackwardGPU() { if(!back) @@ -1085,8 +1083,8 @@ int BatchNormDriver::RunBackwardGPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyForward() +template +int BatchNormDriver::VerifyForward() { // jump out since we are forcing forward off when doing backwards. 
@@ -1306,8 +1304,8 @@ int BatchNormDriver::VerifyForward() return miopenStatusSuccess; } -template -int BatchNormDriver::RunBackwardCPU() +template +int BatchNormDriver::RunBackwardCPU() { if(!back) @@ -1357,8 +1355,8 @@ int BatchNormDriver::RunBackwardCPU() return miopenStatusSuccess; } -template -int BatchNormDriver::VerifyBackward() +template +int BatchNormDriver::VerifyBackward() { if(!back) diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index 4333787e5e..c7bab90bb5 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -29,9 +29,9 @@ static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") - return new BatchNormDriver(); + return new BatchNormDriver(); if(base_arg == "bnormfp16") - return new BatchNormDriver(); + return new BatchNormDriver(); return nullptr; } From f843c8c3101119688be9e639b49219fb66cd8eb4 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 18:57:51 +0000 Subject: [PATCH 05/27] fixed batchNormActivSpatialHostBwdTrain issue --- src/tensor.cpp | 1 + test/fusionHost.hpp | 52 ++++++++++++++++++++++++----------------- test/gtest/bn.hpp | 5 ---- test/na_train.cpp | 28 +++++++++++----------- test/na_train_find2.cpp | 28 +++++++++++----------- 5 files changed, 59 insertions(+), 55 deletions(-) diff --git a/src/tensor.cpp b/src/tensor.cpp index f0fbd86a7b..7ec4c4e581 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -35,6 +35,7 @@ #include #include #include + namespace miopen { namespace { diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 4e61d89edb..3751dde95b 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -278,15 +278,19 @@ void batchNormSpatialHostFwdTrain(const tensor& input, }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& bnScale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& 
x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -334,28 +338,32 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) }); // for (channel) } -template +template void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, double beta, double alpha, - const tensor& x_input, - const tensor& y_input, - tensor& dx_out, - const tensor& bnScale, - const tensor& dy_input, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) + const tensor& x_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& dy_input, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -424,7 +432,7 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 16d788a70c..f5227217e4 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -120,19 +120,14 @@ struct BNBwdTest : public ::testing::TestWithParam 
Date: Thu, 5 Sep 2024 19:15:11 +0000 Subject: [PATCH 06/27] undo minor changes --- driver/bn_driver.hpp | 4 ++-- driver/gemm_driver.hpp | 2 +- test/fusionHost.hpp | 4 ++-- test/na_train.cpp | 4 ++-- test/na_train_find2.cpp | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 72db700a3f..e7cf441502 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -1324,10 +1324,10 @@ int BatchNormDriver::RunBackwardCPU() beta, alpha, in.GetTensor(), - out.GetTensor(), + dy.GetTensor(), out_ref, bnScale.GetTensor(), - dy.GetTensor(), + out.GetTensor(), dBias.GetTensor(), dScale_ref, dBias_ref, diff --git a/driver/gemm_driver.hpp b/driver/gemm_driver.hpp index 282173101d..772104544e 100644 --- a/driver/gemm_driver.hpp +++ b/driver/gemm_driver.hpp @@ -148,7 +148,7 @@ class GemmDriver : public Driver std::vector c; std::vector chost; - T alpha, beta, gamma; + T alpha, beta; miopen::GemmDescriptor gemm_desc = { false, false, false, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1.0f, 0.0f, miopenFloat, false}; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 3751dde95b..713dc4b567 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -355,10 +355,10 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double beta, double alpha, const tensor& x_input, - const tensor& y_input, + const tensor& dy_input, tensor& dx_out, const tensor& bnScale, - const tensor& dy_input, + const tensor& y_input, const tensor& bias, tensor& dscale, tensor& dbias, diff --git a/test/na_train.cpp b/test/na_train.cpp index 3541245f80..b63a0dce57 100644 --- a/test/na_train.cpp +++ b/test/na_train.cpp @@ -307,10 +307,10 @@ struct verify_bwd_batchnorm_spatial_activ activ_beta, activ_alpha, x, - y, + dy, dx, bnscale, - dy, + y, bnbias, dgamma, dbeta, diff --git a/test/na_train_find2.cpp b/test/na_train_find2.cpp index fd123b324f..bf837905d4 100644 --- a/test/na_train_find2.cpp +++ b/test/na_train_find2.cpp @@ 
-385,10 +385,10 @@ struct verify_bwd_batchnorm_spatial_activ activ_beta, activ_alpha, x, - y, + dy, dx, bnscale, - dy, + y, bnbias, dgamma, dbeta, From bd36353007d645824058aed3ce9a9be8e3b7c12d Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 5 Sep 2024 19:27:33 +0000 Subject: [PATCH 07/27] revert few changes --- driver/bn_driver.hpp | 2 +- test/fusionHost.hpp | 2 +- test/gtest/bn_test_data.hpp | 61 ++++++++++++++++++------------------- test/na_train.cpp | 2 +- test/na_train_find2.cpp | 2 +- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index e7cf441502..5e77c3ba43 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -1325,9 +1325,9 @@ int BatchNormDriver::RunBackwardCPU() alpha, in.GetTensor(), dy.GetTensor(), + out.GetTensor(), out_ref, bnScale.GetTensor(), - out.GetTensor(), dBias.GetTensor(), dScale_ref, dBias_ref, diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 713dc4b567..a2cdad46b7 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -356,9 +356,9 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double alpha, const tensor& x_input, const tensor& dy_input, + const tensor& y_input, tensor& dx_out, const tensor& bnScale, - const tensor& y_input, const tensor& bias, tensor& dscale, tensor& dbias, diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 2eafaf4665..fcf237400b 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -63,37 +63,36 @@ inline std::vector Network1() { // pyt_mlperf_resnet50v1.5 return { - {4, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 2048, 7, 7, 
miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - 
// {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - // {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0} - }; + {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 28, 28, 
miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } template diff --git a/test/na_train.cpp b/test/na_train.cpp index b63a0dce57..e776f4414e 100644 --- a/test/na_train.cpp +++ b/test/na_train.cpp @@ -308,9 +308,9 @@ struct verify_bwd_batchnorm_spatial_activ activ_alpha, x, dy, + y, dx, bnscale, - y, bnbias, dgamma, dbeta, diff --git a/test/na_train_find2.cpp b/test/na_train_find2.cpp index bf837905d4..51868ae77b 100644 --- a/test/na_train_find2.cpp +++ b/test/na_train_find2.cpp @@ -386,9 +386,9 @@ struct verify_bwd_batchnorm_spatial_activ activ_alpha, x, dy, + y, dx, bnscale, - y, bnbias, dgamma, dbeta, From 53a83eaa9a6af7598e4bffbb5d0944d21ef3cc82 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 6 Sep 2024 01:39:53 +0000 Subject: [PATCH 08/27] fix run time error --- driver/bn_driver.hpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 5e77c3ba43..037950ef74 100644 --- a/driver/bn_driver.hpp 
+++ b/driver/bn_driver.hpp @@ -345,25 +345,25 @@ int BatchNormDriver::GetandSetData() } if(isFwdInfer) { - estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isFwdTrain) { - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else if(isBwd) { bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dy.AllocOnHost(tensor{bn_layout, in_len}); + dy.AllocOnHost(tensor{bn_layout, in_len}); - dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); } else { From eb36fcbcf3cf8bc04818dc748f910ea4b9eeb0df Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 6 Sep 2024 01:55:38 +0000 Subject: [PATCH 09/27] moved GpumemTensor to driver.hpp --- driver/bn_driver.hpp | 137 
----------------------------------------- driver/conv_driver.hpp | 129 -------------------------------------- driver/driver.hpp | 137 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+), 266 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 037950ef74..7179471ddc 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -63,143 +63,6 @@ #define MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D 1 // Resolves issue #1974 -//======================== - -template -class GpumemTensor -{ - std::unique_ptr dev; - tensor host; - bool is_gpualloc = false; - -public: - void SetGpuallocMode(bool v) { is_gpualloc = v; } - tensor& GetTensor() { return host; } - - void AllocOnHost(miopenTensorDescriptor_t t) - { - host = tensor(miopen::deref(t)); - if(is_gpualloc) // We do not need host data. - { - host.data.clear(); - host.data.shrink_to_fit(); // To free host memory. - } - } - template - void AllocOnHost(tensor t) - { - AllocOnHost(&t.desc); - } - - std::vector& GetVector() - { - if(is_gpualloc) - MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " - "'--gpualloc 1' mode"); - return host.data; - } - - Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } - std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } - - void - InitHostData(const size_t sz, // - const bool do_write, // If set to false, then only generate random data. This is - // necessary to reproduce values in input buffers even if some - // directions are skipped. For example, inputs for Backward - // will be the same for both "-F 0" and "-F 2". - std::function generator) - { - if(is_gpualloc) - { - /// In gpualloc mode, we do not care about reproducibility of results, because - /// validation is not used. 
Therefore, we do not have to always generate random value - /// (\ref move_rand) - return; - } - - for(size_t i = 0; i < sz; ++i) - { - /// \anchor move_rand - /// Generate random value, even if buffer is unused. This provides the same - /// initialization of input buffers regardless of which kinds of - /// convolutions are currently selectedfor testing (see the "-F" option). - /// Verification cache would be broken otherwise. - auto val = generator(); - if(do_write) - GetVector()[i] = val; - } - } - - status_t AllocOnDevice(stream, context_t ctx, const size_t sz) - { - dev = std::make_unique(ctx, sz, sizeof(Tgpu)); - return STATUS_SUCCESS; - } - - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) - { - AllocOnDevice(q, ctx, sz); - if(is_gpualloc) - { - /// \anchor gpualloc_random_init - /// In gpualloc mode, we do not want to leave input buffers uninitialized, because - /// there could be NaNs and Infs, which may affect the performance (which we are - /// interested to evaluate in this mode). Initialization with all 0's is not the - /// best choice as well, because GPU HW may optimize out computations with 0's and - /// that could affect performance of kernels too. That is why we are using - /// rocrand to initialize input buffers. - /// - /// However we do not care about precision in gpualloc mode, because validation - /// is not used. Therefore, range (0,1] is fine. 
- return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, GetVectorData()); - } - - template - status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) - { - static_assert(std::is_same::value // - || std::is_same::value, // - "Before enabling more types, check thoroughly."); - dev = std::make_unique(ctx, sz, sizeof(T)); - return STATUS_SUCCESS; - } - - template - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) - { - AllocOnDevice(q, ctx, sz, init); - if(is_gpualloc) - { - /// \ref gpualloc_random_init - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, init.data()); - } - - status_t CopyFromDeviceToHost(stream q) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); - } - - template - status_t CopyFromDeviceToHost(stream q, tensor& t) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); - } - - template - status_t CopyFromDeviceToHost(stream q, std::vector& v) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data()); - } - - auto GetDevicePtr() -> auto { return dev->GetMem(); } -}; -//======================== - //#define BN_RUNFOR_PROFILER template diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 8f9e836345..48658164de 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -180,135 +180,6 @@ static inline miopenDataType_t DataTypeFromShortString(const std::string& type) } } -template -class GpumemTensor -{ - std::unique_ptr dev; - tensor host; - bool is_gpualloc = false; - -public: - void SetGpuallocMode(bool v) { is_gpualloc = v; } - tensor& GetTensor() { return host; } - - void AllocOnHost(miopenTensorDescriptor_t t) - { - host = tensor(miopen::deref(t)); - if(is_gpualloc) // We do not need host data. - { - host.data.clear(); - host.data.shrink_to_fit(); // To free host memory. 
- } - } - - std::vector& GetVector() - { - if(is_gpualloc) - MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " - "'--gpualloc 1' mode"); - return host.data; - } - - Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } - std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } - - void - InitHostData(const size_t sz, // - const bool do_write, // If set to false, then only generate random data. This is - // necessary to reproduce values in input buffers even if some - // directions are skipped. For example, inputs for Backward - // will be the same for both "-F 0" and "-F 2". - std::function generator) - { - if(is_gpualloc) - { - /// In gpualloc mode, we do not care about reproducibility of results, because - /// validation is not used. Therefore, we do not have to always generate random value - /// (\ref move_rand) - return; - } - - for(size_t i = 0; i < sz; ++i) - { - /// \anchor move_rand - /// Generate random value, even if buffer is unused. This provides the same - /// initialization of input buffers regardless of which kinds of - /// convolutions are currently selectedfor testing (see the "-F" option). - /// Verification cache would be broken otherwise. - auto val = generator(); - if(do_write) - GetVector()[i] = val; - } - } - - status_t AllocOnDevice(stream, context_t ctx, const size_t sz) - { - dev = std::make_unique(ctx, sz, sizeof(Tgpu)); - return STATUS_SUCCESS; - } - - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) - { - AllocOnDevice(q, ctx, sz); - if(is_gpualloc) - { - /// \anchor gpualloc_random_init - /// In gpualloc mode, we do not want to leave input buffers uninitialized, because - /// there could be NaNs and Infs, which may affect the performance (which we are - /// interested to evaluate in this mode). 
Initialization with all 0's is not the - /// best choice as well, because GPU HW may optimize out computations with 0's and - /// that could affect performance of kernels too. That is why we are using - /// rocrand to initialize input buffers. - /// - /// However we do not care about precision in gpualloc mode, because validation - /// is not used. Therefore, range (0,1] is fine. - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, GetVectorData()); - } - - template - status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) - { - static_assert(std::is_same::value // - || std::is_same::value, // - "Before enabling more types, check thoroughly."); - dev = std::make_unique(ctx, sz, sizeof(T)); - return STATUS_SUCCESS; - } - - template - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) - { - AllocOnDevice(q, ctx, sz, init); - if(is_gpualloc) - { - /// \ref gpualloc_random_init - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, init.data()); - } - - status_t CopyFromDeviceToHost(stream q) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); - } - - template - status_t CopyFromDeviceToHost(stream q, tensor& t) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); - } - - template - status_t CopyFromDeviceToHost(stream q, std::vector& v) - { - return is_gpualloc ? 
STATUS_SUCCESS : dev->FromGPU(q, v.data()); - } - - auto GetDevicePtr() -> auto { return dev->GetMem(); } -}; - template class GpumemVector { diff --git a/driver/driver.hpp b/driver/driver.hpp index d0c708ff1d..c9decb2185 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -38,6 +38,9 @@ #include #include #include +#include <../test/tensor_holder.hpp> +#include "util_driver.hpp" +#include "rocrand_wrapper.hpp" using half = half_float::half; using hip_bfloat16 = bfloat16; #include @@ -157,6 +160,140 @@ struct GPUMem #endif }; +template +class GpumemTensor +{ + std::unique_ptr dev; + tensor host; + bool is_gpualloc = false; + +public: + void SetGpuallocMode(bool v) { is_gpualloc = v; } + tensor& GetTensor() { return host; } + + void AllocOnHost(miopenTensorDescriptor_t t) + { + host = tensor(miopen::deref(t)); + if(is_gpualloc) // We do not need host data. + { + host.data.clear(); + host.data.shrink_to_fit(); // To free host memory. + } + } + template + void AllocOnHost(tensor t) + { + AllocOnHost(&t.desc); + } + + std::vector& GetVector() + { + if(is_gpualloc) + MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " + "'--gpualloc 1' mode"); + return host.data; + } + + Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } + std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } + + void + InitHostData(const size_t sz, // + const bool do_write, // If set to false, then only generate random data. This is + // necessary to reproduce values in input buffers even if some + // directions are skipped. For example, inputs for Backward + // will be the same for both "-F 0" and "-F 2". + std::function generator) + { + if(is_gpualloc) + { + /// In gpualloc mode, we do not care about reproducibility of results, because + /// validation is not used. 
Therefore, we do not have to always generate random value + /// (\ref move_rand) + return; + } + + for(size_t i = 0; i < sz; ++i) + { + /// \anchor move_rand + /// Generate random value, even if buffer is unused. This provides the same + /// initialization of input buffers regardless of which kinds of + /// convolutions are currently selectedfor testing (see the "-F" option). + /// Verification cache would be broken otherwise. + auto val = generator(); + if(do_write) + GetVector()[i] = val; + } + } + + status_t AllocOnDevice(stream, context_t ctx, const size_t sz) + { + dev = std::make_unique(ctx, sz, sizeof(Tgpu)); + return STATUS_SUCCESS; + } + + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) + { + AllocOnDevice(q, ctx, sz); + if(is_gpualloc) + { + /// \anchor gpualloc_random_init + /// In gpualloc mode, we do not want to leave input buffers uninitialized, because + /// there could be NaNs and Infs, which may affect the performance (which we are + /// interested to evaluate in this mode). Initialization with all 0's is not the + /// best choice as well, because GPU HW may optimize out computations with 0's and + /// that could affect performance of kernels too. That is why we are using + /// rocrand to initialize input buffers. + /// + /// However we do not care about precision in gpualloc mode, because validation + /// is not used. Therefore, range (0,1] is fine. 
+ return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, GetVectorData()); + } + + template + status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) + { + static_assert(std::is_same::value // + || std::is_same::value, // + "Before enabling more types, check thoroughly."); + dev = std::make_unique(ctx, sz, sizeof(T)); + return STATUS_SUCCESS; + } + + template + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) + { + AllocOnDevice(q, ctx, sz, init); + if(is_gpualloc) + { + /// \ref gpualloc_random_init + return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, init.data()); + } + + status_t CopyFromDeviceToHost(stream q) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); + } + + template + status_t CopyFromDeviceToHost(stream q, tensor& t) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); + } + + template + status_t CopyFromDeviceToHost(stream q, std::vector& v) + { + return is_gpualloc ? 
STATUS_SUCCESS : dev->FromGPU(q, v.data()); + } + + auto GetDevicePtr() -> auto { return dev->GetMem(); } +}; + inline void PadBufferSize(size_t& sz, int datatype_sz) { size_t page_sz = (2 * 1024 * 1024) / datatype_sz; From 7496f3b744e0712df81bb4943f8cd141ac93d535 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Fri, 6 Sep 2024 19:32:29 +0000 Subject: [PATCH 10/27] fix correctness --- driver/bn_driver.hpp | 53 +++++++++++++++++++++++++++++++++++++++----- test/fusionHost.hpp | 2 +- test/gtest/bn.hpp | 1 - 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 7179471ddc..359deffaa8 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -36,6 +36,7 @@ #include "rocrand_wrapper.hpp" #include "../test/verify.hpp" +#include "../test/random.hpp" #include "../test/fusionHost.hpp" #include @@ -162,7 +163,7 @@ class BatchNormDriver : public Driver tensor runVariance_ref; // backward - GpumemTensor bnScale; + GpumemTensor bnScale; GpumemTensor dy; GpumemTensor dScale; @@ -197,19 +198,38 @@ int BatchNormDriver::GetandSetData() std::vector in_len = GetInputTensorLengthsFromCmdLine(); + auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; + in.AllocOnHost(tensor{bn_layout, in_len}); + in.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); + out.AllocOnHost(tensor{bn_layout, in_len}); + // out.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); + auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); + if(isFwdInfer || isFwdTrain) { scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_value_scale_bias = [](auto...) 
{ + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + + scale.InitHostData(scale.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); + bias.InitHostData(bias.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); } if(isFwdInfer) { estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_value_emean = [](auto...) { + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); } else if(isFwdTrain) { @@ -217,16 +237,36 @@ int BatchNormDriver::GetandSetData() savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_var = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + runMean.InitHostData(runMean.GetTensor().desc.GetElementSize(), true, gen_var); + runVariance.InitHostData(runVariance.GetTensor().desc.GetElementSize(), true, gen_var); } else if(isBwd) { - bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); + auto gen_value_bwd = [](auto...) 
{ + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + + auto gen_inv_var = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_inv_var); } else { @@ -1112,8 +1152,6 @@ int BatchNormDriver::VerifyForward() } // end if(saveMeanVar) } - // Check output tensor error - // out_dev->FromGPU(GetStream(), out.data()); out.CopyFromDeviceToHost(GetStream()); maxval = static_cast(0.0); @@ -1179,6 +1217,9 @@ int BatchNormDriver::RunBackwardCPU() double alpha = static_cast(1), beta = static_cast(0), gamma = static_cast(1); + // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); + // float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + if(bn_mode == miopenBNPerActivation) { // 1xCxHxW @@ -1282,8 +1323,8 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < dScale.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); - maxval = maxval < diff ? diff : maxval; + auto diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); + maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "dscale[" << i << "]: " << dScale.GetVector()[i]; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index a2cdad46b7..a65832b9de 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -162,6 +162,7 @@ void batchNormSpatialHostInference(const tensor& input, output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // std::cout << output(bidx, cidx, row, column) << ","; } } } @@ -292,7 +293,6 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& savedMean, const tensor& savedInvVar) { - int height, width, n_batch, channels; std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); auto nhw = double(height * width * n_batch); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..5699bea4c0 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,7 +165,6 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); - test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); From 497c47dd87694b0a5ca73be1b6c5dfe430e42398 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Sat, 7 Sep 2024 01:56:12 +0000 Subject: [PATCH 11/27] passing bin/MIOpenDriver bnorm --- driver/bn_driver.hpp | 14 +++++++------- src/ocl/batchnormocl.cpp | 8 ++++---- test/gtest/bn.hpp | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 359deffaa8..35972ce38e 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -1083,27 +1083,26 @@ int BatchNormDriver::VerifyForward() savedVariance.CopyFromDeviceToHost(GetStream()); maxval = static_cast(0.0); - auto errorSaveMean 
= miopen::rms_range(savedVariance_ref.data, savedMean.GetVector()); + auto errorSaveMean = miopen::rms_range(savedMean_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on saved mean: " << errorSaveMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < savedMean.GetVector().size() && - i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedMean.GetVector().size() && i < savedMean_ref.data.size() && + i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs( - Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); + diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedMean_ref.data[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; - std::cout << ", sm_host[" << i << "]: " << savedVariance_ref.data[i]; + std::cout << ", sm_host[" << i << "]: " << savedMean_ref.data[i]; std::cout << ", diff[" << i << "]: " << Tmix(fabs(savedMean.GetVector()[i]) - - fabs(savedVariance_ref.data[i])) + fabs(savedMean_ref.data[i])) << std::endl; } } @@ -1156,6 +1155,7 @@ int BatchNormDriver::VerifyForward() maxval = static_cast(0.0); auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); + if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 40bcd34935..205bae8bc4 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -313,10 +313,10 @@ void BatchNormBackward(Handle& handle, { MIOPEN_THROW(miopenStatusBadParm); } - if(dxDesc.GetType() != dyDesc.GetType()) - { - MIOPEN_THROW(miopenStatusBadParm); - } + // if(dxDesc.GetType() != dyDesc.GetType()) + // { + // MIOPEN_THROW(miopenStatusBadParm); + // } if(xDesc.GetNumDims() < 3) 
{ MIOPEN_THROW(miopenStatusBadParm); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 5699bea4c0..f5227217e4 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,6 +165,7 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); From 22a3384d858a1ea98f9c83db3a61d90d6b072a35 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 24 Sep 2024 21:59:01 +0000 Subject: [PATCH 12/27] now working driver command for float, fp16 and bfp16 for batch norm --- driver/bn_driver.hpp | 151 ++++++++++++++++++++------------- driver/dm_bnorm.cpp | 6 ++ driver/driver.hpp | 20 ++--- src/batch_norm_api.cpp | 1 + src/driver_arguments.cpp | 4 + test/fusionHost.hpp | 69 ++++++++------- test/gtest/bn.hpp | 1 - test/gtest/bn_bwd.cpp | 48 +++++------ test/gtest/test_operations.hpp | 26 ++++++ 9 files changed, 197 insertions(+), 129 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 35972ce38e..a922b97acf 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -142,7 +142,7 @@ class BatchNormDriver : public Driver GpumemTensor in; GpumemTensor out; - tensor out_ref; + tensor out_ref; // forward GpumemTensor scale; @@ -152,25 +152,30 @@ class BatchNormDriver : public Driver GpumemTensor estMean; GpumemTensor estVariance; - // forward training GpumemTensor savedMean; - tensor savedMean_ref; + tensor savedMean_ref; + + // forward training GpumemTensor savedVariance; - tensor savedVariance_ref; GpumemTensor runMean; - tensor runMean_ref; GpumemTensor runVariance; - tensor runVariance_ref; + // ref + tensor savedVariance_ref; + tensor runMean_ref; + tensor runVariance_ref; // backward - GpumemTensor bnScale; + GpumemTensor out_bwd; - GpumemTensor dy; + GpumemTensor bnScale; GpumemTensor dScale; - 
tensor dScale_ref; GpumemTensor dBias; - tensor dBias_ref; + // savedMean declared above as Tmix as well GpumemTensor savedInvVar; + GpumemTensor dy; + + tensor dBias_ref; + tensor dScale_ref; Tref maxval; @@ -203,14 +208,12 @@ int BatchNormDriver::GetandSetData() in.AllocOnHost(tensor{bn_layout, in_len}); in.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); - out.AllocOnHost(tensor{bn_layout, in_len}); - // out.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); - auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); if(isFwdInfer || isFwdTrain) { + out.AllocOnHost(tensor{bn_layout, in_len}); scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); @@ -246,27 +249,33 @@ int BatchNormDriver::GetandSetData() } else if(isBwd) { - bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + out_bwd.AllocOnHost(tensor{bn_layout, in_len}); + + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); - auto gen_value_bwd = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); + auto gen_var_bwd = [](auto...) 
{ + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); }; - - dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_value_bwd); - bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_value_bwd); + bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - auto gen_inv_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_inv_var); + if(saveMeanVar && keepRunningMeanVar) + { + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + + auto gen_in_var = [](auto...) 
{ + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + savedInvVar.InitHostData( + savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); + } } else { @@ -473,49 +482,66 @@ int BatchNormDriver::AllocateBuffersAndCopy() #if MIOPEN_BACKEND_OPENCL clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #endif - status |= in.AllocOnDeviceAndInit(q, ctx, in.GetTensor().desc.GetElementSpace()); - status |= out.AllocOnDeviceAndInit(q, ctx, out.GetTensor().desc.GetElementSpace()); - out_ref = out.GetTensor(); + status |= in.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&in.GetTensor().desc)); + if(isFwdInfer || isFwdTrain) { - status |= scale.AllocOnDeviceAndInit(q, ctx, scale.GetTensor().desc.GetElementSpace()); - status |= bias.AllocOnDeviceAndInit(q, ctx, bias.GetTensor().desc.GetElementSpace()); + status |= out.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out.GetTensor().desc)); + out_ref = + tensor{out.GetTensor().desc.GetLayout_t(), out.GetTensor().desc.GetLengths()}; + status |= scale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&scale.GetTensor().desc)); + status |= bias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bias.GetTensor().desc)); } if(isFwdInfer) { - status |= estMean.AllocOnDeviceAndInit(q, ctx, estMean.GetTensor().desc.GetElementSpace()); - status |= estVariance.AllocOnDeviceAndInit( - q, ctx, estVariance.GetTensor().desc.GetElementSpace()); + status |= estMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estMean.GetTensor().desc)); + status |= + estVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estVariance.GetTensor().desc)); } if(isFwdTrain) { status |= - savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); + savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); status |= savedVariance.AllocOnDeviceAndInit( - q, ctx, savedVariance.GetTensor().desc.GetElementSpace()); - status |= runMean.AllocOnDeviceAndInit(q, ctx, 
runMean.GetTensor().desc.GetElementSpace()); - status |= runVariance.AllocOnDeviceAndInit( - q, ctx, runVariance.GetTensor().desc.GetElementSpace()); + q, ctx, GetTensorSize(&savedVariance.GetTensor().desc)); + status |= runMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runMean.GetTensor().desc)); + status |= + runVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runVariance.GetTensor().desc)); - savedMean_ref = savedMean.GetTensor(); - savedVariance_ref = savedVariance.GetTensor(); - runMean_ref = runMean.GetTensor(); - runVariance_ref = runVariance.GetTensor(); + savedMean_ref = tensor{savedMean.GetTensor().desc.GetLayout_t(), + savedMean.GetTensor().desc.GetLengths()}; + + savedVariance_ref = tensor{savedVariance.GetTensor().desc.GetLayout_t(), + savedVariance.GetTensor().desc.GetLengths()}; + + runMean_ref = tensor{runMean.GetTensor().desc.GetLayout_t(), + runMean.GetTensor().desc.GetLengths()}; + + runVariance_ref = tensor{runVariance.GetTensor().desc.GetLayout_t(), + runVariance.GetTensor().desc.GetLengths()}; } if(isBwd) { - status |= bnScale.AllocOnDeviceAndInit(q, ctx, bnScale.GetTensor().desc.GetElementSpace()); - status |= dy.AllocOnDeviceAndInit(q, ctx, dy.GetTensor().desc.GetElementSpace()); + status |= out_bwd.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out_bwd.GetTensor().desc)); + + out_ref = tensor{out_bwd.GetTensor().desc.GetLayout_t(), + out_bwd.GetTensor().desc.GetLengths()}; - status |= dScale.AllocOnDeviceAndInit(q, ctx, dScale.GetTensor().desc.GetElementSpace()); - status |= dBias.AllocOnDeviceAndInit(q, ctx, dBias.GetTensor().desc.GetElementSpace()); + status |= bnScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bnScale.GetTensor().desc)); + status |= dy.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dy.GetTensor().desc)); + + status |= dScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dScale.GetTensor().desc)); + status |= dBias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dBias.GetTensor().desc)); + status |= + 
savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); status |= - savedMean.AllocOnDeviceAndInit(q, ctx, savedMean.GetTensor().desc.GetElementSpace()); - status |= savedInvVar.AllocOnDeviceAndInit( - q, ctx, savedInvVar.GetTensor().desc.GetElementSpace()); + savedInvVar.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedInvVar.GetTensor().desc)); - dScale_ref = dScale.GetTensor(); - dBias_ref = dBias.GetTensor(); + dScale_ref = tensor{dScale.GetTensor().desc.GetLayout_t(), + dScale.GetTensor().desc.GetLengths()}; + + dBias_ref = + tensor{dBias.GetTensor().desc.GetLayout_t(), dBias.GetTensor().desc.GetLengths()}; } if(status != STATUS_SUCCESS) @@ -902,8 +928,8 @@ int BatchNormDriver::RunBackwardGPU() in.GetDevicePtr(), &dy.GetTensor().desc, dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), &bnScale.GetTensor().desc, bnScale.GetDevicePtr(), dScale.GetDevicePtr(), @@ -924,8 +950,8 @@ int BatchNormDriver::RunBackwardGPU() in.GetDevicePtr(), &dy.GetTensor().desc, dy.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), &bnScale.GetTensor().desc, bnScale.GetDevicePtr(), dScale.GetDevicePtr(), @@ -1014,6 +1040,7 @@ int BatchNormDriver::VerifyForward() runVariance.CopyFromDeviceToHost(GetStream()); auto errorRunMean = miopen::rms_range(runMean_ref.data, runMean.GetVector()); + if(!std::isfinite(errorRunMean) || errorRunMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on running mean: " @@ -1240,6 +1267,7 @@ int BatchNormDriver::RunBackwardCPU() } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 + batchNormSpatialHostBwdTrain(in.GetTensor(), dy.GetTensor(), out_ref, @@ -1271,7 +1299,7 @@ int BatchNormDriver::VerifyBackward() RunBackwardCPU(); - out.CopyFromDeviceToHost(GetStream()); + out_bwd.CopyFromDeviceToHost(GetStream()); dScale.CopyFromDeviceToHost(GetStream()); 
dBias.CopyFromDeviceToHost(GetStream()); @@ -1281,7 +1309,8 @@ int BatchNormDriver::VerifyBackward() Tref diff = static_cast(0.0); #endif maxval = static_cast(0.0); - auto errordxout = miopen::rms_range(out_ref.data, out.GetVector()); + auto errordxout = miopen::rms_range(out_ref.data, out_bwd.GetVector()); + if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout @@ -1290,17 +1319,17 @@ int BatchNormDriver::VerifyBackward() #if(MIO_BN_DEBUG == 1) for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i]))); + diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; - std::cout << "\tout.GetVector()[" << i << "]: " << out.GetVector()[i]; + std::cout << "\tout_bwd.GetVector()[" << i << "]: " << out_bwd.GetVector()[i]; std::cout << "\tdiff[" << i - << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])); + << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(out_ref.data[i]) - fabs(out.GetVector()[i])) / - fabs(out.GetVector()[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])) / + fabs(out_bwd.GetVector()[i]) << std::endl; } } diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index c7bab90bb5..23340adc94 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -26,12 +26,18 @@ #include "bn_driver.hpp" #include "registry_driver_maker.hpp" +// template + static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") return new BatchNormDriver(); if(base_arg == "bnormfp16") + return new BatchNormDriver(); + if(base_arg == "bnormfp16fp32") return new BatchNormDriver(); + if(base_arg == "bnormbfp16fp32") + return new BatchNormDriver(); 
return nullptr; } diff --git a/driver/driver.hpp b/driver/driver.hpp index c9decb2185..64400cc405 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -330,16 +330,16 @@ inline std::string ParseBaseArg(int argc, char* argv[]) if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && - arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && - arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" && - arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" && - arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" && - arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" && - arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && - arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" && - arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" && - arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" && - arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && + arg != "bnormfp16" && arg != "bnormfp16fp32" && arg != "bnormbfp16fp32" && arg != "rnn" && + arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" && + arg != "gemmfp16" && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" && + arg != "tensorop" && arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && + arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && + arg != "sumfp16" && arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && + arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && + arg != "addlayernorm" && arg != "addlayernormfp16" && arg != 
"addlayernormbfp16" && + arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" && + arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 8f184a9508..3a2de02d01 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -251,6 +251,7 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, savedMean, savedInvVariance, miopen::debug::BatchNormDirection_t::Backward); + // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 97fe16d7c4..57034e5378 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -66,6 +66,10 @@ void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) { ss << "bnormfp16"; } + if(desc.GetType() == miopenBFloat16) + { + ss << "bnormbfp16"; + } else { ss << "bnorm"; diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index a65832b9de..ec271ef967 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -134,9 +134,9 @@ void convHostForward(const tensor& input, } } -template +template void batchNormSpatialHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const tensor& bias, double epsilon, @@ -169,9 +169,9 @@ void batchNormSpatialHostInference(const tensor& input, }); } -template +template void batchNormPerActivHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const tensor& bias, double epsilon, @@ -203,17 +203,17 @@ void batchNormPerActivHostInference(const tensor& input, }); } -template +template void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& 
out, + tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -281,15 +281,15 @@ void batchNormSpatialHostFwdTrain(const tensor& input, template + typename AccDataType, + typename RefDataType> void batchNormSpatialHostBwdTrain(const tensor& x_input, const tensor& dy_input, - tensor& dx_out, + tensor& dx_out, const tensor& bnScale, - tensor& dscale, - tensor& dbias, + tensor& dscale, + tensor& dbias, const tensor& savedMean, const tensor& savedInvVar) { @@ -338,7 +338,8 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) @@ -349,7 +350,9 @@ template + typename AccDataType, + typename OutRefDataType, + typename RefDataType> void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, double beta, @@ -357,11 +360,11 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, const tensor& x_input, const tensor& dy_input, const tensor& y_input, - tensor& dx_out, + tensor& dx_out, const tensor& bnScale, const tensor& bias, - tensor& dscale, - tensor& dbias, + tensor& dscale, + tensor& dbias, const tensor& savedMean, const tensor& savedInvVar) { @@ -439,17 +442,17 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, }); // for (channel) } -template +template void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, + tensor& out, const tensor& scale, const tensor& bias, 
double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -493,7 +496,7 @@ void batchNormPerActHostFwdTrain(const tensor& input, elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) inhat = elemStd * elemInvVar; // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( + out(bidx, cidx, row, column) = static_cast( scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); } // end for(n_batch) @@ -506,21 +509,21 @@ void batchNormPerActHostFwdTrain(const tensor& input, runVar(0, cidx, row, column) = (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); } // for (column) } // for (row) }); } -template +template void batchNormPerActHostBwdTrain(const tensor& x_input, const tensor& dy_input, const tensor& scale, - tensor& dscale, - tensor& dbias, - tensor& dx_out, + tensor& dscale, + tensor& dbias, + tensor& dx_out, const tensor& savedMean, const tensor& savedInvVar) { diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..5699bea4c0 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,7 +165,6 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); - test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f2d54e8077..f14b008233 100644 --- a/test/gtest/bn_bwd.cpp +++ 
b/test/gtest/bn_bwd.cpp @@ -31,41 +31,41 @@ struct GPU_BNBwd_FP16 { }; -struct GPU_BNBwd_FP32 : BNBwdTest -{ -}; +// struct GPU_BNBwd_FP32 : BNBwdTest +// { +// }; -struct GPU_BNBwd_BFP16 : BNBwdTest -{ -}; +// struct GPU_BNBwd_BFP16 : BNBwdTest +// { +// }; -struct GPU_BNBwd_FP64 : BNBwdTest -{ -}; +// struct GPU_BNBwd_FP64 : BNBwdTest +// { +// }; TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} -TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +// TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} -TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +// TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} +// TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBwd_FP16, testing::Combine(testing::ValuesIn(Network1()), testing::Values(miopenTensorNHWC))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BNBwd_FP32, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BNBwd_BFP16, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BNBwd_FP64, +// testing::Combine(testing::ValuesIn(Network1()), +// testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 298ac55e3e..2abcb05fe7 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -48,6 +48,32 @@ template void ComputeCPUBNBwd(DLModule& dl_module) { + std::cout << "\n====start dy====\n"; + for(int i = 0; i < 
dl_module.dy.data.size(); ++i) + { + std::cout << dl_module.dy[i] << ","; + } + std::cout << "\n"; + std::cout << "\n====start bnScale====\n"; + for(int i = 0; i < dl_module.bnScale.data.size(); ++i) + { + std::cout << dl_module.bnScale[i] << ","; + } + std::cout << "\n"; + std::cout << "\n====start savedMean====\n"; + for(int i = 0; i < dl_module.savedMean.data.size(); ++i) + { + std::cout << dl_module.savedMean[i] << ","; + } + std::cout << "\n"; + std::cout << "\n====start savedInvVar====\n"; + for(int i = 0; i < dl_module.savedInvVar.data.size(); ++i) + { + std::cout << dl_module.savedInvVar[i] << ","; + } + std::cout << "\n"; + + // todo : need to do based on bn_mode batchNormSpatialHostBwdTrain(dl_module.input, dl_module.dy, dl_module.ref_out, From 0c6957396b7dd446342b930c2e1e061c97f293dc Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 25 Sep 2024 13:49:40 +0000 Subject: [PATCH 13/27] cleanups --- src/batch_norm_api.cpp | 1 - src/ocl/batchnormocl.cpp | 8 +++--- test/gtest/bn.hpp | 1 + test/gtest/bn_bwd.cpp | 48 +++++++++++++++++----------------- test/gtest/test_operations.hpp | 26 ------------------ 5 files changed, 29 insertions(+), 55 deletions(-) diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 3a2de02d01..8f184a9508 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -251,7 +251,6 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, savedMean, savedInvVariance, miopen::debug::BatchNormDirection_t::Backward); - // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 205bae8bc4..40bcd34935 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -313,10 +313,10 @@ void BatchNormBackward(Handle& handle, { MIOPEN_THROW(miopenStatusBadParm); } - // if(dxDesc.GetType() != dyDesc.GetType()) - // { - // MIOPEN_THROW(miopenStatusBadParm); - // } + if(dxDesc.GetType() != dyDesc.GetType()) + { + 
MIOPEN_THROW(miopenStatusBadParm); + } if(xDesc.GetNumDims() < 3) { MIOPEN_THROW(miopenStatusBadParm); diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 5699bea4c0..f5227217e4 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -165,6 +165,7 @@ struct BNBwdTest : public ::testing::TestWithParam(bn_bwd_test_data); + test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, 5e-4); test::CompareTensor(bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, 5e-4); test::CompareTensor(bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, 5e-4); diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f14b008233..f2d54e8077 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -31,41 +31,41 @@ struct GPU_BNBwd_FP16 { }; -// struct GPU_BNBwd_FP32 : BNBwdTest -// { -// }; +struct GPU_BNBwd_FP32 : BNBwdTest +{ +}; -// struct GPU_BNBwd_BFP16 : BNBwdTest -// { -// }; +struct GPU_BNBwd_BFP16 : BNBwdTest +{ +}; -// struct GPU_BNBwd_FP64 : BNBwdTest -// { -// }; +struct GPU_BNBwd_FP64 : BNBwdTest +{ +}; TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} -// TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} -// TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -// TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} +TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBwd_FP16, testing::Combine(testing::ValuesIn(Network1()), testing::Values(miopenTensorNHWC))); -// INSTANTIATE_TEST_SUITE_P(Smoke, -// GPU_BNBwd_FP32, -// testing::Combine(testing::ValuesIn(Network1()), -// testing::Values(miopenTensorNHWC))); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNBwd_FP32, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); -// INSTANTIATE_TEST_SUITE_P(Smoke, -// GPU_BNBwd_BFP16, -// testing::Combine(testing::ValuesIn(Network1()), -// testing::Values(miopenTensorNHWC))); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNBwd_BFP16, + 
testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); -// INSTANTIATE_TEST_SUITE_P(Smoke, -// GPU_BNBwd_FP64, -// testing::Combine(testing::ValuesIn(Network1()), -// testing::Values(miopenTensorNHWC))); +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNBwd_FP64, + testing::Combine(testing::ValuesIn(Network1()), + testing::Values(miopenTensorNHWC))); diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index 2abcb05fe7..298ac55e3e 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -48,32 +48,6 @@ template void ComputeCPUBNBwd(DLModule& dl_module) { - std::cout << "\n====start dy====\n"; - for(int i = 0; i < dl_module.dy.data.size(); ++i) - { - std::cout << dl_module.dy[i] << ","; - } - std::cout << "\n"; - std::cout << "\n====start bnScale====\n"; - for(int i = 0; i < dl_module.bnScale.data.size(); ++i) - { - std::cout << dl_module.bnScale[i] << ","; - } - std::cout << "\n"; - std::cout << "\n====start savedMean====\n"; - for(int i = 0; i < dl_module.savedMean.data.size(); ++i) - { - std::cout << dl_module.savedMean[i] << ","; - } - std::cout << "\n"; - std::cout << "\n====start savedInvVar====\n"; - for(int i = 0; i < dl_module.savedInvVar.data.size(); ++i) - { - std::cout << dl_module.savedInvVar[i] << ","; - } - std::cout << "\n"; - - // todo : need to do based on bn_mode batchNormSpatialHostBwdTrain(dl_module.input, dl_module.dy, dl_module.ref_out, From aeebe811a9f0d30b3a14ec553ccc15912eeba6b6 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 25 Sep 2024 14:00:54 +0000 Subject: [PATCH 14/27] remove old variables --- driver/bn_driver.hpp | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index a922b97acf..22d55665dd 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -70,16 +70,7 @@ template class BatchNormDriver : public Driver { public: - BatchNormDriver() : 
Driver() - { - miopenCreateTensorDescriptor(&inputTensor); - miopenCreateTensorDescriptor(&outputTensor); - // miopenCreateTensorDescriptor(&biasScaleTensor); - // miopenCreateTensorDescriptor(&dxOutputTensor); - // miopenCreateTensorDescriptor(&dyInputTensor); - - data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; - } + BatchNormDriver() : Driver() { data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; } int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; @@ -109,14 +100,7 @@ class BatchNormDriver : public Driver int VerifyBackward() override; int VerifyForward() override; - ~BatchNormDriver() override - { - miopenDestroyTensorDescriptor(outputTensor); - miopenDestroyTensorDescriptor(inputTensor); - // miopenDestroyTensorDescriptor(biasScaleTensor); - // miopenDestroyTensorDescriptor(dxOutputTensor); - // miopenDestroyTensorDescriptor(dyInputTensor); - } + ~BatchNormDriver() override {} private: miopenBatchNormMode_t bn_mode; @@ -137,9 +121,6 @@ class BatchNormDriver : public Driver InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; - miopenTensorDescriptor_t outputTensor; - GpumemTensor in; GpumemTensor out; tensor out_ref; @@ -777,9 +758,9 @@ int BatchNormDriver::RunForwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); float rdCnt = -1.0; float wrCnt = 1.0; if(forw == 1) @@ -980,13 +961,11 @@ int BatchNormDriver::RunBackwardGPU() avgtime += time; int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = - 
miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); - size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = - (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); - float rdCnt = 2.0; - float wrCnt = 1.0; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); + size_t M = in_n * in_c * in_h * in_w; + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); + float rdCnt = 2.0; + float wrCnt = 1.0; // layer, flopCnt, reads, writes, GFLOPS, GB/s, timeMs printf("stats: bnormb, 0, %zu, %zu, 0, %f, %f\n", dataSz, From 5379f94a3fa869682ccefc25add693208f75744f Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 25 Sep 2024 14:13:26 +0000 Subject: [PATCH 15/27] remove dead code --- driver/dm_bnorm.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index 23340adc94..24e986fa1d 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -26,8 +26,6 @@ #include "bn_driver.hpp" #include "registry_driver_maker.hpp" -// template - static Driver* makeDriver(const std::string& base_arg) { if(base_arg == "bnorm") From a6ec4f029e5775e9559195088137c0e050748868 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 26 Sep 2024 20:28:39 +0000 Subject: [PATCH 16/27] add 3d --- driver/bn_driver.hpp | 132 ++++++++++++++++-- src/batch_norm.cpp | 3 +- src/solver/batchnorm/backward_ck.cpp | 3 +- .../batchnorm/backward_per_activation.cpp | 2 + .../batchnorm/backward_spatial_multiple.cpp | 4 + .../batchnorm/backward_spatial_single.cpp | 2 + 6 files changed, 133 insertions(+), 13 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 22d55665dd..396ad6b218 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -100,6 +100,13 @@ class BatchNormDriver : public Driver int VerifyBackward() override; int VerifyForward() override; + // Helper function to check the Layout type short names + int ChkLayout_ShortName(); + // 
function to validate the Layout type parameters. + // layout parameter value to std (NCHW/NHWC/NCDHW/NDHWC) values, + // defined in MIOpen lib. + void ValidateLayoutInputParameters(std::string layout_type); + ~BatchNormDriver() override {} private: @@ -145,7 +152,7 @@ class BatchNormDriver : public Driver tensor runMean_ref; tensor runVariance_ref; - // backward + // backward needed different type for bwd. GpumemTensor out_bwd; GpumemTensor bnScale; @@ -180,9 +187,8 @@ template int BatchNormDriver::GetandSetData() { - SetBNParametersFromCmdLineArgs(); - std::vector in_len = GetInputTensorLengthsFromCmdLine(); + SetBNParametersFromCmdLineArgs(); auto gen_value = [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }; @@ -286,12 +292,8 @@ int BatchNormDriver::AddCmdLineArgs() inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); - inflags.AddInputFlag("layout", - 'L', - "NCHW", - "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", - "string", - true); + inflags.AddInputFlag( + "layout", 'L', "", "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", "string", true); inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float"); @@ -345,6 +347,46 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } } +template +int BatchNormDriver::ChkLayout_ShortName() +{ + // check for short name of layout type + if(inflags.FindShortName("layout") == 'I') + { + // do noting + // found valid short names + return 0; + } + else + { + std::cerr << "Error:Invalid Short Name for layout!" 
<< std::endl; + exit(EXIT_FAILURE); + } +} + +template +void BatchNormDriver::ValidateLayoutInputParameters(std::string layout_value) +{ + if((ChkLayout_ShortName())) + { + std::cerr << " Invalid Layout Short Name = " << ChkLayout_ShortName() << std::endl; + exit(EXIT_FAILURE); + } + else + { + if((layout_value.compare("NCHW") == 0) || (layout_value.compare("NHWC") == 0) || + (layout_value.compare("NCDHW") == 0) || (layout_value.compare("NDHWC") == 0)) + { + // do nothing,Values are matching as defined in Lib. + } + else + { + std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; + exit(EXIT_FAILURE); + } + } +} + template int BatchNormDriver::SetBNParametersFromCmdLineArgs() { @@ -352,6 +394,21 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() // double bnAlpha = inflags.GetValueDouble("alpha"); // double bnBeta = inflags.GetValueDouble("beta"); + const std::string default_layout = isDepthSpecified ? "NCDHW" : "NCHW"; + + // inflags value is empty, default value is used + // if it is supplied via cmd line, check the value. 
+ if(inflags.GetValueStr("layout").empty()) + { + inflags.SetValue("layout", default_layout); + } + else + { + std::string layoutValue = inflags.GetValueStr("layout"); + ValidateLayoutInputParameters(layoutValue); + inflags.SetValue("layout", layoutValue); + } + std::string layout = inflags.GetValueStr("layout"); if(layout == "NCHW") @@ -362,6 +419,14 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() { bn_layout = miopenTensorNHWC; } + else if(layout == "NCDHW") + { + bn_layout = miopenTensorNCDHW; + } + else if(layout == "NDHWC") + { + bn_layout = miopenTensorNDHWC; + } else { std::cout << "Cannot handle layout : " << layout << "\n"; @@ -784,6 +849,20 @@ int BatchNormDriver::RunForwardGPU() template void BatchNormDriver::runCPUFwdInference(Tref epsilon) { + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + estMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estMean.GetTensor().desc); + estVariance.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estVariance.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW @@ -798,6 +877,7 @@ void BatchNormDriver::runCPUFwdInference(Tref epsilon) } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 + batchNormSpatialHostInference(in.GetTensor(), out_ref, scale.GetTensor(), @@ -818,7 +898,19 @@ void BatchNormDriver::runCPUFwdInference(Tref epsilon) template void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { - + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = 
miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + savedMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedMean_ref.desc); + savedVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedVariance_ref.desc); + runMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runMean_ref.desc); + runVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runVariance_ref.desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW batchNormPerActHostFwdTrain(in.GetTensor(), @@ -879,7 +971,6 @@ int BatchNormDriver::RunForwardCPU() template int BatchNormDriver::RunBackwardGPU() { - if(!back) return miopenStatusSuccess; @@ -1225,6 +1316,25 @@ int BatchNormDriver::RunBackwardCPU() // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); // float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + dy.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dy.GetTensor().desc); + out_bwd.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(out_bwd.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + bnScale.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(bnScale.GetTensor().desc); + dBias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dBias.GetTensor().desc); + dScale_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dScale_ref.desc); + dBias_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dBias_ref.desc); + savedMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedMean.GetTensor().desc); + 
savedInvVar.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedInvVar.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) { diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index a3c5f93e36..1b8f4ce640 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ -67,6 +67,7 @@ void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc, TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& tDesc) { auto dataType = tDesc.GetType(); + auto layout = tDesc.GetLayout_t(); std::vector dims(tDesc.GetLengths()); // NxCxDxHxW -> NxCx(D*H)xW @@ -74,7 +75,7 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& dims[3] = dims[4]; dims.pop_back(); - return {dataType, dims}; + return {dataType, layout, dims}; } void profileSequence(const Handle& handle, unsigned char select, float* ctime) diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 7769e4d563..bca7afc3a5 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -201,7 +201,8 @@ bool BnCKBwdBackward::IsApplicable( return false; if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward) return false; - + if(!bn_problem.Is2D()) + return false; switch(bn_problem.GetXDesc().GetType()) { case miopenFloat: return CheckCKApplicability(bn_problem); diff --git a/src/solver/batchnorm/backward_per_activation.cpp b/src/solver/batchnorm/backward_per_activation.cpp index 93cf670194..af52fbc339 100644 --- a/src/solver/batchnorm/backward_per_activation.cpp +++ b/src/solver/batchnorm/backward_per_activation.cpp @@ -41,6 +41,8 @@ namespace batchnorm { bool BnBwdTrainingPerActivation::IsApplicable( const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const { + if(!problem.Is2D()) + return false; return problem.GetDirection() == miopen::batchnorm::Direction::Backward && problem.GetMode() == miopenBNPerActivation; } diff --git 
a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 29bbd5dba9..7fa9c0f89a 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -44,6 +44,10 @@ bool BnBwdTrainingSpatialMultiple::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + { + return false; + } #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 30b0c0495f..86fa5a68c7 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -45,6 +45,8 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + return false; #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && From ebd014bceae4a4543b9fa99a821f52dbc39dccee Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 26 Sep 2024 21:12:04 +0000 Subject: [PATCH 17/27] fix minor layout issue in bn --- driver/bn_driver.hpp | 2 +- src/batch_norm.cpp | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 396ad6b218..dc42190314 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -351,7 +351,7 @@ template int BatchNormDriver::ChkLayout_ShortName() { // check for short name of layout type - if(inflags.FindShortName("layout") == 'I') + if(inflags.FindShortName("layout") == 'L') { // do noting // found valid short names diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index 1b8f4ce640..938809d81c 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ 
-68,6 +68,19 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& { auto dataType = tDesc.GetType(); auto layout = tDesc.GetLayout_t(); + if(layout == miopenTensorNCDHW) + { + layout = miopenTensorNCHW; + } + else if(layout == miopenTensorNDHWC) + { + layout = miopenTensorNHWC; + } + else + { + std::cout << "Cannot handle layout : " << layout << "\n"; + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) + } std::vector dims(tDesc.GetLengths()); // NxCxDxHxW -> NxCx(D*H)xW From 35cdca1fc918b6d565bfa5e86aa46d4aabd10884 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 2 Oct 2024 05:36:57 +0000 Subject: [PATCH 18/27] fix run variance issue --- driver/bn_driver.hpp | 113 +++++++++++++++++++++++++++++-------------- test/fusionHost.hpp | 65 ++++++++++++++++++++----- 2 files changed, 130 insertions(+), 48 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index dc42190314..e8ae9ff216 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -253,22 +253,19 @@ int BatchNormDriver::GetandSetData() bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - if(saveMeanVar && keepRunningMeanVar) - { - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); - auto gen_in_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedInvVar.InitHostData( - savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); - } + auto gen_in_var = [](auto...) 
{ + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); } else { std::cout << "\nUnknown batch norm state!\n"; exit(EXIT_FAILURE); } + return miopenStatusSuccess; } @@ -590,6 +587,16 @@ int BatchNormDriver::AllocateBuffersAndCopy() tensor{dBias.GetTensor().desc.GetLayout_t(), dBias.GetTensor().desc.GetLengths()}; } + for(size_t i = 0; i < runMean.GetVector().size(); ++i) + { + runMean_ref.data[i] = static_cast(runMean.GetVector()[i]); + } + + for(size_t i = 0; i < runVariance.GetVector().size(); ++i) + { + runVariance_ref.data[i] = static_cast(runVariance.GetVector()[i]); + } + if(status != STATUS_SUCCESS) printf("Fatal: Error copying data to GPU\nExiting...\n\n"); @@ -913,6 +920,7 @@ void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW + batchNormPerActHostFwdTrain(in.GetTensor(), out_ref, scale.GetTensor(), @@ -926,16 +934,34 @@ void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - batchNormSpatialHostFwdTrain(in.GetTensor(), - out_ref, - scale.GetTensor(), - bias.GetTensor(), - static_cast(epsilon), - static_cast(eAF), - savedMean_ref, - savedVariance_ref, - runMean_ref, - runVariance_ref); + + if(forw == 2 && !keepRunningMeanVar) + { + tensor empty_tensor; + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + empty_tensor, // savedMean_ref + empty_tensor, // savedVariance_ref + empty_tensor, // runMean_ref + empty_tensor); // runVariance_ref + } + else + { + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); + } } else { @@ -952,7 +978,7 @@ int BatchNormDriver::RunForwardCPU() Tref epsilon = 
static_cast(EPSILON); Tref eAF = static_cast(1.0); - if(forw == 1) + if(forw == 1 || (forw == 2 && !keepRunningMeanVar)) { // training only for(int i = 0; i < inflags.GetValueInt("iter"); i++) { @@ -960,10 +986,16 @@ int BatchNormDriver::RunForwardCPU() runCPUFwdTrain(epsilon, eAF /* alpha, beta,*/); } } - else if(forw == 2) - { // inference only + else if(forw == 2 && keepRunningMeanVar) + { + // inference only runCPUFwdInference(epsilon); } + else + { + printf("Unsupported forward cpu run state.\nExiting...\n\n"); + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) + } return miopenStatusSuccess; } @@ -1173,12 +1205,8 @@ int BatchNormDriver::VerifyForward() if(saveMeanVar) { // copy back for verification - // saveMean_dev->FromGPU(GetStream(), savedMean.data()); - // saveInvVariance_dev->FromGPU(GetStream(), savedInvVar.data()); - savedMean.CopyFromDeviceToHost(GetStream()); savedVariance.CopyFromDeviceToHost(GetStream()); - maxval = static_cast(0.0); auto errorSaveMean = miopen::rms_range(savedMean_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) @@ -1252,7 +1280,6 @@ int BatchNormDriver::VerifyForward() maxval = static_cast(0.0); auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); - if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; @@ -1356,15 +1383,30 @@ int BatchNormDriver::RunBackwardCPU() } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 + if(saveMeanVar) + { - batchNormSpatialHostBwdTrain(in.GetTensor(), - dy.GetTensor(), - out_ref, - bnScale.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); + } + else + { + tensor empty_tensor; + batchNormSpatialHostBwdTrain(in.GetTensor(), + 
dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + empty_tensor, + empty_tensor); + } } else { @@ -1399,7 +1441,6 @@ int BatchNormDriver::VerifyBackward() #endif maxval = static_cast(0.0); auto errordxout = miopen::rms_range(out_ref.data, out_bwd.GetVector()); - if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index ec271ef967..d525b79cf6 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -266,16 +266,21 @@ void batchNormSpatialHostFwdTrain(const tensor& input, } // for (column) } // for (row) } // end for(n_batchs) - - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = - (n_batch * height * width == 1) ? variance_accum : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? 
variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } }); } @@ -301,14 +306,50 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, par_for(channels, 1, [&](int cidx) { double elemStd = 0.; unsigned int xhat_index; - double mean = savedMean(0, cidx, 0, 0); // HxW elements - double invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + double mean = 0.0; + double invVar = 0.0; double dyelem = 0.; std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); // process the batch per channel dscale(0, cidx, 0, 0) = 0.; dbias(0, cidx, 0, 0) = 0.; + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } for(int row = 0; row < height; row++) { // via rows for(int column = 0; column < width; column++) From 5893793e1949410a24cd23d7470e4adb96a226e8 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 3 Oct 2024 14:02:11 +0000 Subject: [PATCH 19/27] fixed review comments --- src/driver_arguments.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 
1bd82b71b4..ce6a7593b5 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -66,7 +66,7 @@ void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) { ss << "bnormfp16"; } - if(desc.GetType() == miopenBFloat16) + else if(desc.GetType() == miopenBFloat16) { ss << "bnormbfp16"; } From 6463c480587ed502f3f08232defee583f72c4451 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 9 Oct 2024 15:18:26 +0000 Subject: [PATCH 20/27] create new API for batch norm --- include/miopen/miopen.h | 65 +++++ src/batch_norm_api.cpp | 246 ++++++++++++++---- src/driver_arguments.cpp | 23 +- src/fusion.cpp | 1 + src/include/miopen/batch_norm.hpp | 77 +++--- .../miopen/batchnorm/problem_description.hpp | 40 ++- src/include/miopen/driver_arguments.hpp | 1 + .../miopen/fusion/problem_description.hpp | 15 +- src/ocl/batchnormocl.cpp | 97 ++++--- test/bn_3d_peract_test.cpp | 15 ++ test/bn_3d_spatial_test.cpp | 15 ++ test/bn_peract_test.cpp | 15 ++ test/bn_spatial_test.cpp | 15 ++ 13 files changed, 494 insertions(+), 131 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 74be683ca3..4244b5d63f 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2738,6 +2738,28 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, void* resultSaveMean, void* resultSaveInvVariance); +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasVarDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance); 
+ /*! @brief Execute forward inference layer for batch normalization * * Batch normalization pass for forward inference pass. @@ -2783,6 +2805,25 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedVariance, double epsilon); +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon); + /*! @brief Execute backwards propagation layer for batch normalization * * Batch normalization pass for backwards propagation training pass. @@ -2838,6 +2879,30 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance); +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationBackward_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance); + /** @} */ // CLOSEOUT BATCHNORM DOXYGEN GROUP diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 8f184a9508..72e6a64554 100644 --- a/src/batch_norm_api.cpp 
+++ b/src/batch_norm_api.cpp @@ -50,6 +50,7 @@ namespace miopen { namespace debug { void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -60,6 +61,7 @@ void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, if(miopen::IsLoggingCmd()) { const std::string& str = BnormArgsForMIOpenDriver(xDesc, + sMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -88,6 +90,130 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedMean, void* estimatedVariance, double epsilon) +{ + return miopenBatchNormalizationForwardInference_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + nullptr, + nullptr, + nullptr, + bnScale, + bnBias, + estimatedMean, + estimatedVariance, + epsilon); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardTraining(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) +{ + return miopenBatchNormalizationForwardTraining_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + nullptr, + nullptr, + nullptr, + bnScale, + bnBias, + expAvgFactor, + resultRunningMean, + resultRunningVariance, + epsilon, + resultSaveMean, + resultSaveInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationBackward(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t 
xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t bnScaleBiasDiffDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance) +{ + return miopenBatchNormalizationBackward_V2(handle, + bn_mode, + alphaDataDiff, + betaDataDiff, + alphaParamDiff, + betaParamDiff, + xDesc, + x, + dyDesc, + dy, + dxDesc, + dx, + bnScaleBiasDiffDesc, + nullptr, + nullptr, + nullptr, + bnScale, + resultBnScaleDiff, + resultBnBiasDiff, + epsilon, + savedMean, + savedInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon) { MIOPEN_LOG_FUNCTION(handle, bn_mode, @@ -95,7 +221,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + (BiasDesc == nullptr) ? scaleDesc : BiasDesc, + (estMeanDesc == nullptr) ? scaleDesc : estMeanDesc, + (estVarianceDesc == nullptr) ? scaleDesc : estVarianceDesc, bnScale, bnBias, estimatedMean, @@ -103,12 +232,14 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, epsilon); miopen::debug::LogCmdBNorm(xDesc, + (estMeanDesc == nullptr) ? 
scaleDesc : estMeanDesc, bn_mode, estimatedMean, estimatedVariance, nullptr, nullptr, miopen::debug::BatchNormDirection_t::ForwardInference); + // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); @@ -124,9 +255,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc)) - : miopen::deref(bnScaleBiasMeanVarDesc), + miopen::deref(scaleDesc), + miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), + miopen::deref((estMeanDesc == nullptr) ? scaleDesc : estMeanDesc), + miopen::deref((estVarianceDesc == nullptr) ? scaleDesc : estVarianceDesc), DataCast(bnScale), DataCast(bnBias), DataCast(estimatedMean), @@ -136,32 +268,37 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, } extern "C" miopenStatus_t -miopenBatchNormalizationForwardTraining(miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - void* alpha, - void* beta, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t yDesc, - void* y, - const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, - void* bnScale, - void* bnBias, - double expAvgFactor, - void* resultRunningMean, - void* resultRunningVariance, - double epsilon, - void* resultSaveMean, - void* resultSaveInvVariance) +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarianceDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + 
double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) { - MIOPEN_LOG_FUNCTION(handle, bn_mode, xDesc, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + (BiasDesc == nullptr) ? scaleDesc : BiasDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, + (savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc, bnScale, bnBias, expAvgFactor, @@ -172,6 +309,7 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, resultSaveInvVariance); miopen::debug::LogCmdBNorm(xDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -193,9 +331,10 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc)) - : miopen::deref(bnScaleBiasMeanVarDesc), + miopen::deref(scaleDesc), + miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), + miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), + miopen::deref((savedVarianceDesc == nullptr) ? 
scaleDesc : savedVarianceDesc), DataCast(bnScale), DataCast(bnBias), expAvgFactor, @@ -208,27 +347,29 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, } extern "C" miopenStatus_t -miopenBatchNormalizationBackward(miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - const void* alphaDataDiff, - const void* betaDataDiff, - const void* alphaParamDiff, - const void* betaParamDiff, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t dyDesc, - const void* dy, - const miopenTensorDescriptor_t dxDesc, - void* dx, - const miopenTensorDescriptor_t bnScaleBiasDiffDesc, - const void* bnScale, - void* resultBnScaleDiff, - void* resultBnBiasDiff, - double epsilon, - const void* savedMean, - const void* savedInvVariance) +miopenBatchNormalizationBackward_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarianceDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance) { - MIOPEN_LOG_FUNCTION(handle, bn_mode, xDesc, @@ -237,7 +378,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, dy, dxDesc, dx, - bnScaleBiasDiffDesc, + scaleDesc, + (BiasDesc == nullptr) ? scaleDesc : BiasDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, + (savedVarianceDesc == nullptr) ? 
scaleDesc : savedVarianceDesc, bnScale, resultBnScaleDiff, resultBnBiasDiff, @@ -245,6 +389,7 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, savedMean, savedInvVariance); miopen::debug::LogCmdBNorm(xDesc, + (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, bn_mode, nullptr, nullptr, @@ -271,9 +416,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(dxDesc)) : miopen::deref(dxDesc), DataCast(dx), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasDiffDesc)) - : miopen::deref(bnScaleBiasDiffDesc), + miopen::deref(scaleDesc), + miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), + miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), + miopen::deref((savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc), DataCast(bnScale), DataCast(resultBnScaleDiff), DataCast(resultBnBiasDiff), diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index ce6a7593b5..971977afa9 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -60,16 +60,28 @@ void ConvDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) } } -void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) +// test based on the input tensor and scaleMean. +// We choose scaleMean because its a accumulator type. 
+void BnDataType(std::stringstream& ss,
+                const miopen::TensorDescriptor& xDesc,
+                const miopen::TensorDescriptor& sMeanDesc)
 {
-    if(desc.GetType() == miopenHalf)
+    if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenHalf)
     {
         ss << "bnormfp16";
     }
-    else if(desc.GetType() == miopenBFloat16)
+    else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenBFloat16)
     {
         ss << "bnormbfp16";
     }
+    else if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat)
+    {
+        ss << "bnormfp16fp32";
+    }
+    else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenFloat)
+    {
+        ss << "bnormbfp16fp32";
+    }
     else
     {
         ss << "bnorm";
@@ -215,7 +227,8 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc,
     return ss.str();
 }
 
-std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc,
+std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc,
+                                     const miopenTensorDescriptor_t sMeanDesc,
                                      miopenBatchNormMode_t bn_mode,
                                      const void* resultRunningMean,
                                      const void* resultRunningVariance,
@@ -228,7 +241,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc,
     miopenGetTensorDescriptorSize(xDesc, &size);
     std::stringstream ss;
     if(print_for_bn_driver)
-        BnDataType(ss, miopen::deref(xDesc));
+        BnDataType(ss, miopen::deref(xDesc), miopen::deref(sMeanDesc));
 
     ss << " -n " << miopen::deref(xDesc).GetLengths()[0] // clang-format off
        << " -c " << miopen::deref(xDesc).GetLengths()[1];
diff --git a/src/fusion.cpp b/src/fusion.cpp
index a9ef5e27a9..e536f6a1a1 100644
--- a/src/fusion.cpp
+++ b/src/fusion.cpp
@@ -390,6 +390,7 @@ std::string LogCmdBnormFusion(const miopenFusionPlanDescriptor_t fusePlanDesc, i
     if(bn_op != nullptr)
     {
         str += BnormArgsForMIOpenDriver(&bn_op->input_desc,
+                                        &bn_op->base_desc,
                                         bn_op->mode,
                                         nullptr,
                                         nullptr,
diff --git a/src/include/miopen/batch_norm.hpp b/src/include/miopen/batch_norm.hpp
index 50c309550c..92444f039b 100644
--- a/src/include/miopen/batch_norm.hpp
+++ 
b/src/include/miopen/batch_norm.hpp @@ -163,40 +163,44 @@ void bnFwdTrainSelectMulti(const Handle& handle, void profileSequence(const Handle& handle, unsigned char select, float* ctime); -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardInference(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - ConstData_t estimatedMean, - ConstData_t estimatedVariance, - double epsilon); - -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardTraining(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, /* these don't seem to be used in conv */ - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - double expAvgFactor, - Data_t resultRunningMean, - Data_t resultRunningVariance, - double epsilon, - Data_t resultSaveMean, - Data_t resultSaveInvVariance); +MIOPEN_INTERNALS_EXPORT void BatchNormForwardInference(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + ConstData_t estimatedMean, + ConstData_t estimatedVariance, + double epsilon); + +MIOPEN_INTERNALS_EXPORT void BatchNormForwardTraining(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const 
TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + double expAvgFactor, + Data_t resultRunningMean, + Data_t resultRunningVariance, + double epsilon, + Data_t resultSaveMean, + Data_t resultSaveInvVariance); MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, @@ -210,7 +214,10 @@ MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index b87494b725..d28e91adfd 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -58,7 +58,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double expAvgFactor_, double epsilon_, bool resultsave_, @@ -67,7 +70,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), expAvgFactor(expAvgFactor_), epsilon(epsilon_), resultsave(resultsave_), @@ -82,13 +88,19 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : 
ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_) : direction(Direction::ForwardInference), bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_) { SetSpatialDims(); @@ -101,7 +113,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& xDesc_, const TensorDescriptor& dyDesc_, const TensorDescriptor& dxDesc_, - const TensorDescriptor& bnScaleBiasDiffDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_, bool useSaved_) : direction(Direction::Backward), @@ -109,7 +124,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob xDesc(xDesc_), yOrDyDesc(dyDesc_), dxDesc(dxDesc_), - scaleBiasDesc(bnScaleBiasDiffDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_), useSaved(useSaved_) { @@ -153,13 +171,13 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& GetBnScaleBiasMeanVarDesc() const { assert(direction == Direction::ForwardTraining || direction == Direction::ForwardInference); - return scaleBiasDesc; + return scaleDesc; } const TensorDescriptor& GetScaleBiasDiffDesc() const { assert(direction == Direction::Backward); - return scaleBiasDesc; + return scaleDesc; } bool GetResultSave() const @@ -215,7 +233,11 @@ struct MIOPEN_INTERNALS_EXPORT 
ProblemDescription : ProblemDescriptionBase, Prob TensorDescriptor xDesc; // input TensorDescriptor yOrDyDesc; // output TensorDescriptor dxDesc; - TensorDescriptor scaleBiasDesc; + + TensorDescriptor scaleDesc; // scale + TensorDescriptor biasDesc; // bias (shift) + TensorDescriptor sMeanDesc; + TensorDescriptor sVarianceDesc; #ifdef __clang__ #pragma clang diagnostic push diff --git a/src/include/miopen/driver_arguments.hpp b/src/include/miopen/driver_arguments.hpp index da4064b7f0..a964e7fe27 100644 --- a/src/include/miopen/driver_arguments.hpp +++ b/src/include/miopen/driver_arguments.hpp @@ -67,6 +67,7 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, bool print_for_conv_driver = true); std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, + miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, diff --git a/src/include/miopen/fusion/problem_description.hpp b/src/include/miopen/fusion/problem_description.hpp index bcb37878d9..b3d1669cee 100644 --- a/src/include/miopen/fusion/problem_description.hpp +++ b/src/include/miopen/fusion/problem_description.hpp @@ -128,7 +128,14 @@ struct FusionDescription : ProblemDescriptionBase dynamic_cast(*fusion_plan_desc->op_map[idx]); miopen::TensorDescriptor out_desc; bn_op.GetOutputDesc(out_desc); - return {bn_op.mode, bn_op.input_desc, out_desc, bn_op.base_desc, not_used}; + return {bn_op.mode, + bn_op.input_desc, + out_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + not_used}; } else if(dir == miopen::batchnorm::Direction::ForwardTraining) { @@ -140,6 +147,9 @@ struct FusionDescription : ProblemDescriptionBase bn_op.input_desc, out_desc, bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, not_used, // expAvgFactor filler not_used, true /* resultSave*/, @@ -156,6 +166,9 @@ struct FusionDescription : ProblemDescriptionBase out_desc, 
bn_op.input_desc, {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, not_used, bn_op.useBatchStats /*useSaved*/}; } diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 40bcd34935..f33c5ac5db 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -55,6 +55,8 @@ miopen::PerformanceDb GetDb(const miopen::ExecutionContext& ctx, } } // namespace batchnorm +//============ BEGIN FORWARD TRAINING =============== + void BatchNormForwardTraining(Handle& handle, miopenBatchNormMode_t bn_mode, const void* alpha, @@ -63,7 +65,10 @@ void BatchNormForwardTraining(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, double expAvgFactor, @@ -73,13 +78,14 @@ void BatchNormForwardTraining(Handle& handle, Data_t resultSaveMean, Data_t resultSaveInvVariance) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + if(xDesc.GetNumDims() != yDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -105,9 +111,9 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); if(bnScale != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); if(bnBias != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnBias); + 
miopen::checkNumericsInput(handle, biasDesc, bnBias); } const auto resultsave = resultSaveMean != nullptr && resultSaveInvVariance != nullptr; @@ -116,7 +122,10 @@ void BatchNormForwardTraining(Handle& handle, const auto problem = batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + savedMeanDesc, + savedVarianceDesc, expAvgFactor, epsilon, resultsave, @@ -153,15 +162,16 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsOutput(handle, yDesc, y); if(resultRunningMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultRunningMean); if(resultRunningVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultRunningVariance); if(resultSaveMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultSaveMean); if(resultSaveInvVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveInvVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultSaveInvVariance); } } + //================== END FWD TRAIN =================== //============ BEGIN FORWARD INFERENCE =============== @@ -173,31 +183,37 @@ void BatchNormForwardInference(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, ConstData_t estimatedMean, ConstData_t estimatedVariance, double epsilon) { + if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsInput(handle, xDesc, x); - miopen::checkNumericsInput(handle, 
bnScaleBiasMeanVarDesc, bnScale); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnBias); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedMean); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedVariance); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnBias); + miopen::checkNumericsInput(handle, estMeanDesc, estimatedMean); + miopen::checkNumericsInput(handle, estVarianceDesc, estimatedVariance); } if(estimatedMean != nullptr && estimatedVariance != nullptr) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != estMeanDesc.GetNumDims() || + xDesc.GetNumDims() != estVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -216,8 +232,8 @@ void BatchNormForwardInference(Handle& handle, MIOPEN_THROW(miopenStatusBadParm); } - const auto problem = - batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, bnScaleBiasMeanVarDesc, epsilon}; + const auto problem = batchnorm::ProblemDescription{ + bn_mode, xDesc, yDesc, scaleDesc, biasDesc, estMeanDesc, estVarianceDesc, epsilon}; const auto invoke_params = [&]() { auto tmp = batchnorm::InfInvokeParams{}; @@ -250,7 +266,10 @@ void BatchNormForwardInference(Handle& handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, 0, @@ -265,9 +284,11 @@ void BatchNormForwardInference(Handle& handle, miopen::checkNumericsOutput(handle, yDesc, y); } } + //================= END FORWARD INFERENCE ==================== //=============== BEGIN BACKWARDS PROPAGATION ================ + void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, const void* 
alphaDataDiff, @@ -280,7 +301,10 @@ void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, @@ -296,20 +320,23 @@ void BatchNormBackward(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); miopen::checkNumericsInput(handle, dyDesc, dy); - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnScale); if(savedMean != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedMean); + miopen::checkNumericsInput(handle, savedMeanDesc, savedMean); if(savedInvVariance != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedInvVariance); + miopen::checkNumericsInput(handle, savedVarianceDesc, savedInvVariance); } if(x == nullptr || dy == nullptr || bnScale == nullptr || dx == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != dyDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasDiffDesc.GetNumDims()) + if(xDesc.GetNumDims() != dyDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -336,8 +363,16 @@ void BatchNormBackward(Handle& handle, const auto useSaved = savedMean != nullptr && savedInvVariance != nullptr; - const auto problem = batchnorm::ProblemDescription{ - bn_mode, xDesc, dyDesc, dxDesc, bnScaleBiasDiffDesc, epsilon, useSaved}; + const auto problem = batchnorm::ProblemDescription{bn_mode, + xDesc, + dyDesc, + dxDesc, + scaleDesc, + biasDesc, + 
savedMeanDesc, + savedVarianceDesc, + epsilon, + useSaved}; const auto algo = bn_mode == miopenBNSpatial ? AlgorithmName{"miopenBatchNormBackwardPropSpatial"} @@ -368,8 +403,8 @@ void BatchNormBackward(Handle& handle, if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsOutput(handle, dxDesc, dx); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnScaleDiff); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnBiasDiff); + miopen::checkNumericsOutput(handle, scaleDesc, resultBnScaleDiff); + miopen::checkNumericsOutput(handle, biasDesc, resultBnBiasDiff); } } } // namespace miopen diff --git a/test/bn_3d_peract_test.cpp b/test/bn_3d_peract_test.cpp index 19fd15e7ce..c5f96ff9ba 100644 --- a/test/bn_3d_peract_test.cpp +++ b/test/bn_3d_peract_test.cpp @@ -281,6 +281,9 @@ struct verify_forward_train_3d_bn_per_activation BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -450,6 +453,9 @@ struct verify_forward_infer_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -573,6 +579,9 @@ struct verify_forward_infer_3d_bn_per_activation_use_est BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -747,6 +756,9 @@ struct verify_backward_3d_bn_per_activation_use_saved 
BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -948,6 +960,9 @@ struct verify_backward_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_3d_spatial_test.cpp b/test/bn_3d_spatial_test.cpp index 8d428fca2b..08bfdb5a57 100644 --- a/test/bn_3d_spatial_test.cpp +++ b/test/bn_3d_spatial_test.cpp @@ -327,6 +327,9 @@ struct verify_forward_train_3d_bn_spatial miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -516,6 +519,9 @@ struct verify_forward_infer_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -632,6 +638,9 @@ struct verify_forward_infer_3d_bn_spatial_use_est miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + 
miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -913,6 +922,9 @@ struct verify_backward_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1138,6 +1150,9 @@ struct verify_backward_3d_bn_spatial_use_saved miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_peract_test.cpp b/test/bn_peract_test.cpp index 6622230666..4d83e05df7 100644 --- a/test/bn_peract_test.cpp +++ b/test/bn_peract_test.cpp @@ -271,6 +271,9 @@ struct verify_forward_train_bn_per_activation out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -433,6 +436,9 @@ struct verify_forward_infer_bn_per_activation_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -550,6 +556,9 @@ struct verify_forward_infer_bn_per_activation_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -716,6 +725,9 @@ struct verify_backward_bn_per_activation_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -909,6 +921,9 @@ struct verify_backward_bn_per_activation_recalc dx_out.desc, dx_out_dev.get(), 
scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_spatial_test.cpp b/test/bn_spatial_test.cpp index 82d1cc271b..95a8ee099a 100644 --- a/test/bn_spatial_test.cpp +++ b/test/bn_spatial_test.cpp @@ -308,6 +308,9 @@ struct verify_forward_train_bn_spatial out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -484,6 +487,9 @@ struct verify_forward_infer_bn_spatial_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -596,6 +602,9 @@ struct verify_forward_infer_bn_spatial_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -853,6 +862,9 @@ struct verify_backward_bn_spatial_recalc dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1065,6 +1077,9 @@ struct verify_backward_bn_spatial_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), From 978bba1a7f6e7d1d6ee90827890c28e4688dc15f Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 15 Oct 2024 15:26:30 +0000 Subject: [PATCH 21/27] add test for V2 api --- include/miopen/miopen.h | 112 +++++++++++++++++- test/gtest/bn.hpp | 225 ++++++++++++++++++++++++++---------- test/gtest/bn_bwd.cpp | 117 +++++++++++++++---- test/gtest/bn_fwd_train.cpp | 113 ++++++++++++++---- test/gtest/bn_infer.cpp | 126 ++++++++++++++------ test/gtest/bn_test_data.hpp | 21 +++- 6 files changed, 578 insertions(+), 136 deletions(-) diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index 4244b5d63f..63c3335b1a 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2737,7 +2737,46 @@ 
miopenBatchNormalizationForwardTraining(miopenHandle_t handle,
                                         double epsilon,
                                         void* resultSaveMean,
                                         void* resultSaveInvVariance);
-
+/*! @brief Execute forward training layer for batch normalization
+ *
+ * Batch normalization pass for forward training pass.
+ * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and
+ * bnScale with their descriptors.
+ *
+ * If either resultSaveMean, or resultSaveInvVariance are null pointers then the values for the mean
+ * and inverse variance will not be used.
+ *
+ * Likewise, if either resultRunningMean, or resultRunningVariance are null pointers then the values
+ * for the running mean and variance will not be saved.
+ * Running averages and variances are scaled using an exponential averaging factor: \f[
+ * \mu_{old} = \mu_{new}*factor + \mu_{old}*(1-factor)
+ * \f]
+ * where \f[
+ * factor=1/(1+iteration)
+ * \f]
+ *
+ * @param handle                    MIOpen handle (input)
+ * @param bn_mode                   Batch normalization mode (input)
+ * @param alpha                     Floating point scaling factor, allocated on the host (input)
+ * @param beta                      Floating point shift factor, allocated on the host (input)
+ * @param xDesc                     Tensor descriptor for data input tensor x (input)
+ * @param x                         Data tensor x (input)
+ * @param yDesc                     Tensor descriptor for output data tensor y (input)
+ * @param y                         Data tensor y (output)
+ * @param scaleDesc                 Tensor descriptor for BN scaling (input)
+ * @param BiasDesc                  Tensor descriptor for BN bias (input)
+ * @param savedMeanDesc             Tensor descriptor for BN saved Mean (input)
+ * @param savedVarianceDesc         Tensor descriptor for BN saved Variance (input)
+ * @param bnScale                   Batch norm scaling, gamma, tensor (input)
+ * @param bnBias                    Batch norm bias, beta, tensor (input)
+ * @param expAvgFactor              Exponential averaging factor (input)
+ * @param resultRunningMean         Running average saved for inference (output)
+ * @param resultRunningVariance     Running variance saved for inference (output)
+ * @param epsilon                   Value to stabilize inverse variance calculation (input)
+ * 
@param resultSaveMean Saved mini-batch mean for backwards pass (output) + * @param resultSaveInvVariance Saved mini-batch inverse variance for backwards pass (output) + * @return miopenStatus_t + */ MIOPEN_EXPORT miopenStatus_t miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, miopenBatchNormMode_t bn_mode, @@ -2805,6 +2844,37 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedVariance, double epsilon); +/*! @brief Execute forward inference layer for batch normalization + * + * Batch normalization pass for forward inference pass. + * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale + * with their descriptor. + * + * If either estimatedMean, or estimatedVariance are null pointers then the values for the mean and + * variance will be calculated from input data and this calculated mean and variance will be used + * to update input values. + * If variance is zero and epsilon is also zero, this function outputs NAN values. Input espilon + * value should always be non zero positive value. 
+ * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alpha Floating point scaling factor, allocated on the host (input) + * @param beta Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param ScaleDesc Tensor descriptor for BN scaling + * @param biasVarDesc Tensor descriptor for BN bias + * @param estMeanDesc Tensor descriptor for BN estimated Mean + * @param estVarianceDesc Tensor descriptor for BN estimated Variance + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param bnBias Batch norm bias, beta, tensor (input) + * @param estimatedMean Running average saved during forward training (input) + * @param estimatedVariance Running variance saved during forward training (input) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @return miopenStatus_t + */ MIOPEN_EXPORT miopenStatus_t miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, miopenBatchNormMode_t bn_mode, @@ -2815,7 +2885,7 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, const miopenTensorDescriptor_t yDesc, void* y, const miopenTensorDescriptor_t scaleDesc, - const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t biasDesc, const miopenTensorDescriptor_t estMeanDesc, const miopenTensorDescriptor_t estVarianceDesc, void* bnScale, @@ -2879,6 +2949,44 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance); +/*! @brief Execute backwards propagation layer for batch normalization + * + * Batch normalization pass for backwards propagation training pass. + * The method for backwards propagation batch normalization. 
+ * + * Takes in batch normalization mode bn_mode and input tensor data x, input activation tensor dy, + * output tensor dx, the learned tensors resultBNBiasDiff and resultBNScaleDiff with their + * descriptor. + * + * If BOTH savedMean, and savedVariance are not null pointers then the method will use the saved + * mean and variance calculated by the forward training phase. + * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alphaDataDiff Floating point scaling factor, allocated on the host (input) + * @param betaDataDiff Floating point shift factor, allocated on the host (input) + * @param alphaParamDiff Floating point scaling factor, allocated on the host (input) + * @param betaParamDiff Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param dyDesc Tensor descriptor for output data tensor y (input) + * @param dy Data tensor y (input) + * @param dxDesc Tensor descriptor for output data tensor dx (input) + * @param dx Data delta tensor dx (output) + * @param scaleDesc Tensor descriptor for scaling descriptor (input) + * @param biasDesc Tensor descriptor for bias/shift descriptor (input) + * @param savedMeanDesc Tensor descriptor for saved Mean descriptor (input) + * @param savedVarDesc Tensor descriptor for saved Variance descriptor (input) + * , shifting, saved variance and + * mean (input) + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param resultBnScaleDiff Tensor for dscale (output) + * @param resultBnBiasDiff Tensor for dbias (output) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @param savedMean Saved mini-batch mean for backwards pass (input) + * @param savedInvVariance Saved mini-bathc inverse variance for backwards pass (input) + * @return miopenStatus_t + */ MIOPEN_EXPORT miopenStatus_t 
miopenBatchNormalizationBackward_V2(miopenHandle_t handle, miopenBatchNormMode_t bn_mode, diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..fdff351f79 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -32,17 +32,25 @@ #include "bn_test_data.hpp" #include "test_operations.hpp" +// Define an enum to identify which version of BN api to call +enum BNApiType +{ + testBNAPIV1, + testBNAPIV2, +}; + template -struct BNInferTest : public ::testing::TestWithParam> +struct BNInferTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_infer_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); @@ -51,21 +59,47 @@ struct BNInferTest : public ::testing::TestWithParam bn_infer_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; template -struct BNBwdTest : public ::testing::TestWithParam> +struct BNBwdTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = miopenBatchNormalizationBackward(&handle, - bn_config.mode, - &bn_bwd_test_data.alphaDataDiff, - &bn_bwd_test_data.betaDataDiff, - &bn_bwd_test_data.alphaParamDiff, - &bn_bwd_test_data.betaParamDiff, - &bn_bwd_test_data.input.desc, - bn_bwd_test_data.in_dev.get(), - &bn_bwd_test_data.dy.desc, - bn_bwd_test_data.dy_dev.get(), - &bn_bwd_test_data.output.desc, - bn_bwd_test_data.out_dev.get(), - &bn_bwd_test_data.bnScale.desc, - bn_bwd_test_data.bnScale_dev.get(), - bn_bwd_test_data.dScale_dev.get(), - bn_bwd_test_data.dBias_dev.get(), - bn_bwd_test_data.epsilon, - bn_bwd_test_data.savedMean_dev.get(), - bn_bwd_test_data.savedInvVar_dev.get()); + 
if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationBackward(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationBackward_V2(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + &bn_bwd_test_data.dBias.desc, + &bn_bwd_test_data.savedMean.desc, + &bn_bwd_test_data.savedInvVar.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationBackward failed"; @@ -183,6 +255,7 @@ struct BNBwdTest : public ::testing::TestWithParam bn_bwd_test_data; miopenTensorLayout_t 
tensor_layout; + BNApiType api_type; }; template struct BNFwdTrainTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = - miopenBatchNormalizationForwardTraining(&handle, - bn_config.mode, - &bn_fwd_train_test_data.alpha, - &bn_fwd_train_test_data.beta, - &bn_fwd_train_test_data.input.desc, - bn_fwd_train_test_data.in_dev.get(), - &bn_fwd_train_test_data.output.desc, - bn_fwd_train_test_data.out_dev.get(), - &bn_fwd_train_test_data.scale.desc, - bn_fwd_train_test_data.scale_dev.get(), - bn_fwd_train_test_data.shift_dev.get(), - bn_fwd_train_test_data.averageFactor, - bn_fwd_train_test_data.runMean_dev.get(), - bn_fwd_train_test_data.runVariance_dev.get(), - bn_fwd_train_test_data.epsilon, - bn_fwd_train_test_data.saveMean_dev.get(), - bn_fwd_train_test_data.saveVariance_dev.get()); + if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationForwardTraining( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + 
bn_fwd_train_test_data.saveVariance_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationForwardTraining_V2( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + &bn_fwd_train_test_data.shift.desc, + &bn_fwd_train_test_data.saveMean.desc, + &bn_fwd_train_test_data.saveVariance.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationForwardTraining failed"; @@ -275,4 +383,5 @@ struct BNFwdTrainTest BNFwdTrainTestData bn_fwd_train_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f2d54e8077..df093a4710 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -26,46 +26,123 @@ #include "bn.hpp" -struct GPU_BNBwd_FP16 +// https://github.com/ROCm/MIOpen/issues/1549 +// NCHW solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +// struct GPU_BN_V1_BwdNCHW_FP16 : BNBwdTest +// { +// }; + +// NHWC solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +struct GPU_BN_V2_BwdNHWC_FP16 : BNBwdTest { }; -struct 
GPU_BNBwd_FP32 : BNBwdTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V2_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V1_Bwd_FP32 : BNBwdTest +{ +}; + +struct GPU_BN_V2_Bwd_FP32 : BNBwdTest { }; -struct GPU_BNBwd_BFP16 : BNBwdTest +struct GPU_BN_V1_BwdNHWC_FP64 : BNBwdTest { }; -struct GPU_BNBwd_FP64 : BNBwdTest +struct GPU_BN_V2_BwdNHWC_FP64 : BNBwdTest { }; -TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} +// fp16 +// TEST_P(GPU_BN_V1_BwdNCHW_FP16, BnV1BwdHalf) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP16, BnV2BwdCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Bwd_FP32, BnV1BwdFloat) {} +TEST_P(GPU_BN_V2_Bwd_FP32, BnV2BwdFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_BFP16, BnV1BwdCKBfloat) {} +TEST_P(GPU_BN_V2_BwdNHWC_BFP16, BnV2BwdCKBfloat) {} -TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +// double is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_FP64, BnV1BwdCKDouble) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP64, BnV2BwdCKDouble) {} -TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +// // fp16 +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BN_V1_BwdNCHW_FP16, +// testing::Combine(testing::ValuesIn(NetworkSmall()), +// testing::Values(miopenTensorNCHW), +// testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_BwdNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_Bwd_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_Bwd_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + 
testing::ValuesIn({testBNAPIV2}))); +// bfp16 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_BwdNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_BwdNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_BwdNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_BwdNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index b5dbed4705..ed25631175 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -26,46 +26,119 @@ #include "bn.hpp" -struct GPU_BNFwdTrain_FP16 +// ** OCL kernel for fwd training are failing gtest ** +// ** Hence, this gtest only tests CK solvers ** + +// NHWC solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +struct GPU_BN_V1_FwdTrainNHWC_FP16 + : BNFwdTrainTest +{ +}; + +struct GPU_BN_V2_FwdTrainNHWC_FP16 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_FP32 : BNFwdTrainTest +// bf16 NHWC 
solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_FwdTrainNHWC_BFP16 : BNFwdTrainTest +{ +}; + +struct GPU_BN_V2_FwdTrainNHWC_BFP16 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_FP64 : BNFwdTrainTest +struct GPU_BN_V1_FwdTrainNHWC_FP32 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_BFP16 : BNFwdTrainTest +struct GPU_BN_V2_FwdTrainNHWC_FP32 : BNFwdTrainTest { }; -TEST_P(GPU_BNFwdTrain_FP16, BnFwdTrainCKHalf) {} +struct GPU_BN_V1_FwdTrainNHWC_FP64 : BNFwdTrainTest +{ +}; -TEST_P(GPU_BNFwdTrain_FP32, BnFwdTrainCKFloat) {} +struct GPU_BN_V2_FwdTrainNHWC_FP64 : BNFwdTrainTest +{ +}; -TEST_P(GPU_BNFwdTrain_FP64, BnFwdTrainCKDouble) {} -TEST_P(GPU_BNFwdTrain_BFP16, BnFwdTrainCKBFloat16) {} +// fp16 +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP16, BnV1FwdTrainHalf) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP16, BnV2FwdTrainCKHalf) {} + +// float +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP32, BnV1FwdTrainFloat) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP32, BnV2FwdTrainFloat) {} + +// bfp16 +TEST_P(GPU_BN_V1_FwdTrainNHWC_BFP16, BnV1FwdTrainCKBfloat) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_BFP16, BnV2FwdTrainCKBfloat) {} + +// double +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP64, BnV1FwdTrainCKDouble) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP64, BnV2FwdTrainCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + 
GPU_BN_V2_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 081d583213..aea15c097e 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -27,66 +27,124 @@ #include "bn.hpp" // NCHW solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : float -// BiasDataType : float -// MeanVarDataType : float -struct GPU_BNInferNCHW_FP16 : BNInferTest +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +struct GPU_BN_V1_InferNCHW_FP16 : 
BNInferTest { }; // NHWC solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : half -// BiasDataType : half +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half // MeanVarDataType : float -struct GPU_BNInferNHWC_FP16 +struct GPU_BN_V2_InferNHWC_FP16 : BNInferTest { }; -struct GPU_BNInfer_FP32 : BNInferTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_InferNHWC_BFP16 : BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_BFP16 : BNInferTest { }; -struct GPU_BNInfer_FP64 : BNInferTest +struct GPU_BN_V1_Infer_FP32 : BNInferTest { }; -struct GPU_BNInfer_BFP16 : BNInferTest +struct GPU_BN_V2_Infer_FP32 : BNInferTest { }; -TEST_P(GPU_BNInferNCHW_FP16, BnInferCKHalf) {} -TEST_P(GPU_BNInferNHWC_FP16, BnInferCKHalf) {} +struct GPU_BN_V1_InferNHWC_FP64 : BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_FP64 : BNInferTest +{ +}; -TEST_P(GPU_BNInfer_FP32, BnInferCKFloat) {} -TEST_P(GPU_BNInfer_FP64, BnInferCKDouble) {} -TEST_P(GPU_BNInfer_BFP16, BnInferCKBFloat16) {} +// fp16 +TEST_P(GPU_BN_V1_InferNCHW_FP16, BnV1InferHalf) {} +TEST_P(GPU_BN_V2_InferNHWC_FP16, BnV2InferCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Infer_FP32, BnV1InferFloat) {} +TEST_P(GPU_BN_V2_Infer_FP32, BnV2InferFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_BFP16, BnV1InferCKBfloat) {} +TEST_P(GPU_BN_V2_InferNHWC_BFP16, BnV2InferCKBfloat) {} + +// double is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_FP64, BnV1InferCKDouble) {} +TEST_P(GPU_BN_V2_InferNHWC_FP64, BnV2InferCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_InferNCHW_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + 
GPU_BN_V2_InferNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNCHW_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNCHW))); + GPU_BN_V2_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNHWC_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC, miopenTensorNCHW}))); + GPU_BN_V2_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V1_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V2_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_test_data.hpp 
b/test/gtest/bn_test_data.hpp index fcf237400b..2d8ab5c5bf 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -56,10 +56,13 @@ struct BNTestCase }; template -std::vector Network1(); +std::vector NetworkSmall(); + +template +std::vector NetworkLarge(); template <> -inline std::vector Network1() +inline std::vector NetworkLarge() { // pyt_mlperf_resnet50v1.5 return { @@ -95,6 +98,20 @@ inline std::vector Network1() {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } +template <> +inline std::vector NetworkSmall() +{ + // pyt_mlperf_resnet50v1.5 + return { + {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + // {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + + }; +} + template struct BNTestData { From d5212dbbb6eee023afbb10c1147dd0551147b1ad Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Tue, 15 Oct 2024 15:50:04 +0000 Subject: [PATCH 22/27] typo --- test/gtest/bn_test_data.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index 2d8ab5c5bf..9afa8ea4ed 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -105,9 +105,8 @@ inline std::vector NetworkSmall() return { {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - // {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - // {64, 2048, 7, 7, miopenBNSpatial, 
miopen::batchnorm::Direction::Backward, 0, 1}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, }; } From 7cef7665e1e4c1ade816b9873ff20c6188b82621 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 16 Oct 2024 20:01:42 +0000 Subject: [PATCH 23/27] address review comments --- driver/bn_driver.hpp | 25 +++++++------------ src/batch_norm_api.cpp | 54 +++++++++++++++++++++--------------------- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index e8ae9ff216..da1be3a066 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -101,7 +101,7 @@ class BatchNormDriver : public Driver int VerifyForward() override; // Helper function to check the Layout type short names - int ChkLayout_ShortName(); + bool ChkLayout_ShortName(); // function to validate the Layout type parameters. // layout parameter value to std (NCHW/NHWC/NCDHW/NDHWC) values, // defined in MIOpen lib. 
@@ -345,14 +345,14 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } template -int BatchNormDriver::ChkLayout_ShortName() +bool BatchNormDriver::ChkLayout_ShortName() { // check for short name of layout type if(inflags.FindShortName("layout") == 'L') { // do noting // found valid short names - return 0; + return true; } else { @@ -364,23 +364,16 @@ int BatchNormDriver::ChkLayout_ShortName() template void BatchNormDriver::ValidateLayoutInputParameters(std::string layout_value) { - if((ChkLayout_ShortName())) + if(!ChkLayout_ShortName()) { - std::cerr << " Invalid Layout Short Name = " << ChkLayout_ShortName() << std::endl; + std::cerr << "Invalid Layout Short Name = " << inflags.FindShortName("layout") << std::endl; exit(EXIT_FAILURE); } - else + if((layout_value.compare("NCHW") != 0) && (layout_value.compare("NHWC") != 0) && + (layout_value.compare("NCDHW") != 0) && (layout_value.compare("NDHWC") != 0)) { - if((layout_value.compare("NCHW") == 0) || (layout_value.compare("NHWC") == 0) || - (layout_value.compare("NCDHW") == 0) || (layout_value.compare("NDHWC") == 0)) - { - // do nothing,Values are matching as defined in Lib. 
- } - else - { - std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; - exit(EXIT_FAILURE); - } + std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; + exit(EXIT_FAILURE); } } diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 72e6a64554..56d6fbb5e8 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -138,9 +138,9 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, yDesc, y, bnScaleBiasMeanVarDesc, - nullptr, - nullptr, - nullptr, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, bnScale, bnBias, expAvgFactor, @@ -185,9 +185,9 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, dxDesc, dx, bnScaleBiasDiffDesc, - nullptr, - nullptr, - nullptr, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, bnScale, resultBnScaleDiff, resultBnBiasDiff, @@ -222,9 +222,9 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, yDesc, y, scaleDesc, - (BiasDesc == nullptr) ? scaleDesc : BiasDesc, - (estMeanDesc == nullptr) ? scaleDesc : estMeanDesc, - (estVarianceDesc == nullptr) ? scaleDesc : estVarianceDesc, + BiasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, estimatedMean, @@ -232,7 +232,7 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, epsilon); miopen::debug::LogCmdBNorm(xDesc, - (estMeanDesc == nullptr) ? scaleDesc : estMeanDesc, + estMeanDesc, bn_mode, estimatedMean, estimatedVariance, @@ -256,9 +256,9 @@ miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, : miopen::deref(yDesc), DataCast(y), miopen::deref(scaleDesc), - miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), - miopen::deref((estMeanDesc == nullptr) ? scaleDesc : estMeanDesc), - miopen::deref((estVarianceDesc == nullptr) ? 
scaleDesc : estVarianceDesc), + miopen::deref(BiasDesc), + miopen::deref(estMeanDesc), + miopen::deref(estVarianceDesc), DataCast(bnScale), DataCast(bnBias), DataCast(estimatedMean), @@ -296,9 +296,9 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, yDesc, y, scaleDesc, - (BiasDesc == nullptr) ? scaleDesc : BiasDesc, - (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, - (savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc, + BiasDesc, + savedMeanDesc, + savedVarianceDesc, bnScale, bnBias, expAvgFactor, @@ -309,7 +309,7 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, resultSaveInvVariance); miopen::debug::LogCmdBNorm(xDesc, - (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, + savedMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -332,9 +332,9 @@ miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, : miopen::deref(yDesc), DataCast(y), miopen::deref(scaleDesc), - miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), - miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), - miopen::deref((savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc), + miopen::deref(BiasDesc), + miopen::deref(savedMeanDesc), + miopen::deref(savedVarianceDesc), DataCast(bnScale), DataCast(bnBias), expAvgFactor, @@ -379,9 +379,9 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, dxDesc, dx, scaleDesc, - (BiasDesc == nullptr) ? scaleDesc : BiasDesc, - (savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc, - (savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc, + BiasDesc, + savedMeanDesc, + savedVarianceDesc, bnScale, resultBnScaleDiff, resultBnBiasDiff, @@ -389,7 +389,7 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, savedMean, savedInvVariance); miopen::debug::LogCmdBNorm(xDesc, - (savedMeanDesc == nullptr) ? 
scaleDesc : savedMeanDesc, + savedMeanDesc, bn_mode, nullptr, nullptr, @@ -417,9 +417,9 @@ miopenBatchNormalizationBackward_V2(miopenHandle_t handle, : miopen::deref(dxDesc), DataCast(dx), miopen::deref(scaleDesc), - miopen::deref((BiasDesc == nullptr) ? scaleDesc : BiasDesc), - miopen::deref((savedMeanDesc == nullptr) ? scaleDesc : savedMeanDesc), - miopen::deref((savedVarianceDesc == nullptr) ? scaleDesc : savedVarianceDesc), + miopen::deref(BiasDesc), + miopen::deref(savedMeanDesc), + miopen::deref(savedVarianceDesc), DataCast(bnScale), DataCast(resultBnScaleDiff), DataCast(resultBnBiasDiff), From 379a72020907d09bf8e3bc9a190192f0ea49da11 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Wed, 16 Oct 2024 20:22:25 +0000 Subject: [PATCH 24/27] driver to use V2 bath norm API --- driver/bn_driver.hpp | 329 +++++++++++++++++++++------------------ src/driver_arguments.cpp | 2 +- 2 files changed, 179 insertions(+), 152 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index da1be3a066..238b4ea1e6 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -602,37 +602,43 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a if(keepRunningMeanVar) { // use precalculated mean and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - estMean.GetDevicePtr(), - estVariance.GetDevicePtr(), - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + estMean.GetDevicePtr(), + estVariance.GetDevicePtr(), + epsilon); } else { // recalculate mean 
and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - nullptr, - nullptr, - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + nullptr, + nullptr, + epsilon); } return; @@ -646,103 +652,118 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, { if(saveMeanVar && keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - 
savedMean.GetDevicePtr(), - savedVariance.GetDevicePtr()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - runMean.GetDevicePtr(), - runVariance.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + 
eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - &in.GetTensor().desc, - in.GetDevicePtr(), - &out.GetTensor().desc, - out.GetDevicePtr(), - &scale.GetTensor().desc, - scale.GetDevicePtr(), - bias.GetDevicePtr(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } @@ -1015,47 +1036,53 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - &in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out_bwd.GetTensor().desc, - out_bwd.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - savedMean.GetDevicePtr(), - savedInvVar.GetDevicePtr()); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr()); } else { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - 
&alphaParamDiff, - &betaParamDiff, - &in.GetTensor().desc, - in.GetDevicePtr(), - &dy.GetTensor().desc, - dy.GetDevicePtr(), - &out_bwd.GetTensor().desc, - out_bwd.GetDevicePtr(), - &bnScale.GetTensor().desc, - bnScale.GetDevicePtr(), - dScale.GetDevicePtr(), - dBias.GetDevicePtr(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index 971977afa9..e75ec31902 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -76,7 +76,7 @@ void BnDataType(std::stringstream& ss, } else if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat) { - ss << "bnormbfp16fp32"; + ss << "bnormfp16fp32"; } else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenFloat) { From c746d3dfacf7a1f7424dfee0f54b0631a5ef7994 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 17 Oct 2024 00:14:09 +0000 Subject: [PATCH 25/27] Update fin to develop branch --- fin | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fin b/fin index 8c40a3c3b4..344cf42f6c 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 8c40a3c3b41a7d2fb31a8e747155fde4223919b9 +Subproject commit 344cf42f6c18f309f3d1dd08af1cd7b73dd38e46 From d2b851fa7e3af20738e35a9919b040abe38d7284 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 17 Oct 2024 01:51:33 +0000 Subject: [PATCH 26/27] fix hip tidy --- test/gtest/bn.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index fdff351f79..e1f192c37d 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -59,7 +59,7 @@ struct BNInferTest test_skipped = true; GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; } - miopenStatus_t res; + miopenStatus_t res = miopenStatusUnknownError; if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationForwardInference(&handle, @@ -155,7 +155,7 @@ struct BNBwdTest test_skipped = true; GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; } - miopenStatus_t res; + miopenStatus_t res = miopenStatusUnknownError; if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationBackward(&handle, @@ -278,7 +278,7 @@ struct BNFwdTrainTest test_skipped = true; GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; } - miopenStatus_t res; + miopenStatus_t res = miopenStatusUnknownError; if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationForwardTraining( From cf62a8f194274657c663a9fadf6a548a24654e91 Mon Sep 17 00:00:00 2001 From: Bibek Ghimire Date: Thu, 17 Oct 2024 03:48:05 +0000 Subject: [PATCH 27/27] fix CI --- src/batch_norm_api.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 56d6fbb5e8..d3b824cee0 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -100,9 +100,9 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, yDesc, y, bnScaleBiasMeanVarDesc, - nullptr, - nullptr, - nullptr, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean,