diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp
index 4b94ac42d8..238b4ea1e6 100644
--- a/driver/bn_driver.hpp
+++ b/driver/bn_driver.hpp
@@ -33,12 +33,16 @@
 #include "tensor_driver.hpp"
 #include "timer.hpp"
 #include "util_driver.hpp"
+#include "rocrand_wrapper.hpp"

 #include "../test/verify.hpp"
+#include "../test/random.hpp"
+#include "../test/fusionHost.hpp"

 #include
 #include
 #include
+#include "miopen/batch_norm.hpp"
 #include
 #include

@@ -66,16 +70,7 @@ template
 class BatchNormDriver : public Driver
 {
public:
-    BatchNormDriver() : Driver()
-    {
-        miopenCreateTensorDescriptor(&inputTensor);
-        miopenCreateTensorDescriptor(&outputTensor);
-        miopenCreateTensorDescriptor(&biasScaleTensor);
-        miopenCreateTensorDescriptor(&dxOutputTensor);
-        miopenCreateTensorDescriptor(&dyInputTensor);
-
-        data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf;
-    }
+    BatchNormDriver() : Driver() { data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; }

     int AddCmdLineArgs() override;
     int ParseCmdLineArgs(int argc, char* argv[]) override;
@@ -99,25 +94,25 @@ class BatchNormDriver : public Driver
     void runGPUFwdTrain(Tref epsilon, Tref eAF, float alpha, float beta);
     void runGPUBwd(Tref epsilon, float alpha, float beta);

-    void runCPUFwdInference(
-        Tref epsilon, int batch_sz, int channels, int height, int width, int depth = 0);
-    void runCPUFwdTrain(
-        Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth = 0);
+    void runCPUFwdInference(Tref epsilon);
+    void runCPUFwdTrain(Tref epsilon, Tref eAF);

     int VerifyBackward() override;
     int VerifyForward() override;

-    ~BatchNormDriver() override
-    {
-        miopenDestroyTensorDescriptor(outputTensor);
-        miopenDestroyTensorDescriptor(inputTensor);
-        miopenDestroyTensorDescriptor(biasScaleTensor);
-        miopenDestroyTensorDescriptor(dxOutputTensor);
-        miopenDestroyTensorDescriptor(dyInputTensor);
-    }
+    // Helper function to check the layout type short name.
+    bool ChkLayout_ShortName();
+    // Validates the layout input parameter: the layout value must be one of the
+    // standard (NCHW/NHWC/NCDHW/NDHWC) values defined in the MIOpen lib.
+ void ValidateLayoutInputParameters(std::string layout_type); + + ~BatchNormDriver() override {} private: miopenBatchNormMode_t bn_mode; + miopenActivationMode_t activ_mode = miopenActivationRELU; + bool saveMeanVar; bool bsaveMeanVar; bool keepRunningMeanVar; @@ -126,63 +121,53 @@ class BatchNormDriver : public Driver int forw; int back; + bool isFwdInfer = false; + bool isFwdTrain = false; + bool isBwd = false; + InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; - miopenTensorDescriptor_t biasScaleTensor; - miopenTensorDescriptor_t outputTensor; - - // Backwards - miopenTensorDescriptor_t dyInputTensor; - miopenTensorDescriptor_t dxOutputTensor; - - std::unique_ptr dyin_dev; // this is the output of fwd - std::unique_ptr in_dev; - std::unique_ptr out_dev; - std::unique_ptr scale_dev; - std::unique_ptr bias_dev; - - std::unique_ptr dxout_dev; - std::unique_ptr dscale_dev; - std::unique_ptr dbias_dev; - - std::unique_ptr runningMean_dev; - std::unique_ptr runningVariance_dev; - std::unique_ptr saveMean_dev; - std::unique_ptr saveInvVariance_dev; - - std::vector dyin; // output of forward - std::vector in; - std::vector out; - std::vector out_host; - std::vector dxout; - std::vector dxout_host; - - std::vector scale; - std::vector scale_host; - std::vector bias; - std::vector bias_host; - - std::vector dscale; - std::vector dscale_host; - std::vector dbias; - std::vector dbias_host; - - std::vector runningMean; - std::vector runningVariance; - std::vector runningMean_host; - std::vector runningVariance_host; - - std::vector saveMean; - std::vector saveInvVariance; - - std::vector saveMean_host; - std::vector saveInvVariance_host; - - int createSaveBuffers(); - int createRunningBuffers(); + GpumemTensor in; + GpumemTensor out; + tensor out_ref; + + // forward + GpumemTensor scale; + GpumemTensor bias; + + // forward inference + GpumemTensor estMean; + GpumemTensor estVariance; + + GpumemTensor savedMean; + tensor savedMean_ref; + + // forward training + GpumemTensor savedVariance; + GpumemTensor runMean; + GpumemTensor runVariance; + // ref + tensor savedVariance_ref; + tensor runMean_ref; + tensor runVariance_ref; + + // backward needed different type for bwd. + GpumemTensor out_bwd; + + GpumemTensor bnScale; + GpumemTensor dScale; + GpumemTensor dBias; + // savedMean declared above as Tmix as well + GpumemTensor savedInvVar; + GpumemTensor dy; + + tensor dBias_ref; + tensor dScale_ref; + Tref maxval; + + miopenTensorLayout_t bn_layout; }; template @@ -202,46 +187,84 @@ template int BatchNormDriver::GetandSetData() { + std::vector in_len = GetInputTensorLengthsFromCmdLine(); SetBNParametersFromCmdLineArgs(); - std::vector in_len = GetInputTensorLengthsFromCmdLine(); + auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; - std::vector sb_len; - if(bn_mode == miopenBNPerActivation) + in.AllocOnHost(tensor{bn_layout, in_len}); + in.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); + + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); + + if(isFwdInfer || isFwdTrain) { - // 1xCxHxW | in_len.size = 4 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(in_len[2]); - sb_len.push_back(in_len[3]); + out.AllocOnHost(tensor{bn_layout, in_len}); + scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - // 1xCxDxHxW | in_len.size = 5 - if(in_len.size() == 5) - { - sb_len.push_back(in_len[4]); - } + auto gen_value_scale_bias = [](auto...) { + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + + scale.InitHostData(scale.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); + bias.InitHostData(bias.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); } - else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(1); - sb_len.push_back(1); + if(isFwdInfer) + { + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - // 1xCx1x1x1 - if(in_len.size() == 5) - { - sb_len.push_back(1); - } + auto gen_value_emean = [](auto...) { + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); + } + else if(isFwdTrain) + { + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_var = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + runMean.InitHostData(runMean.GetTensor().desc.GetElementSize(), true, gen_var); + runVariance.InitHostData(runVariance.GetTensor().desc.GetElementSize(), true, gen_var); } + else if(isBwd) + { + out_bwd.AllocOnHost(tensor{bn_layout, in_len}); + + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dy.AllocOnHost(tensor{bn_layout, in_len}); + + auto gen_var_bwd = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); - SetTensorNd(inputTensor, in_len, data_type); - SetTensorNd(biasScaleTensor, sb_len, ((sizeof(Tmix) == 4) ? miopenFloat : miopenHalf)); - SetTensorNd(outputTensor, in_len, data_type); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - // backwards - SetTensorNd(dyInputTensor, in_len, data_type); - SetTensorNd(dxOutputTensor, in_len, data_type); + bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); + + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + + auto gen_in_var = [](auto...) 
{
+            return static_cast<Tmix>(1e-2 * (prng::gen_0_to_B(100) + 1));
+        };
+        savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var);
+    }
+    else
+    {
+        std::cout << "\nUnknown batch norm state!\n";
+        exit(EXIT_FAILURE);
+    }

     return miopenStatusSuccess;
 }

@@ -265,6 +288,10 @@ int BatchNormDriver::AddCmdLineArgs()
     inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int");
     inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int");
     inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int");
+
+    inflags.AddInputFlag(
+        "layout", 'L', "", "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", "string", true);
+
     inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float");
     inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float");
     inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int");
@@ -317,6 +344,39 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd
     }
 }

+template <typename Tgpu, typename Tref, typename Tmix>
+bool BatchNormDriver<Tgpu, Tref, Tmix>::ChkLayout_ShortName()
+{
+    // check for the short name of the layout type
+    if(inflags.FindShortName("layout") == 'L')
+    {
+        // do nothing
+        // found a valid short name
+        return true;
+    }
+    else
+    {
+        std::cerr << "Error: Invalid Short Name for layout!" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
+template <typename Tgpu, typename Tref, typename Tmix>
+void BatchNormDriver<Tgpu, Tref, Tmix>::ValidateLayoutInputParameters(std::string layout_value)
+{
+    if(!ChkLayout_ShortName())
+    {
+        std::cerr << "Invalid Layout Short Name = " << inflags.FindShortName("layout") << std::endl;
+        exit(EXIT_FAILURE);
+    }
+    if((layout_value.compare("NCHW") != 0) && (layout_value.compare("NHWC") != 0) &&
+       (layout_value.compare("NCDHW") != 0) && (layout_value.compare("NDHWC") != 0))
+    {
+        std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl;
+        exit(EXIT_FAILURE);
+    }
+}
+
 template <typename Tgpu, typename Tref, typename Tmix>
 int BatchNormDriver<Tgpu, Tref, Tmix>::SetBNParametersFromCmdLineArgs()
 {
@@ -324,6 +384,45 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs()
 {
     // double bnAlpha = inflags.GetValueDouble("alpha");
     // double bnBeta = inflags.GetValueDouble("beta");

+    const std::string default_layout = isDepthSpecified ? "NCDHW" : "NCHW";
+
+    // If the inflags value is empty, the default value is used;
+    // if it is supplied via the command line, check the value.
+ if(inflags.GetValueStr("layout").empty()) + { + inflags.SetValue("layout", default_layout); + } + else + { + std::string layoutValue = inflags.GetValueStr("layout"); + ValidateLayoutInputParameters(layoutValue); + inflags.SetValue("layout", layoutValue); + } + + std::string layout = inflags.GetValueStr("layout"); + + if(layout == "NCHW") + { + bn_layout = miopenTensorNCHW; + } + else if(layout == "NHWC") + { + bn_layout = miopenTensorNHWC; + } + else if(layout == "NCDHW") + { + bn_layout = miopenTensorNCDHW; + } + else if(layout == "NDHWC") + { + bn_layout = miopenTensorNDHWC; + } + else + { + std::cout << "Cannot handle layout : " << layout << "\n"; + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) + } + // batch norm mode type if(inflags.GetValueInt("mode") == 0) { @@ -395,242 +494,100 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() forw = 1; } - return miopenStatusSuccess; -} - -template -int BatchNormDriver::createSaveBuffers() -{ - - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(saveMeanVar) + if(forw == 1) { - // GPU allocation - saveMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - saveInvVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(back == 1) - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - saveMean[i] = prng::gen_canonical(); - saveMean_host[i] = static_cast(saveMean[i]); - saveInvVariance[i] = prng::gen_canonical(); - saveInvVariance_host[i] = static_cast(saveInvVariance[i]); - } - } - else - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - } - // GPU data transfer - status |= saveMean_dev->ToGPU(q, saveMean.data()); - status |= saveInvVariance_dev->ToGPU(q, saveInvVariance.data()); + isFwdTrain = true; + } + else if(forw == 2) + { + isFwdInfer = true; } else { - saveMean_dev = nullptr; - saveInvVariance_dev = nullptr; + isBwd = true; } - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); - return miopenStatusSuccess; } template -int BatchNormDriver::createRunningBuffers() +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); #if MIOPEN_BACKEND_OPENCL clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #endif - size_t sb_sz = GetTensorSize(biasScaleTensor); + status |= in.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&in.GetTensor().desc)); - if(keepRunningMeanVar) + if(isFwdInfer || isFwdTrain) { - // GPU allocation - runningMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - runningVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(forw == 2) - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, 
static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - runningMean[i] = prng::gen_canonical(); - runningMean_host[i] = static_cast(runningMean[i]); - runningVariance[i] = prng::gen_canonical(); - runningVariance_host[i] = static_cast(runningVariance[i]); - } - } - else - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, static_cast(0)); - } - - // GPU data transfer - status |= runningMean_dev->ToGPU(q, runningMean.data()); - status |= runningVariance_dev->ToGPU(q, runningVariance.data()); + status |= out.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out.GetTensor().desc)); + out_ref = + tensor{out.GetTensor().desc.GetLayout_t(), out.GetTensor().desc.GetLengths()}; + status |= scale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&scale.GetTensor().desc)); + status |= bias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bias.GetTensor().desc)); } - else + if(isFwdInfer) { - runningMean_dev = nullptr; - runningVariance_dev = nullptr; + status |= estMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estMean.GetTensor().desc)); + status |= + estVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estVariance.GetTensor().desc)); } - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); - - return miopenStatusSuccess; -} - -template -int BatchNormDriver::AllocateBuffersAndCopy() -{ - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t in_sz = GetTensorSize(inputTensor); - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(forw) + if(isFwdTrain) { + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); + status |= savedVariance.AllocOnDeviceAndInit( + q, ctx, GetTensorSize(&savedVariance.GetTensor().desc)); + status |= runMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runMean.GetTensor().desc)); + status |= + runVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runVariance.GetTensor().desc)); - size_t out_sz = GetTensorSize(outputTensor); - - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - bias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - out = std::vector(out_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); - bias = std::vector(sb_sz, static_cast(0)); + savedMean_ref = tensor{savedMean.GetTensor().desc.GetLayout_t(), + savedMean.GetTensor().desc.GetLengths()}; - // CPU allocation - out_host = std::vector(out_sz, static_cast(0)); - scale_host = std::vector(sb_sz, static_cast(0)); - bias_host = std::vector(sb_sz, static_cast(0)); + savedVariance_ref = tensor{savedVariance.GetTensor().desc.GetLayout_t(), + savedVariance.GetTensor().desc.GetLengths()}; - // Data initialization - for(int i = 0; i < in_sz; i++) - { - in[i] = prng::gen_canonical(); - } - status |= in_dev->ToGPU(q, in.data()); + runMean_ref = tensor{runMean.GetTensor().desc.GetLayout_t(), + runMean.GetTensor().desc.GetLengths()}; - // Using random beta and gamma - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - scale_host[i] = 
static_cast(scale[i]); - bias[i] = prng::gen_canonical(); - bias_host[i] = static_cast(bias[i]); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= bias_dev->ToGPU(q, bias.data()); - status |= out_dev->ToGPU(q, out.data()); - - if(forw == 1) - { // training - status |= createRunningBuffers(); - status |= createSaveBuffers(); - } - else if(forw == 2) - { // inference - status |= createRunningBuffers(); - } - } // end forward - - if(back == 1) + runVariance_ref = tensor{runVariance.GetTensor().desc.GetLayout_t(), + runVariance.GetTensor().desc.GetLengths()}; + } + if(isBwd) { + status |= out_bwd.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out_bwd.GetTensor().desc)); - size_t out_sz = GetTensorSize(dxOutputTensor); + out_ref = tensor{out_bwd.GetTensor().desc.GetLayout_t(), + out_bwd.GetTensor().desc.GetLengths()}; - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dyin_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dxout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - dscale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - dbias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); + status |= bnScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bnScale.GetTensor().desc)); + status |= dy.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dy.GetTensor().desc)); - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - dyin = std::vector(in_sz, static_cast(0)); - dxout = std::vector(out_sz, static_cast(0)); - dscale = std::vector(sb_sz, static_cast(0)); - dbias = std::vector(sb_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); + status |= dScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dScale.GetTensor().desc)); + status |= dBias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dBias.GetTensor().desc)); + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); + status |= + savedInvVar.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedInvVar.GetTensor().desc)); - // CPU allocation - dxout_host = std::vector(out_sz, static_cast(0)); - dscale_host = std::vector(sb_sz, static_cast(0)); - dbias_host = std::vector(sb_sz, static_cast(0)); + dScale_ref = tensor{dScale.GetTensor().desc.GetLayout_t(), + dScale.GetTensor().desc.GetLengths()}; - // Populate - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= dscale_dev->ToGPU(q, dscale.data()); - status |= dbias_dev->ToGPU(q, dbias.data()); + dBias_ref = + tensor{dBias.GetTensor().desc.GetLayout_t(), dBias.GetTensor().desc.GetLengths()}; + } - for(int i = 0; i < in_sz; i++) - { - dyin[i] = prng::gen_canonical(); - in[i] = prng::gen_canonical(); - } - status |= dyin_dev->ToGPU(q, dyin.data()); - status |= in_dev->ToGPU(q, in.data()); - status |= dxout_dev->ToGPU(q, dxout.data()); + for(size_t i = 0; i < runMean.GetVector().size(); ++i) + { + runMean_ref.data[i] = static_cast(runMean.GetVector()[i]); + } - status |= createSaveBuffers(); + for(size_t i = 0; i < runVariance.GetVector().size(); ++i) + { + runVariance_ref.data[i] = static_cast(runVariance.GetVector()[i]); } if(status != STATUS_SUCCESS) @@ -645,37 +602,43 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a if(keepRunningMeanVar) { // use precalculated mean and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - 
in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + estMean.GetDevicePtr(), + estVariance.GetDevicePtr(), + epsilon); } else { // recalculate mean and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - nullptr, - nullptr, - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + nullptr, + nullptr, + epsilon); } return; @@ -689,103 +652,118 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, { if(saveMeanVar && keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, 
+ out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } @@ -866,9 +844,9 @@ int BatchNormDriver::RunForwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); float rdCnt = -1.0; float wrCnt = 1.0; if(forw == 1) @@ -890,41 +868,44 @@ int BatchNormDriver::RunForwardGPU() } template -void BatchNormDriver::runCPUFwdInference( - Tref epsilon, int batch_sz, int channels, int height, int width, int depth) +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + estMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estMean.GetTensor().desc); + estVariance.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estVariance.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdInferPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? 
depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + // handle 3d case + batchNormPerActivHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdInferSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + + batchNormSpatialHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else { @@ -936,59 +917,65 @@ void BatchNormDriver::runCPUFwdInference( } template -void BatchNormDriver::runCPUFwdTrain( - Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth) +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { - + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + savedMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedMean_ref.desc); + savedVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedVariance_ref.desc); + runMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runMean_ref.desc); + runVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runVariance_ref.desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdTrainPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + + batchNormPerActHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdTrainSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? 
depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + + if(forw == 2 && !keepRunningMeanVar) + { + tensor empty_tensor; + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + empty_tensor, // savedMean_ref + empty_tensor, // savedVariance_ref + empty_tensor, // runMean_ref + empty_tensor); // runVariance_ref + } + else + { + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); + } } else { @@ -1001,35 +988,27 @@ void BatchNormDriver::runCPUFwdTrain( template int BatchNormDriver::RunForwardCPU() { - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); Tref eAF = static_cast(1.0); - if(forw == 1) + if(forw == 1 || (forw == 2 && !keepRunningMeanVar)) { // training only for(int i = 0; i < inflags.GetValueInt("iter"); i++) { eAF = static_cast(1.0) / (static_cast(i) + static_cast(1.0)); - runCPUFwdTrain( - epsilon, eAF, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + runCPUFwdTrain(epsilon, eAF /* alpha, beta,*/); } } - else if(forw == 2) - { // inference only - runCPUFwdInference(epsilon, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + else if(forw == 2 && keepRunningMeanVar) + { + // inference only + runCPUFwdInference(epsilon); + } + else + { + printf("Unsupported forward cpu run state.\nExiting...\n\n"); + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) } return miopenStatusSuccess; @@ -1038,7 +1017,6 @@ int BatchNormDriver::RunForwardCPU() template int BatchNormDriver::RunBackwardGPU() { - if(!back) return miopenStatusSuccess; @@ -1058,47 +1036,53 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr()); } else { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - 
dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); @@ -1120,13 +1104,11 @@ int BatchNormDriver::RunBackwardGPU() avgtime += time; int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = - miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); - size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = - (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); - float rdCnt = 2.0; - float wrCnt = 1.0; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); + size_t M = in_n * in_c * in_h * in_w; + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); + float rdCnt = 2.0; + float wrCnt = 1.0; // layer, flopCnt, reads, writes, GFLOPS, GB/s, timeMs printf("stats: bnormb, 0, %zu, %zu, 0, %f, %f\n", dataSz, @@ -1176,27 +1158,28 @@ int BatchNormDriver::VerifyForward() if(keepRunningMeanVar) { // copy back for verification - runningMean_dev->FromGPU(GetStream(), runningMean.data()); - runningVariance_dev->FromGPU(GetStream(), runningVariance.data()); + runMean.CopyFromDeviceToHost(GetStream()); + runVariance.CopyFromDeviceToHost(GetStream()); + + auto errorRunMean = miopen::rms_range(runMean_ref.data, runMean.GetVector()); - auto errorRunMean = miopen::rms_range(runningMean_host, runningMean); if(!std::isfinite(errorRunMean) || errorRunMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on running mean: " << errorRunMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runningMean.size() && i < runningMean_host.size() && + for(int i = 0; i < runMean.GetVector().size() && i < runMean_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i]))); + diff = fabs(Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rm[" << i << "]: " << runningMean[i]; - std::cout << ", rm_host[" << i << "]: " << runningMean_host[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i])) + std::cout << "rm[" << i << "]: " << runMean.GetVector()[i]; + std::cout << ", rm_host[" << i << "]: " << runMean_ref.data[i]; + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) << std::endl; } } @@ -1208,24 +1191,26 @@ int BatchNormDriver::VerifyForward() << errorRunMean << ')' << std::endl; } - auto errorRunVar = miopen::rms_range(runningVariance_host, runningVariance); + auto errorRunVar = miopen::rms_range(runVariance_ref.data, runVariance.GetVector()); if(!std::isfinite(errorRunVar) || errorRunVar > maxrms) { std::cout << "Forward train batch norm verification FAILED on running variance: " << errorRunVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < 
runningVariance.size() && i < runningVariance_host.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < runVariance.GetVector().size() && + i < runVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i]))); + diff = fabs( + Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rv[" << i << "]: " << runningVariance[i]; - std::cout << ", rv_host[" << i << "]: " << runningVariance_host[i]; + std::cout << "rv[" << i << "]: " << runVariance.GetVector()[i]; + std::cout << ", rv_host[" << i << "]: " << runVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i])) + << Tmix(fabs(runVariance.GetVector()[i]) - + fabs(runVariance_ref.data[i])) << std::endl; } } @@ -1240,28 +1225,29 @@ int BatchNormDriver::VerifyForward() if(saveMeanVar) { // copy back for verification - saveMean_dev->FromGPU(GetStream(), saveMean.data()); - saveInvVariance_dev->FromGPU(GetStream(), saveInvVariance.data()); + savedMean.CopyFromDeviceToHost(GetStream()); + savedVariance.CopyFromDeviceToHost(GetStream()); maxval = static_cast(0.0); - auto errorSaveMean = miopen::rms_range(saveMean_host, saveMean); + auto errorSaveMean = miopen::rms_range(savedMean_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on saved mean: " << errorSaveMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; - i < saveMean.size() && i < saveMean_host.size() && i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedMean.GetVector().size() && i < savedMean_ref.data.size() && + i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i]))); + diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedMean_ref.data[i]))); maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sm[" << i << "]: " << saveMean[i]; - std::cout << ", sm_host[" << i << "]: " << saveMean_host[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i])) + std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; + std::cout << ", sm_host[" << i << "]: " << savedMean_ref.data[i]; + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(savedMean.GetVector()[i]) - + fabs(savedMean_ref.data[i])) << std::endl; } } @@ -1274,7 +1260,8 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << ')' << std::endl; } - auto errorSaveVar = miopen::rms_range(saveInvVariance_host, saveInvVariance); + auto errorSaveVar = + miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); if(!std::isfinite(errorSaveVar) || errorSaveVar > maxrms) { std::cout @@ -1282,17 +1269,19 @@ int BatchNormDriver::VerifyForward() << errorSaveVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < saveInvVariance.size() && i < saveInvVariance_host.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedVariance.GetVector().size() && + i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i]))); + diff = fabs( + Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sv[" << i << "]: " << saveInvVariance[i]; - std::cout << ", sv_host[" << i << "]: " << saveInvVariance_host[i]; + std::cout << "sv[" << i << "]: " << savedVariance.GetVector()[i]; + std::cout << ", sv_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i])) + << Tmix(fabs(savedVariance.GetVector()[i]) - + fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1307,38 +1296,40 @@ int BatchNormDriver::VerifyForward() } // end if(saveMeanVar) } - // Check output tensor error - out_dev->FromGPU(GetStream(), out.data()); + out.CopyFromDeviceToHost(GetStream()); + maxval = static_cast(0.0); - auto errorOut = miopen::rms_range(out_host, out); + auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) unsigned int count = 0; - for(int i = 0; i < out.size() && i < out_host.size(); i++) + for(int i = 0; i < out.GetVector().size() && i < out_ref.data.size(); i++) { - if(std::isnan(out[i])) + if(std::isnan(out.GetVector()[i])) { - std::cout << "out[" << i << "] produced a nan: " << out[i] << std::endl; + std::cout << "out[" << i << "] produced a nan: " << out.GetVector()[i] << std::endl; } - if(std::isnan(out_host[i])) + if(std::isnan(out_ref.data[i])) { - std::cout << "out_host[" << i << "] produced a nan: " << out_host[i] << std::endl; + std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] + << std::endl; } - diff = Tref(fabs(out[i]) - fabs(out_host[i])); + diff = Tref(fabs(out.GetVector()[i]) - fabs(out_ref.data[i])); maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "out[" << i << "]: " << out[i]; - std::cout << ", out_host[" << i << "]: " << out_host[i]; - std::cout << ", diff[" << i << "]: " << Tref(out[i] - out_host[i]) << std::endl; + std::cout << "out[" << i << "]: " << out.GetVector()[i]; + std::cout << ", out_ref.data[" << i << "]: " << out_ref.data[i]; + std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) + << std::endl; count++; } } - std::cout << "Number of elements: " << out.size() << std::endl; + std::cout << "Number of elements: " << out.GetVector().size() << std::endl; std::cout << "Number of bad elements: " << count << std::endl; std::cout << "max difference in output: " << maxval << std::endl; #endif @@ -1365,61 +1356,77 @@ int BatchNormDriver::RunBackwardCPU() if(!back) return miopenStatusSuccess; - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alphaDiff = 1, betaDiff = 0; // T alphaParam = 1, betaParam = 0; - Tref epsilon = static_cast(EPSILON); + double alpha = static_cast(1), beta = static_cast(0), + gamma = static_cast(1); + + // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); + // float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + dy.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dy.GetTensor().desc); + out_bwd.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(out_bwd.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + bnScale.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(bnScale.GetTensor().desc); + dBias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dBias.GetTensor().desc); + dScale_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dScale_ref.desc); + dBias_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dBias_ref.desc); + savedMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedMean.GetTensor().desc); + savedInvVar.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedInvVar.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) - { // 1xCxHxW - miopenBNBwdPerActivationRunHost(/* alphaDiff, betaDiff, alphaParam, - betaParam, */ - batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + { + // 1xCxHxW + batchNormActivSpatialHostBwdTrain(activ_mode, + gamma, + beta, + alpha, + in.GetTensor(), + dy.GetTensor(), + out.GetTensor(), + out_ref, + bnScale.GetTensor(), + dBias.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - miopenBNBwdSpatialRunHost(/* alphaDiff, betaDiff, alphaParam, betaParam, - */ - batch_sz, - channels, - (isDepthSpecified ? 
depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + { // 1xCx1x1 + if(saveMeanVar) + { + + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); + } + else + { + tensor empty_tensor; + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + empty_tensor, + empty_tensor); + } } else { @@ -1443,33 +1450,36 @@ int BatchNormDriver::VerifyBackward() RunBackwardCPU(); - dxout_dev->FromGPU(GetStream(), dxout.data()); - dscale_dev->FromGPU(GetStream(), dscale.data()); - dbias_dev->FromGPU(GetStream(), dbias.data()); + out_bwd.CopyFromDeviceToHost(GetStream()); + dScale.CopyFromDeviceToHost(GetStream()); + dBias.CopyFromDeviceToHost(GetStream()); + #if(MIO_BN_DEBUG == 1) const Tref tolerance = static_cast(1000 * (sizeof(Tgpu) == 4) ? ERRTOL_FP32 : ERRTOL_FP16); Tref diff = static_cast(0.0); #endif maxval = static_cast(0.0); - auto errordxout = miopen::rms_range(dxout_host, dxout); + auto errordxout = miopen::rms_range(out_ref.data, out_bwd.GetVector()); if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dxout.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tgpu(fabs(dxout[i]) - fabs(dxout_host[i]))); + diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dxout[" << i << "]: " << dxout[i]; - std::cout << "\tdxout_host[" << i << "]: " << dxout_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(dxout[i]) - fabs(dxout_host[i])); + std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; + std::cout << "\tout_bwd.GetVector()[" << i << "]: " << out_bwd.GetVector()[i]; + std::cout << "\tdiff[" << i + << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(dxout[i]) - fabs(dxout_host[i])) / fabs(dxout_host[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])) / + fabs(out_bwd.GetVector()[i]) << std::endl; } } @@ -1483,25 +1493,26 @@ int BatchNormDriver::VerifyBackward() } maxval = static_cast(0.0); - auto errordscale = miopen::rms_range(dscale_host, dscale); + auto errordscale = miopen::rms_range(dScale_ref.data, dScale.GetVector()); if(!std::isfinite(errordscale) || errordscale > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dscale: " << errordscale << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dscale.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dScale.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dscale[i]) - fabs(dscale_host[i]))); - maxval = maxval < diff ? diff : maxval; + auto diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); + maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dscale[" << i << "]: " << dscale[i]; - std::cout << "\tdscale_host[" << i << "]: " << dscale_host[i]; + std::cout << "dscale[" << i << "]: " << dScale.GetVector()[i]; + std::cout << "\tdscale_host[" << i << "]: " << dScale_ref.data[i]; std::cout << "\tdiff[" << i - << "]: " << Tmix(fabs(dscale[i]) - fabs(dscale_host[i])); + << "]: " << Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dscale[i]) - fabs(dscale_host[i])) / fabs(dscale_host[i]) + << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / + fabs(dScale_ref.data[i]) << std::endl; } } @@ -1514,23 +1525,25 @@ int BatchNormDriver::VerifyBackward() << ')' << std::endl; } - auto errordbias = miopen::rms_range(dbias_host, dbias); + auto errordbias = miopen::rms_range(dBias_ref.data, dBias.GetVector()); if(!std::isfinite(errordbias) || errordbias > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dbias: " << errordbias << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dbias.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dbias[i]) - fabs(dbias_host[i]))); + diff = fabs(Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dbias[" << i << "]: " << dbias[i]; - std::cout << "\tdbias_host[" << i << "]: " << dbias_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tmix(fabs(dbias[i]) - fabs(dbias_host[i])); + std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; + std::cout << "\tdbias_host[" << i << "]: " << dBias_ref.data[i]; + std::cout << "\tdiff[" << i + << "]: " << Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dbias[i]) - fabs(dbias_host[i])) / fabs(dbias_host[i]) + << fabs(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / + fabs(dBias_ref.data[i]) << std::endl; } } diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 900f52f683..9d35f9a129 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -180,135 +180,6 @@ static inline miopenDataType_t DataTypeFromShortString(const std::string& type) } } -template -class GpumemTensor -{ - std::unique_ptr dev; - tensor host; - bool is_gpualloc = false; - -public: - void SetGpuallocMode(bool v) { is_gpualloc = v; } - tensor& GetTensor() { return host; } - - void AllocOnHost(miopenTensorDescriptor_t t) - { - host = tensor(miopen::deref(t)); - if(is_gpualloc) // We do not need host data. - { - host.data.clear(); - host.data.shrink_to_fit(); // To free host memory. - } - } - - std::vector& GetVector() - { - if(is_gpualloc) - MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " - "'--gpualloc 1' mode"); - return host.data; - } - - Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } - std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } - - void - InitHostData(const size_t sz, // - const bool do_write, // If set to false, then only generate random data. This is - // necessary to reproduce values in input buffers even if some - // directions are skipped. For example, inputs for Backward - // will be the same for both "-F 0" and "-F 2". 
- std::function generator) - { - if(is_gpualloc) - { - /// In gpualloc mode, we do not care about reproducibility of results, because - /// validation is not used. Therefore, we do not have to always generate random value - /// (\ref move_rand) - return; - } - - for(size_t i = 0; i < sz; ++i) - { - /// \anchor move_rand - /// Generate random value, even if buffer is unused. This provides the same - /// initialization of input buffers regardless of which kinds of - /// convolutions are currently selectedfor testing (see the "-F" option). - /// Verification cache would be broken otherwise. - auto val = generator(); - if(do_write) - GetVector()[i] = val; - } - } - - status_t AllocOnDevice(stream, context_t ctx, const size_t sz) - { - dev = std::make_unique(ctx, sz, sizeof(Tgpu)); - return STATUS_SUCCESS; - } - - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) - { - AllocOnDevice(q, ctx, sz); - if(is_gpualloc) - { - /// \anchor gpualloc_random_init - /// In gpualloc mode, we do not want to leave input buffers uninitialized, because - /// there could be NaNs and Infs, which may affect the performance (which we are - /// interested to evaluate in this mode). Initialization with all 0's is not the - /// best choice as well, because GPU HW may optimize out computations with 0's and - /// that could affect performance of kernels too. That is why we are using - /// rocrand to initialize input buffers. - /// - /// However we do not care about precision in gpualloc mode, because validation - /// is not used. Therefore, range (0,1] is fine. - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, GetVectorData()); - } - - template - status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) - { - static_assert(std::is_same::value // - || std::is_same::value, // - "Before enabling more types, check thoroughly."); - dev = std::make_unique(ctx, sz, sizeof(T)); - return STATUS_SUCCESS; - } - - template - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) - { - AllocOnDevice(q, ctx, sz, init); - if(is_gpualloc) - { - /// \ref gpualloc_random_init - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, init.data()); - } - - status_t CopyFromDeviceToHost(stream q) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); - } - - template - status_t CopyFromDeviceToHost(stream q, tensor& t) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); - } - - template - status_t CopyFromDeviceToHost(stream q, std::vector& v) - { - return is_gpualloc ? 
STATUS_SUCCESS : dev->FromGPU(q, v.data()); - } - - auto GetDevicePtr() -> auto { return dev->GetMem(); } -}; - template class GpumemVector { diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp index c7bab90bb5..24e986fa1d 100644 --- a/driver/dm_bnorm.cpp +++ b/driver/dm_bnorm.cpp @@ -31,7 +31,11 @@ static Driver* makeDriver(const std::string& base_arg) if(base_arg == "bnorm") return new BatchNormDriver(); if(base_arg == "bnormfp16") + return new BatchNormDriver(); + if(base_arg == "bnormfp16fp32") return new BatchNormDriver(); + if(base_arg == "bnormbfp16fp32") + return new BatchNormDriver(); return nullptr; } diff --git a/driver/driver.hpp b/driver/driver.hpp index 4d33f7ed38..df0a85fece 100644 --- a/driver/driver.hpp +++ b/driver/driver.hpp @@ -38,6 +38,9 @@ #include #include #include +#include <../test/tensor_holder.hpp> +#include "util_driver.hpp" +#include "rocrand_wrapper.hpp" using half = half_float::half; using hip_bfloat16 = bfloat16; #include @@ -157,6 +160,140 @@ struct GPUMem #endif }; +template +class GpumemTensor +{ + std::unique_ptr dev; + tensor host; + bool is_gpualloc = false; + +public: + void SetGpuallocMode(bool v) { is_gpualloc = v; } + tensor& GetTensor() { return host; } + + void AllocOnHost(miopenTensorDescriptor_t t) + { + host = tensor(miopen::deref(t)); + if(is_gpualloc) // We do not need host data. + { + host.data.clear(); + host.data.shrink_to_fit(); // To free host memory. + } + } + template + void AllocOnHost(tensor t) + { + AllocOnHost(&t.desc); + } + + std::vector& GetVector() + { + if(is_gpualloc) + MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " + "'--gpualloc 1' mode"); + return host.data; + } + + Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } + std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } + + void + InitHostData(const size_t sz, // + const bool do_write, // If set to false, then only generate random data. This is + // necessary to reproduce values in input buffers even if some + // directions are skipped. For example, inputs for Backward + // will be the same for both "-F 0" and "-F 2". + std::function generator) + { + if(is_gpualloc) + { + /// In gpualloc mode, we do not care about reproducibility of results, because + /// validation is not used. Therefore, we do not have to always generate random value + /// (\ref move_rand) + return; + } + + for(size_t i = 0; i < sz; ++i) + { + /// \anchor move_rand + /// Generate random value, even if buffer is unused. This provides the same + /// initialization of input buffers regardless of which kinds of + /// convolutions are currently selectedfor testing (see the "-F" option). + /// Verification cache would be broken otherwise. + auto val = generator(); + if(do_write) + GetVector()[i] = val; + } + } + + status_t AllocOnDevice(stream, context_t ctx, const size_t sz) + { + dev = std::make_unique(ctx, sz, sizeof(Tgpu)); + return STATUS_SUCCESS; + } + + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) + { + AllocOnDevice(q, ctx, sz); + if(is_gpualloc) + { + /// \anchor gpualloc_random_init + /// In gpualloc mode, we do not want to leave input buffers uninitialized, because + /// there could be NaNs and Infs, which may affect the performance (which we are + /// interested to evaluate in this mode). Initialization with all 0's is not the + /// best choice as well, because GPU HW may optimize out computations with 0's and + /// that could affect performance of kernels too. 
That is why we are using + /// rocrand to initialize input buffers. + /// + /// However we do not care about precision in gpualloc mode, because validation + /// is not used. Therefore, range (0,1] is fine. + return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz); + } + return dev->ToGPU(q, GetVectorData()); + } + + template <typename T> + status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector<T>&) + { + static_assert(std::is_same<T, float>::value // + || std::is_same<T, int32_t>::value, // + "Before enabling more types, check thoroughly."); + dev = std::make_unique<GPUMem>(ctx, sz, sizeof(T)); + return STATUS_SUCCESS; + } + + template <typename T> + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector<T>& init) + { + AllocOnDevice(q, ctx, sz, init); + if(is_gpualloc) + { + /// \ref gpualloc_random_init + return gpumemrand::gen_0_1(static_cast<T*>(GetDevicePtr()), sz); + } + return dev->ToGPU(q, init.data()); + } + + status_t CopyFromDeviceToHost(stream q) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); + } + + template <typename T> + status_t CopyFromDeviceToHost(stream q, tensor<T>& t) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); + } + + template <typename T> + status_t CopyFromDeviceToHost(stream q, std::vector<T>& v) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data()); + } + + auto GetDevicePtr() -> auto { return dev->GetMem(); } +}; + inline void PadBufferSize(size_t& sz, int datatype_sz) { size_t page_sz = (2 * 1024 * 1024) / datatype_sz; @@ -194,16 +331,16 @@ inline std::string ParseBaseArg(int argc, char* argv[]) if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && - arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && - arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" && - arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" && - arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" && - arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" && - arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && - arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" && - arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" && - arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" && - arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && + arg != "bnormfp16" && arg != "bnormfp16fp32" && arg != "bnormbfp16fp32" && arg != "rnn" && + arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" && + arg != "gemmfp16" && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" && + arg != "tensorop" && arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && + arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && + arg != "sumfp16" && arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && + arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && + arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" && + arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" && + arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && arg != 
"reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && diff --git a/fin b/fin index 8c40a3c3b4..344cf42f6c 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 8c40a3c3b41a7d2fb31a8e747155fde4223919b9 +Subproject commit 344cf42f6c18f309f3d1dd08af1cd7b73dd38e46 diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index c919c95e0a..67652ab832 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2737,6 +2737,67 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, double epsilon, void* resultSaveMean, void* resultSaveInvVariance); +/*! @brief Execute forward training layer for batch normalization + * + * Batch normalization pass for forward training pass. + * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale + * with their descriptor. + * + * If either resultSaveMean, or resultSaveInvVariance are null pointers then the values for the mean + * and inverse variance will not be used. + * + * Likewise, if either resultRunningMean, or resultRunningVariance are null pointers then the values + * for the running mean and variance will not be saved. + * Running averages and variances are scaled using an exponential averaging factor: \f[ + * \mu_{old} = \mu_{new}*factor + \mu_{old}*(1-factor) + * \f] + * where \f[ + * factor=1/(1+iteration) + * \f] + * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alpha Floating point scaling factor, allocated on the host (input) + * @param beta Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param ScaleDesc Tensor descriptor for BN scaling + * @param biasVarDesc Tensor descriptor for BN bias + * @param savedMeanDesc Tensor descriptor for BN saved Mean + * @param savedVarDesc Tensor descriptor for BN saved Variance + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param bnBias Batch norm bias, beta, tensor (input) + * @param expAvgFactor Exponential averaging factor (input) + * @param resultRunningMean Running average saved for inference (output) + * @param resultRunningVariance Running variance saved for inference (output) + * @param epsilon Value to stablize inverse variance calculation (input) + * @param resultSaveMean Saved mini-batch mean for backwards pass (output) + * @param resultSaveInvVariance Saved mini-batch inverse variance for backwards pass (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasVarDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance); /*! 
@brief Execute forward inference layer for batch normalization * @@ -2783,6 +2844,56 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedMean, void* estimatedVariance, double epsilon); +/*! @brief Execute forward inference layer for batch normalization + * + * Batch normalization pass for the forward inference pass. + * Takes in the batch normalization mode bn_mode, the input tensor x, the output tensor y, and the + * bnBias and bnScale tensors with their descriptors. + * + * If either estimatedMean or estimatedVariance is a null pointer, the mean and variance will be + * calculated from the input data, and this calculated mean and variance will be used to + * normalize the input. + * If the variance is zero and epsilon is also zero, this function outputs NaN values. The input + * epsilon value should always be a non-zero, positive value. + * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alpha Floating point scaling factor, allocated on the host (input) + * @param beta Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param scaleDesc Tensor descriptor for BN scaling (input) + * @param biasDesc Tensor descriptor for BN bias (input) + * @param estMeanDesc Tensor descriptor for BN estimated mean (input) + * @param estVarianceDesc Tensor descriptor for BN estimated variance (input) + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param bnBias Batch norm bias, beta, tensor (input) + * @param estimatedMean Running average saved during forward training (input) + * @param estimatedVariance Running variance saved during forward training (input) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon); + /*! @brief Execute backwards propagation layer for batch normalization * * Batch normalization pass for backwards propagation training pass. @@ -2838,6 +2949,68 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, const void* savedMean, const void* savedInvVariance); +/*! @brief Execute backwards propagation layer for batch normalization + * + * Batch normalization pass for the backwards-propagation training pass. + * + * Takes in the batch normalization mode bn_mode, input tensor data x, input activation tensor dy, + * output tensor dx, and the learned tensors resultBNBiasDiff and resultBNScaleDiff with their + * descriptors. + * + * If BOTH savedMean and savedVariance are non-null pointers, then the method will use the saved + * mean and variance calculated by the forward training phase.
+ * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alphaDataDiff Floating point scaling factor, allocated on the host (input) + * @param betaDataDiff Floating point shift factor, allocated on the host (input) + * @param alphaParamDiff Floating point scaling factor, allocated on the host (input) + * @param betaParamDiff Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param dyDesc Tensor descriptor for input data tensor dy (input) + * @param dy Data delta tensor dy (input) + * @param dxDesc Tensor descriptor for output data tensor dx (input) + * @param dx Data delta tensor dx (output) + * @param scaleDesc Tensor descriptor for the scale tensor (input) + * @param biasDesc Tensor descriptor for the bias/shift tensor (input) + * @param savedMeanDesc Tensor descriptor for the saved mean (input) + * @param savedVarDesc Tensor descriptor for the saved variance (input) + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param resultBnScaleDiff Tensor for dscale (output) + * @param resultBnBiasDiff Tensor for dbias (output) + * @param epsilon Value to stabilize inverse variance calculation (input) + * @param savedMean Saved mini-batch mean for backwards pass (input) + * @param savedInvVariance Saved mini-batch inverse variance for backwards pass (input) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationBackward_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance); + /** @} */ // CLOSEOUT BATCHNORM DOXYGEN GROUP diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index a3c5f93e36..938809d81c 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ -67,6 +67,20 @@ void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc, TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& tDesc) { auto dataType = tDesc.GetType(); + auto layout = tDesc.GetLayout_t(); + if(layout == miopenTensorNCDHW) + { + layout = miopenTensorNCHW; + } + else if(layout == miopenTensorNDHWC) + { + layout = miopenTensorNHWC; + } + else + { + std::cout << "Cannot handle layout : " << layout << "\n"; + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) + } std::vector<std::size_t> dims(tDesc.GetLengths()); // NxCxDxHxW -> NxCx(D*H)xW @@ -74,7 +88,7 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& dims[3] = dims[4]; dims.pop_back(); - return {dataType, dims}; + return {dataType, layout, dims}; } void profileSequence(const Handle& handle, unsigned char select, float* ctime)
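To make the reshape above concrete (hypothetical sizes): a 5-D NCDHW tensor of lengths {2, 8, 4, 16, 16} is re-described as the 4-D NCHW tensor {2, 8, 64, 16}. D and H are folded into a single dimension without moving any data, so the existing 4-D batch-norm kernels can serve the 3-D case.

    // Illustration only, hypothetical sizes:
    // NxCxDxHxW = {2, 8, 4, 16, 16}  ->  NxCx(D*H)xW = {2, 8, 64, 16}

diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp index 8f184a9508..d3b824cee0 100644 --- a/src/batch_norm_api.cpp +++ b/src/batch_norm_api.cpp @@ -50,6 +50,7 @@ namespace miopen { namespace debug { void 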
LogCmdBNorm(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -60,6 +61,7 @@ void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, if(miopen::IsLoggingCmd()) { const std::string& str = BnormArgsForMIOpenDriver(xDesc, + sMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -88,6 +90,130 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedMean, void* estimatedVariance, double epsilon) +{ + return miopenBatchNormalizationForwardInference_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScale, + bnBias, + estimatedMean, + estimatedVariance, + epsilon); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardTraining(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) +{ + return miopenBatchNormalizationForwardTraining_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScale, + bnBias, + expAvgFactor, + resultRunningMean, + resultRunningVariance, + epsilon, + resultSaveMean, + resultSaveInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationBackward(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t bnScaleBiasDiffDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance) +{ + return miopenBatchNormalizationBackward_V2(handle, + bn_mode, + alphaDataDiff, + betaDataDiff, + alphaParamDiff, + betaParamDiff, + xDesc, + x, + dyDesc, + dy, + dxDesc, + dx, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScale, + resultBnScaleDiff, + resultBnBiasDiff, + epsilon, + savedMean, + savedInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon) { MIOPEN_LOG_FUNCTION(handle, bn_mode, @@ -95,7 +221,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + BiasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, estimatedMean, @@ -103,12 +232,14 @@ 
miopenBatchNormalizationForwardInference(miopenHandle_t handle, epsilon); miopen::debug::LogCmdBNorm(xDesc, + estMeanDesc, bn_mode, estimatedMean, estimatedVariance, nullptr, nullptr, miopen::debug::BatchNormDirection_t::ForwardInference); + // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); @@ -124,9 +255,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc)) - : miopen::deref(bnScaleBiasMeanVarDesc), + miopen::deref(scaleDesc), + miopen::deref(BiasDesc), + miopen::deref(estMeanDesc), + miopen::deref(estVarianceDesc), DataCast(bnScale), DataCast(bnBias), DataCast(estimatedMean), @@ -136,32 +268,37 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, } extern "C" miopenStatus_t -miopenBatchNormalizationForwardTraining(miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - void* alpha, - void* beta, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t yDesc, - void* y, - const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, - void* bnScale, - void* bnBias, - double expAvgFactor, - void* resultRunningMean, - void* resultRunningVariance, - double epsilon, - void* resultSaveMean, - void* resultSaveInvVariance) +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarianceDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) { - MIOPEN_LOG_FUNCTION(handle, bn_mode, xDesc, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + BiasDesc, + savedMeanDesc, + savedVarianceDesc, bnScale, bnBias, expAvgFactor, @@ -172,6 +309,7 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, resultSaveInvVariance); miopen::debug::LogCmdBNorm(xDesc, + savedMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -193,9 +331,10 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? 
miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc)) - : miopen::deref(bnScaleBiasMeanVarDesc), + miopen::deref(scaleDesc), + miopen::deref(BiasDesc), + miopen::deref(savedMeanDesc), + miopen::deref(savedVarianceDesc), DataCast(bnScale), DataCast(bnBias), expAvgFactor, @@ -208,27 +347,29 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, } extern "C" miopenStatus_t -miopenBatchNormalizationBackward(miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - const void* alphaDataDiff, - const void* betaDataDiff, - const void* alphaParamDiff, - const void* betaParamDiff, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t dyDesc, - const void* dy, - const miopenTensorDescriptor_t dxDesc, - void* dx, - const miopenTensorDescriptor_t bnScaleBiasDiffDesc, - const void* bnScale, - void* resultBnScaleDiff, - void* resultBnBiasDiff, - double epsilon, - const void* savedMean, - const void* savedInvVariance) +miopenBatchNormalizationBackward_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarianceDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance) { - MIOPEN_LOG_FUNCTION(handle, bn_mode, xDesc, @@ -237,7 +378,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, dy, dxDesc, dx, - bnScaleBiasDiffDesc, + scaleDesc, + BiasDesc, + savedMeanDesc, + savedVarianceDesc, bnScale, resultBnScaleDiff, resultBnBiasDiff, @@ -245,6 +389,7 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, savedMean, savedInvVariance); miopen::debug::LogCmdBNorm(xDesc, + savedMeanDesc, bn_mode, nullptr, nullptr, @@ -271,9 +416,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(dxDesc)) : miopen::deref(dxDesc), DataCast(dx), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasDiffDesc)) - : miopen::deref(bnScaleBiasDiffDesc), + miopen::deref(scaleDesc), + miopen::deref(BiasDesc), + miopen::deref(savedMeanDesc), + miopen::deref(savedVarianceDesc), DataCast(bnScale), DataCast(resultBnScaleDiff), DataCast(resultBnBiasDiff),
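For symmetry with the forward sketch earlier, here is a minimal, hypothetical call of the backward V2 entry point; whether the saved statistics are reused follows from savedMean/savedInvVariance being non-null:

    // Sketch only: assumes handle, descriptors, and device buffers exist; bnDesc is
    // the derived 1xCx1x1 descriptor reused for scale/bias/savedMean/savedVariance.
    float alphaDiff = 1.0f, betaDiff = 0.0f;
    miopenBatchNormalizationBackward_V2(handle, miopenBNSpatial,
                                        &alphaDiff, &betaDiff,  // data diff scale/shift
                                        &alphaDiff, &betaDiff,  // param diff scale/shift
                                        xDesc, x, dyDesc, dy, dxDesc, dx,
                                        bnDesc, bnDesc, bnDesc, bnDesc,
                                        bnScale,
                                        resultBnScaleDiff, resultBnBiasDiff,
                                        1e-5,                   // epsilon
                                        savedMean,              // both null -> statistics
                                        savedInvVariance);      //   are recomputed from x

diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index c105996d57..e75ec31902 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -60,12 +60,28 @@ void ConvDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) } } -void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc) +// Select the bnorm driver variant based on the input tensor and scaleMean types. +// We choose scaleMean because it is an accumulator type; e.g., a half input with +// a float scaleMean maps to "bnormfp16fp32". 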
+void BnDataType(std::stringstream& ss, + const miopen::TensorDescriptor& xDesc, + const miopen::TensorDescriptor& sMeanDesc) { - if(desc.GetType() == miopenHalf) + if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenHalf) { ss << "bnormfp16"; } + else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenBFloat16) + { + ss << "bnormbfp16"; + } + else if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16fp32"; + } + else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16fp32"; + } else { ss << "bnorm"; @@ -211,7 +227,8 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, return ss.str(); } -std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, +std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -224,7 +241,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, miopenGetTensorDescriptorSize(xDesc, &size); std::stringstream ss; if(print_for_bn_driver) - BnDataType(ss, miopen::deref(xDesc)); + BnDataType(ss, miopen::deref(xDesc), miopen::deref(sMeanDesc)); ss << " -n " << miopen::deref(xDesc).GetLengths()[0] // clang-format off << " -c " << miopen::deref(xDesc).GetLengths()[1]; @@ -248,6 +265,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, resultRunningVariance, resultSaveMean, resultSaveInvVariance); + ss << " --layout " << miopen::deref(xDesc).GetLayout_str(); } return ss.str(); } diff --git a/src/fusion.cpp b/src/fusion.cpp index a9ef5e27a9..e536f6a1a1 100644 --- a/src/fusion.cpp +++ b/src/fusion.cpp @@ -390,6 +390,7 @@ std::string LogCmdBnormFusion(const miopenFusionPlanDescriptor_t fusePlanDesc, i if(bn_op != nullptr) { str += BnormArgsForMIOpenDriver(&bn_op->input_desc, + &bn_op->base_desc, bn_op->mode, nullptr, nullptr, diff --git a/src/include/miopen/batch_norm.hpp b/src/include/miopen/batch_norm.hpp index 50c309550c..92444f039b 100644 --- a/src/include/miopen/batch_norm.hpp +++ b/src/include/miopen/batch_norm.hpp @@ -163,40 +163,44 @@ void bnFwdTrainSelectMulti(const Handle& handle, void profileSequence(const Handle& handle, unsigned char select, float* ctime); -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardInference(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - ConstData_t estimatedMean, - ConstData_t estimatedVariance, - double epsilon); - -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardTraining(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, /* these don't seem to be used in conv */ - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - double expAvgFactor, - Data_t resultRunningMean, - Data_t resultRunningVariance, - double epsilon, - Data_t resultSaveMean, - Data_t resultSaveInvVariance); +MIOPEN_INTERNALS_EXPORT void BatchNormForwardInference(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& 
yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + ConstData_t estimatedMean, + ConstData_t estimatedVariance, + double epsilon); + +MIOPEN_INTERNALS_EXPORT void BatchNormForwardTraining(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + double expAvgFactor, + Data_t resultRunningMean, + Data_t resultRunningVariance, + double epsilon, + Data_t resultSaveMean, + Data_t resultSaveInvVariance); MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, @@ -210,7 +214,10 @@ MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index b87494b725..d28e91adfd 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -58,7 +58,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double expAvgFactor_, double epsilon_, bool resultsave_, @@ -67,7 +70,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), expAvgFactor(expAvgFactor_), epsilon(epsilon_), resultsave(resultsave_), @@ -82,13 +88,19 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_) : direction(Direction::ForwardInference), bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_) { SetSpatialDims(); @@ -101,7 +113,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& xDesc_, const TensorDescriptor& dyDesc_, const TensorDescriptor& dxDesc_, - const TensorDescriptor& bnScaleBiasDiffDesc_, + const TensorDescriptor& scaleDesc_, + 
const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_, bool useSaved_) : direction(Direction::Backward), @@ -109,7 +124,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob xDesc(xDesc_), yOrDyDesc(dyDesc_), dxDesc(dxDesc_), - scaleBiasDesc(bnScaleBiasDiffDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_), useSaved(useSaved_) { @@ -153,13 +171,13 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& GetBnScaleBiasMeanVarDesc() const { assert(direction == Direction::ForwardTraining || direction == Direction::ForwardInference); - return scaleBiasDesc; + return scaleDesc; } const TensorDescriptor& GetScaleBiasDiffDesc() const { assert(direction == Direction::Backward); - return scaleBiasDesc; + return scaleDesc; } bool GetResultSave() const @@ -215,7 +233,11 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob TensorDescriptor xDesc; // input TensorDescriptor yOrDyDesc; // output TensorDescriptor dxDesc; - TensorDescriptor scaleBiasDesc; + + TensorDescriptor scaleDesc; // scale + TensorDescriptor biasDesc; // bias (shift) + TensorDescriptor sMeanDesc; + TensorDescriptor sVarianceDesc; #ifdef __clang__ #pragma clang diagnostic push diff --git a/src/include/miopen/driver_arguments.hpp b/src/include/miopen/driver_arguments.hpp index da4064b7f0..a964e7fe27 100644 --- a/src/include/miopen/driver_arguments.hpp +++ b/src/include/miopen/driver_arguments.hpp @@ -67,6 +67,7 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, bool print_for_conv_driver = true); std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, + miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, diff --git a/src/include/miopen/fusion/problem_description.hpp b/src/include/miopen/fusion/problem_description.hpp index bcb37878d9..b3d1669cee 100644 --- a/src/include/miopen/fusion/problem_description.hpp +++ b/src/include/miopen/fusion/problem_description.hpp @@ -128,7 +128,14 @@ struct FusionDescription : ProblemDescriptionBase dynamic_cast(*fusion_plan_desc->op_map[idx]); miopen::TensorDescriptor out_desc; bn_op.GetOutputDesc(out_desc); - return {bn_op.mode, bn_op.input_desc, out_desc, bn_op.base_desc, not_used}; + return {bn_op.mode, + bn_op.input_desc, + out_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + not_used}; } else if(dir == miopen::batchnorm::Direction::ForwardTraining) { @@ -140,6 +147,9 @@ struct FusionDescription : ProblemDescriptionBase bn_op.input_desc, out_desc, bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, not_used, // expAvgFactor filler not_used, true /* resultSave*/, @@ -156,6 +166,9 @@ struct FusionDescription : ProblemDescriptionBase out_desc, bn_op.input_desc, {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, not_used, bn_op.useBatchStats /*useSaved*/}; } diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 40bcd34935..f33c5ac5db 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -55,6 +55,8 @@ miopen::PerformanceDb GetDb(const miopen::ExecutionContext& ctx, } } // namespace batchnorm +//============ BEGIN FORWARD TRAINING =============== + void 
BatchNormForwardTraining(Handle& handle, miopenBatchNormMode_t bn_mode, const void* alpha, @@ -63,7 +65,10 @@ void BatchNormForwardTraining(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, double expAvgFactor, @@ -73,13 +78,14 @@ void BatchNormForwardTraining(Handle& handle, Data_t resultSaveMean, Data_t resultSaveInvVariance) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + if(xDesc.GetNumDims() != yDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -105,9 +111,9 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); if(bnScale != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); if(bnBias != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnBias); + miopen::checkNumericsInput(handle, biasDesc, bnBias); } const auto resultsave = resultSaveMean != nullptr && resultSaveInvVariance != nullptr; @@ -116,7 +122,10 @@ void BatchNormForwardTraining(Handle& handle, const auto problem = batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + savedMeanDesc, + savedVarianceDesc, expAvgFactor, epsilon, resultsave, @@ -153,15 +162,16 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsOutput(handle, yDesc, y); if(resultRunningMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultRunningMean); if(resultRunningVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultRunningVariance); if(resultSaveMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultSaveMean); if(resultSaveInvVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveInvVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultSaveInvVariance); } } + //================== END FWD TRAIN =================== //============ BEGIN FORWARD INFERENCE =============== @@ -173,31 +183,37 @@ void BatchNormForwardInference(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, ConstData_t estimatedMean, ConstData_t estimatedVariance, double epsilon) { + if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsInput(handle, xDesc, x); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnScale); - miopen::checkNumericsInput(handle, 
bnScaleBiasMeanVarDesc, bnBias); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedMean); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedVariance); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnBias); + miopen::checkNumericsInput(handle, estMeanDesc, estimatedMean); + miopen::checkNumericsInput(handle, estVarianceDesc, estimatedVariance); } if(estimatedMean != nullptr && estimatedVariance != nullptr) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != estMeanDesc.GetNumDims() || + xDesc.GetNumDims() != estVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -216,8 +232,8 @@ void BatchNormForwardInference(Handle& handle, MIOPEN_THROW(miopenStatusBadParm); } - const auto problem = - batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, bnScaleBiasMeanVarDesc, epsilon}; + const auto problem = batchnorm::ProblemDescription{ + bn_mode, xDesc, yDesc, scaleDesc, biasDesc, estMeanDesc, estVarianceDesc, epsilon}; const auto invoke_params = [&]() { auto tmp = batchnorm::InfInvokeParams{}; @@ -250,7 +266,10 @@ void BatchNormForwardInference(Handle& handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, 0, @@ -265,9 +284,11 @@ void BatchNormForwardInference(Handle& handle, miopen::checkNumericsOutput(handle, yDesc, y); } } + //================= END FORWARD INFERENCE ==================== //=============== BEGIN BACKWARDS PROPAGATION ================ + void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, const void* alphaDataDiff, @@ -280,7 +301,10 @@ void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, @@ -296,20 +320,23 @@ void BatchNormBackward(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); miopen::checkNumericsInput(handle, dyDesc, dy); - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnScale); if(savedMean != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedMean); + miopen::checkNumericsInput(handle, savedMeanDesc, savedMean); if(savedInvVariance != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedInvVariance); + miopen::checkNumericsInput(handle, savedVarianceDesc, savedInvVariance); } if(x == nullptr || dy == nullptr || bnScale == nullptr || dx == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != dyDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasDiffDesc.GetNumDims()) + if(xDesc.GetNumDims() != dyDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -336,8 
+363,16 @@ void BatchNormBackward(Handle& handle, const auto useSaved = savedMean != nullptr && savedInvVariance != nullptr; - const auto problem = batchnorm::ProblemDescription{ - bn_mode, xDesc, dyDesc, dxDesc, bnScaleBiasDiffDesc, epsilon, useSaved}; + const auto problem = batchnorm::ProblemDescription{bn_mode, + xDesc, + dyDesc, + dxDesc, + scaleDesc, + biasDesc, + savedMeanDesc, + savedVarianceDesc, + epsilon, + useSaved}; const auto algo = bn_mode == miopenBNSpatial ? AlgorithmName{"miopenBatchNormBackwardPropSpatial"} @@ -368,8 +403,8 @@ void BatchNormBackward(Handle& handle, if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsOutput(handle, dxDesc, dx); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnScaleDiff); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnBiasDiff); + miopen::checkNumericsOutput(handle, scaleDesc, resultBnScaleDiff); + miopen::checkNumericsOutput(handle, biasDesc, resultBnBiasDiff); } } } // namespace miopen diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 7769e4d563..bca7afc3a5 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -201,7 +201,8 @@ bool BnCKBwdBackward::IsApplicable( return false; if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward) return false; - + if(!bn_problem.Is2D()) + return false; switch(bn_problem.GetXDesc().GetType()) { case miopenFloat: return CheckCKApplicability(bn_problem); diff --git a/src/solver/batchnorm/backward_per_activation.cpp b/src/solver/batchnorm/backward_per_activation.cpp index 93cf670194..af52fbc339 100644 --- a/src/solver/batchnorm/backward_per_activation.cpp +++ b/src/solver/batchnorm/backward_per_activation.cpp @@ -41,6 +41,8 @@ namespace batchnorm { bool BnBwdTrainingPerActivation::IsApplicable( const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const { + if(!problem.Is2D()) + return false; return problem.GetDirection() == miopen::batchnorm::Direction::Backward && problem.GetMode() == miopenBNPerActivation; } diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 29bbd5dba9..7fa9c0f89a 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -44,6 +44,10 @@ bool BnBwdTrainingSpatialMultiple::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + { + return false; + } #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 30b0c0495f..86fa5a68c7 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -45,6 +45,8 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + return false; #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && diff --git a/test/bn_3d_peract_test.cpp b/test/bn_3d_peract_test.cpp index 19fd15e7ce..c5f96ff9ba 100644 --- a/test/bn_3d_peract_test.cpp +++ b/test/bn_3d_peract_test.cpp @@ -281,6 +281,9 @@ struct verify_forward_train_3d_bn_per_activation BuildReshaped4DTensorDescriptor(out.desc), 
out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -450,6 +453,9 @@ struct verify_forward_infer_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -573,6 +579,9 @@ struct verify_forward_infer_3d_bn_per_activation_use_est BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -747,6 +756,9 @@ struct verify_backward_3d_bn_per_activation_use_saved BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -948,6 +960,9 @@ struct verify_backward_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_3d_spatial_test.cpp b/test/bn_3d_spatial_test.cpp index 8d428fca2b..08bfdb5a57 100644 --- a/test/bn_3d_spatial_test.cpp +++ b/test/bn_3d_spatial_test.cpp @@ -327,6 +327,9 @@ struct verify_forward_train_3d_bn_spatial miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -516,6 +519,9 @@ struct verify_forward_infer_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -632,6 +638,9 @@ struct verify_forward_infer_3d_bn_spatial_use_est miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -913,6 +922,9 @@ struct verify_backward_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1138,6 +1150,9 @@ struct 
verify_backward_3d_bn_spatial_use_saved miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_peract_test.cpp b/test/bn_peract_test.cpp index 6622230666..4d83e05df7 100644 --- a/test/bn_peract_test.cpp +++ b/test/bn_peract_test.cpp @@ -271,6 +271,9 @@ struct verify_forward_train_bn_per_activation out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -433,6 +436,9 @@ struct verify_forward_infer_bn_per_activation_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -550,6 +556,9 @@ struct verify_forward_infer_bn_per_activation_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -716,6 +725,9 @@ struct verify_backward_bn_per_activation_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -909,6 +921,9 @@ struct verify_backward_bn_per_activation_recalc dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_spatial_test.cpp b/test/bn_spatial_test.cpp index 82d1cc271b..95a8ee099a 100644 --- a/test/bn_spatial_test.cpp +++ b/test/bn_spatial_test.cpp @@ -308,6 +308,9 @@ struct verify_forward_train_bn_spatial out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -484,6 +487,9 @@ struct verify_forward_infer_bn_spatial_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -596,6 +602,9 @@ struct verify_forward_infer_bn_spatial_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -853,6 +862,9 @@ struct verify_backward_bn_spatial_recalc dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1065,6 +1077,9 @@ struct verify_backward_bn_spatial_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index c0c49b06b6..d525b79cf6 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -134,9 +134,9 @@ void convHostForward(const tensor& input, } } -template +template void batchNormSpatialHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const tensor& bias, double epsilon, @@ -162,20 +162,21 @@ void batchNormSpatialHostInference(const tensor& input, output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // std::cout << output(bidx, cidx, row, column) << ","; } } } }); } -template +template void batchNormPerActivHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const 
tensor& bias, double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance) + const tensor& estimatedMean, + const tensor& estimatedVariance) { int n_batches, channels, height, width; std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); @@ -202,17 +203,17 @@ void batchNormPerActivHostInference(const tensor& input, }); } -template +template void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& out, + tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -265,30 +266,38 @@ void batchNormSpatialHostFwdTrain(const tensor& input, } // for (column) } // for (row) } // end for(n_batchs) - - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = - (n_batch * height * width == 1) ? variance_accum : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { - int height, width, n_batch, channels; std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); auto nhw = double(height * width * n_batch); @@ -297,14 +306,50 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, par_for(channels, 1, [&](int cidx) { double elemStd = 0.; unsigned int xhat_index; - double mean = savedMean(0, cidx, 0, 0); // HxW elements - double invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + double mean = 0.0; + double invVar = 0.0; double dyelem = 0.; std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); // process the batch per channel dscale(0, cidx, 0, 0) = 0.; dbias(0, cidx, 0, 0) = 0.; + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // 
iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } for(int row = 0; row < height; row++) { // via rows for(int column = 0; column < width; column++) @@ -333,29 +378,36 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) }); // for (channel) } -template +template void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, double beta, double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& scale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -387,7 +439,8 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, elemStd = static_cast(x_input(bidx, cidx, row, column)) - mean; // (x_i - mean) xhat[xhat_index] = elemStd * invVar; - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -408,8 +461,9 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, { // via columns for(int bidx = 0; bidx < n_batch; bidx++) { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -421,25 +475,25 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) }); // for (channel) } -template +template void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, + tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& 
saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -483,7 +537,7 @@ void batchNormPerActHostFwdTrain(const tensor& input, elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) inhat = elemStd * elemInvVar; // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( + out(bidx, cidx, row, column) = static_cast( scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); } // end for(n_batch) @@ -496,21 +550,21 @@ void batchNormPerActHostFwdTrain(const tensor& input, runVar(0, cidx, row, column) = (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); } // for (column) } // for (row) }); } -template +template void batchNormPerActHostBwdTrain(const tensor& x_input, const tensor& dy_input, const tensor& scale, - tensor& dscale, - tensor& dbias, - tensor& dx_out, + tensor& dscale, + tensor& dbias, + tensor& dx_out, const tensor& savedMean, const tensor& savedInvVar) { diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..e1f192c37d 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -32,17 +32,25 @@ #include "bn_test_data.hpp" #include "test_operations.hpp" +// Define an enum to identify which version of BN api to call +enum BNApiType +{ + testBNAPIV1, + testBNAPIV2, +}; + template -struct BNInferTest : public ::testing::TestWithParam> +struct BNInferTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_infer_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); @@ -51,21 +59,47 @@ struct BNInferTest : public ::testing::TestWithParam bn_infer_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; template -struct BNBwdTest : public ::testing::TestWithParam> +struct BNBwdTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = miopenBatchNormalizationBackward(&handle, - bn_config.mode, - &bn_bwd_test_data.alphaDataDiff, - &bn_bwd_test_data.betaDataDiff, - &bn_bwd_test_data.alphaParamDiff, - &bn_bwd_test_data.betaParamDiff, - &bn_bwd_test_data.input.desc, - bn_bwd_test_data.in_dev.get(), - &bn_bwd_test_data.dy.desc, - bn_bwd_test_data.dy_dev.get(), - &bn_bwd_test_data.output.desc, - bn_bwd_test_data.out_dev.get(), - &bn_bwd_test_data.bnScale.desc, - bn_bwd_test_data.bnScale_dev.get(), - bn_bwd_test_data.dScale_dev.get(), - bn_bwd_test_data.dBias_dev.get(), - bn_bwd_test_data.epsilon, - bn_bwd_test_data.savedMean_dev.get(), - bn_bwd_test_data.savedInvVar_dev.get()); + if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res = miopenStatusUnknownError; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationBackward(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + 
&bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationBackward_V2(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + &bn_bwd_test_data.dBias.desc, + &bn_bwd_test_data.savedMean.desc, + &bn_bwd_test_data.savedInvVar.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationBackward failed"; @@ -183,6 +255,7 @@ struct BNBwdTest : public ::testing::TestWithParam bn_bwd_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; template struct BNFwdTrainTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = - miopenBatchNormalizationForwardTraining(&handle, - bn_config.mode, - &bn_fwd_train_test_data.alpha, - &bn_fwd_train_test_data.beta, - &bn_fwd_train_test_data.input.desc, - bn_fwd_train_test_data.in_dev.get(), - &bn_fwd_train_test_data.output.desc, - bn_fwd_train_test_data.out_dev.get(), - &bn_fwd_train_test_data.scale.desc, - bn_fwd_train_test_data.scale_dev.get(), - bn_fwd_train_test_data.shift_dev.get(), - bn_fwd_train_test_data.averageFactor, - bn_fwd_train_test_data.runMean_dev.get(), - bn_fwd_train_test_data.runVariance_dev.get(), - bn_fwd_train_test_data.epsilon, - bn_fwd_train_test_data.saveMean_dev.get(), - bn_fwd_train_test_data.saveVariance_dev.get()); + if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res = miopenStatusUnknownError; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationForwardTraining( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + 
bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationForwardTraining_V2( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + &bn_fwd_train_test_data.shift.desc, + &bn_fwd_train_test_data.saveMean.desc, + &bn_fwd_train_test_data.saveVariance.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationForwardTraining failed"; @@ -275,4 +383,5 @@ struct BNFwdTrainTest BNFwdTrainTestData bn_fwd_train_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f2d54e8077..df093a4710 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -26,46 +26,123 @@ #include "bn.hpp" -struct GPU_BNBwd_FP16 +// https://github.com/ROCm/MIOpen/issues/1549 +// NCHW solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +// struct GPU_BN_V1_BwdNCHW_FP16 : BNBwdTest +// { +// }; + +// NHWC solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +struct GPU_BN_V2_BwdNHWC_FP16 : BNBwdTest { }; -struct GPU_BNBwd_FP32 : BNBwdTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V2_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V1_Bwd_FP32 : BNBwdTest +{ +}; + +struct GPU_BN_V2_Bwd_FP32 : BNBwdTest { }; -struct GPU_BNBwd_BFP16 : BNBwdTest +struct GPU_BN_V1_BwdNHWC_FP64 : BNBwdTest { }; -struct GPU_BNBwd_FP64 : BNBwdTest +struct GPU_BN_V2_BwdNHWC_FP64 : BNBwdTest { }; -TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} +// fp16 +// TEST_P(GPU_BN_V1_BwdNCHW_FP16, BnV1BwdHalf) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP16, BnV2BwdCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Bwd_FP32, BnV1BwdFloat) {} +TEST_P(GPU_BN_V2_Bwd_FP32, BnV2BwdFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_BFP16, BnV1BwdCKBfloat) {} +TEST_P(GPU_BN_V2_BwdNHWC_BFP16, BnV2BwdCKBfloat) {} -TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +// double is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_FP64, BnV1BwdCKDouble) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP64, BnV2BwdCKDouble) {} -TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +// // fp16 +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BN_V1_BwdNCHW_FP16, +// testing::Combine(testing::ValuesIn(NetworkSmall()), +// testing::Values(miopenTensorNCHW), +// testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_BwdNHWC_FP16, + 
testing::Combine(testing::ValuesIn(NetworkLarge()),
+                                          testing::Values(miopenTensorNHWC),
+                                          testing::ValuesIn({testBNAPIV2})));
+
+// fp32
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_BN_V1_Bwd_FP32,
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                                          testing::Values(miopenTensorNCHW),
+                                          testing::ValuesIn({testBNAPIV1})));
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_BN_V2_Bwd_FP32,
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                                          testing::Values(miopenTensorNHWC),
+                                          testing::ValuesIn({testBNAPIV2})));
+// bfp16 is only on the CK solver
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BNBwd_FP16,
-                         testing::Combine(testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
+                         GPU_BN_V1_BwdNHWC_BFP16,
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                                          testing::Values(miopenTensorNHWC),
+                                          testing::ValuesIn({testBNAPIV1})));
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BNBwd_FP32,
-                         testing::Combine(testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
+                         GPU_BN_V2_BwdNHWC_BFP16,
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                                          testing::Values(miopenTensorNHWC),
+                                          testing::ValuesIn({testBNAPIV2})));
 
+// fp64 is only on the CK solver
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BNBwd_BFP16,
-                         testing::Combine(testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
+                         GPU_BN_V1_BwdNHWC_FP64,
+                         testing::Combine(testing::ValuesIn(NetworkSmall()),
+                                          testing::Values(miopenTensorNHWC),
+                                          testing::ValuesIn({testBNAPIV1})));
 
 INSTANTIATE_TEST_SUITE_P(Smoke,
-                         GPU_BNBwd_FP64,
-                         testing::Combine(testing::ValuesIn(Network1()),
-                                          testing::Values(miopenTensorNHWC)));
+                         GPU_BN_V2_BwdNHWC_FP64,
+                         testing::Combine(testing::ValuesIn(NetworkLarge()),
+                                          testing::Values(miopenTensorNHWC),
+                                          testing::ValuesIn({testBNAPIV2})));
diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp
index b5dbed4705..ed25631175 100644
--- a/test/gtest/bn_fwd_train.cpp
+++ b/test/gtest/bn_fwd_train.cpp
@@ -26,46 +26,119 @@
 
 #include "bn.hpp"
 
-struct GPU_BNFwdTrain_FP16
+// ** OCL kernels for fwd training are failing in gtest **
+// ** Hence, this gtest only tests the CK solvers **
+
+// NHWC solver accepts
+// XDataType : half_float::half
+// YDataType : half_float::half
+// ScaleDataType : half_float::half
+// BiasDataType : half_float::half
+// MeanVarDataType : float
+struct GPU_BN_V1_FwdTrainNHWC_FP16
+    : BNFwdTrainTest
+{
+};
+
+struct GPU_BN_V2_FwdTrainNHWC_FP16 : BNFwdTrainTest
 {
 };
 
-struct GPU_BNFwdTrain_FP32 : BNFwdTrainTest
+// bf16 NHWC is supported only by the CK solver
+// XDataType : bfloat16
+// YDataType : bfloat16
+// ScaleDataType : bfloat16
+// BiasDataType : bfloat16
+// MeanVarDataType : float
+struct GPU_BN_V1_FwdTrainNHWC_BFP16 : BNFwdTrainTest
+{
+};
+
+struct GPU_BN_V2_FwdTrainNHWC_BFP16 : BNFwdTrainTest
 {
 };
 
-struct GPU_BNFwdTrain_FP64 : BNFwdTrainTest
+struct GPU_BN_V1_FwdTrainNHWC_FP32 : BNFwdTrainTest
 {
 };
 
-struct GPU_BNFwdTrain_BFP16 : BNFwdTrainTest
+struct GPU_BN_V2_FwdTrainNHWC_FP32 : BNFwdTrainTest
 {
 };
 
-TEST_P(GPU_BNFwdTrain_FP16, BnFwdTrainCKHalf) {}
+struct GPU_BN_V1_FwdTrainNHWC_FP64
+{
+};
 
-TEST_P(GPU_BNFwdTrain_FP32, BnFwdTrainCKFloat) {}
+struct GPU_BN_V2_FwdTrainNHWC_FP64
+{
+};
 
-TEST_P(GPU_BNFwdTrain_FP64, BnFwdTrainCKDouble) {}
-TEST_P(GPU_BNFwdTrain_BFP16, BnFwdTrainCKBFloat16) {}
+
+// fp16
+TEST_P(GPU_BN_V1_FwdTrainNHWC_FP16, BnV1FwdTrainHalf) {}
+TEST_P(GPU_BN_V2_FwdTrainNHWC_FP16, BnV2FwdTrainCKHalf) {}
+
+// float
+TEST_P(GPU_BN_V1_FwdTrainNHWC_FP32, BnV1FwdTrainFloat) {}
+TEST_P(GPU_BN_V2_FwdTrainNHWC_FP32, BnV2FwdTrainFloat) {}
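+
+// Editor's note (illustrative, not part of this patch): the TEST_P bodies
+// above are intentionally empty; the GPU launch happens in the fixture's
+// SetUp() and the CPU reference comparison in TearDown(). Each
+// INSTANTIATE_TEST_SUITE_P below expands the cartesian product of
+// (config, layout, api version), so one parameterization can be run in
+// isolation with a standard gtest filter, e.g. (binary name assumed;
+// adjust to the local build target):
+//
+//   ./bin/miopen_gtest --gtest_filter='Smoke/GPU_BN_V2_FwdTrainNHWC_FP32.BnV2FwdTrainFloat/*'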
+ +// bfp16 +TEST_P(GPU_BN_V1_FwdTrainNHWC_BFP16, BnV1FwdTrainCKBfloat) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_BFP16, BnV2FwdTrainCKBfloat) {} + +// double +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP64, BnV1FwdTrainCKDouble) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP64, BnV2FwdTrainCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 081d583213..aea15c097e 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -27,66 +27,124 @@ #include "bn.hpp" // NCHW solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : float -// BiasDataType : float -// MeanVarDataType : float -struct GPU_BNInferNCHW_FP16 : BNInferTest +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +struct GPU_BN_V1_InferNCHW_FP16 : BNInferTest { }; // NHWC solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : half -// BiasDataType : half +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half // MeanVarDataType : float -struct GPU_BNInferNHWC_FP16 +struct GPU_BN_V2_InferNHWC_FP16 : BNInferTest { }; -struct GPU_BNInfer_FP32 : BNInferTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_InferNHWC_BFP16 : 
BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_BFP16 : BNInferTest { }; -struct GPU_BNInfer_FP64 : BNInferTest +struct GPU_BN_V1_Infer_FP32 : BNInferTest { }; -struct GPU_BNInfer_BFP16 : BNInferTest +struct GPU_BN_V2_Infer_FP32 : BNInferTest { }; -TEST_P(GPU_BNInferNCHW_FP16, BnInferCKHalf) {} -TEST_P(GPU_BNInferNHWC_FP16, BnInferCKHalf) {} +struct GPU_BN_V1_InferNHWC_FP64 : BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_FP64 : BNInferTest +{ +}; -TEST_P(GPU_BNInfer_FP32, BnInferCKFloat) {} -TEST_P(GPU_BNInfer_FP64, BnInferCKDouble) {} -TEST_P(GPU_BNInfer_BFP16, BnInferCKBFloat16) {} +// fp16 +TEST_P(GPU_BN_V1_InferNCHW_FP16, BnV1InferHalf) {} +TEST_P(GPU_BN_V2_InferNHWC_FP16, BnV2InferCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Infer_FP32, BnV1InferFloat) {} +TEST_P(GPU_BN_V2_Infer_FP32, BnV2InferFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_BFP16, BnV1InferCKBfloat) {} +TEST_P(GPU_BN_V2_InferNHWC_BFP16, BnV2InferCKBfloat) {} + +// double is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_FP64, BnV1InferCKDouble) {} +TEST_P(GPU_BN_V2_InferNHWC_FP64, BnV2InferCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_InferNCHW_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_InferNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNCHW_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNCHW))); + GPU_BN_V2_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNHWC_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC, miopenTensorNCHW}))); + GPU_BN_V2_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V1_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V2_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index fcf237400b..9afa8ea4ed 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -56,10 +56,13 @@ struct BNTestCase }; template -std::vector Network1(); +std::vector 
NetworkSmall();
+
+template <typename T>
+std::vector<T> NetworkLarge();
 
 template <>
-inline std::vector<BNTestCase> Network1()
+inline std::vector<BNTestCase> NetworkLarge()
 {
     // pyt_mlperf_resnet50v1.5
     return {
@@ -95,6 +98,19 @@ inline std::vector<BNTestCase> Network1()
         {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}};
 }
 
+template <>
+inline std::vector<BNTestCase> NetworkSmall()
+{
+    // pyt_mlperf_resnet50v1.5
+    return {
+        {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0},
+        {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0},
+        {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0},
+        {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1},
+    };
+}
+
 template
 struct BNTestData
 {
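+    // Editor's sketch (illustrative, not part of this patch): each config
+    // entry above is an aggregate of {N, C, H, W, mode, direction} plus two
+    // flags. For miopenBNSpatial the scale/bias/mean/variance tensors are
+    // 1xCx1x1, and their descriptor can be derived through the public API
+    // rather than built by hand (xDesc is assumed to be the 4-D input
+    // descriptor created from {N, C, H, W}):
+    //
+    //   miopenTensorDescriptor_t bnDesc;
+    //   miopenCreateTensorDescriptor(&bnDesc);
+    //   miopenDeriveBNTensorDescriptor(bnDesc, xDesc, miopenBNSpatial);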