Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable NCHW/NHWC and NCDHW/NDHWC layout in batch norm driver command #3234

Open
wants to merge 19 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,174 changes: 563 additions & 611 deletions driver/bn_driver.hpp

Large diffs are not rendered by default.

129 changes: 0 additions & 129 deletions driver/conv_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,135 +180,6 @@ static inline miopenDataType_t DataTypeFromShortString(const std::string& type)
}
}

/// Pairs a host-side tensor with its GPU mirror buffer for MIOpenDriver runs
/// and keeps the two in sync. In "--gpualloc 1" mode the host copy is dropped
/// entirely and device buffers are filled via rocrand instead; validation is
/// not possible in that mode, only performance measurement.
template <typename Tgpu>
class GpumemTensor
{
    std::unique_ptr<GPUMem> dev; // device-side buffer (owned)
    tensor<Tgpu> host;           // host-side shadow copy (empty in gpualloc mode)
    bool is_gpualloc = false;    // true => "--gpualloc 1": no host data kept

public:
    void SetGpuallocMode(bool v) { is_gpualloc = v; }
    tensor<Tgpu>& GetTensor() { return host; }

    /// Allocates the host tensor with the shape described by \p t.
    /// In gpualloc mode the freshly created data vector is released at once,
    /// keeping only the descriptor.
    void AllocOnHost(miopenTensorDescriptor_t t)
    {
        host = tensor<Tgpu>(miopen::deref(t));
        if(is_gpualloc) // We do not need host data.
        {
            host.data.clear();
            host.data.shrink_to_fit(); // To free host memory.
        }
    }

    /// Host data accessor; must not be used in gpualloc mode because no host
    /// buffer exists there.
    std::vector<Tgpu>& GetVector()
    {
        if(is_gpualloc)
            MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in "
                         "'--gpualloc 1' mode");
        return host.data;
    }

    // Null/zero in gpualloc mode so callers can pass the results straight to
    // APIs that tolerate empty buffers.
    Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); }
    std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); }

    /// Fills the host buffer with \p sz values drawn from \p generator.
    void
    InitHostData(const size_t sz, //
                 const bool do_write, // If set to false, then only generate random data. This is
                                      // necessary to reproduce values in input buffers even if some
                                      // directions are skipped. For example, inputs for Backward
                                      // will be the same for both "-F 0" and "-F 2".
                 std::function<Tgpu()> generator)
    {
        if(is_gpualloc)
        {
            /// In gpualloc mode, we do not care about reproducibility of results, because
            /// validation is not used. Therefore, we do not have to always generate random value
            /// (\ref move_rand)
            return;
        }

        for(size_t i = 0; i < sz; ++i)
        {
            /// \anchor move_rand
            /// Generate random value, even if buffer is unused. This provides the same
            /// initialization of input buffers regardless of which kinds of
            /// convolutions are currently selected for testing (see the "-F" option).
            /// Verification cache would be broken otherwise.
            auto val = generator();
            if(do_write)
                GetVector()[i] = val;
        }
    }

    /// Allocates \p sz elements of Tgpu on the device. The stream parameter is
    /// unused (kept for signature symmetry with AllocOnDeviceAndInit).
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz)
    {
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(Tgpu));
        return STATUS_SUCCESS;
    }

    /// Allocates the device buffer and initializes it: from the host copy in
    /// normal mode, or with rocrand data in gpualloc mode.
    /// NOTE(review): the status returned by AllocOnDevice is discarded; today
    /// it can only be STATUS_SUCCESS, but propagate it if that ever changes.
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz)
    {
        AllocOnDevice(q, ctx, sz);
        if(is_gpualloc)
        {
            /// \anchor gpualloc_random_init
            /// In gpualloc mode, we do not want to leave input buffers uninitialized, because
            /// there could be NaNs and Infs, which may affect the performance (which we are
            /// interested to evaluate in this mode). Initialization with all 0's is not the
            /// best choice as well, because GPU HW may optimize out computations with 0's and
            /// that could affect performance of kernels too. That is why we are using
            /// rocrand to initialize input buffers.
            ///
            /// However we do not care about precision in gpualloc mode, because validation
            /// is not used. Therefore, range (0,1] is fine.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, GetVectorData());
    }

    /// Overload allocating the device buffer with element type T instead of
    /// Tgpu (e.g. float/int32 workspace next to a half tensor).
    template <typename T>
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector<T>&)
    {
        static_assert(std::is_same<T, float>::value //
                          || std::is_same<T, int32_t>::value, //
                      "Before enabling more types, check thoroughly.");
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(T));
        return STATUS_SUCCESS;
    }

    template <typename T>
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector<T>& init)
    {
        AllocOnDevice(q, ctx, sz, init);
        if(is_gpualloc)
        {
            /// \ref gpualloc_random_init
            // NOTE(review): the buffer above was sized with sizeof(T) but is
            // filled here as sz elements of Tgpu; if T != Tgpu the element
            // width mismatches — confirm this path is only hit when the
            // sizes agree. TODO verify.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, init.data());
    }

    /// Device-to-host copies below are no-ops in gpualloc mode (there is no
    /// host buffer to copy into).
    status_t CopyFromDeviceToHost(stream q)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, tensor<T>& t)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, std::vector<T>& v)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data());
    }

    auto GetDevicePtr() -> auto { return dev->GetMem(); }
};

template <typename Tgpu>
class GpumemVector
{
Expand Down
4 changes: 4 additions & 0 deletions driver/dm_bnorm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@ static Driver* makeDriver(const std::string& base_arg)
if(base_arg == "bnorm")
return new BatchNormDriver<float, double>();
if(base_arg == "bnormfp16")
return new BatchNormDriver<float16, double, float16>();
if(base_arg == "bnormfp16fp32")
return new BatchNormDriver<float16, double, float>();
if(base_arg == "bnormbfp16fp32")
return new BatchNormDriver<bfloat16, double, float>();
return nullptr;
}

Expand Down
157 changes: 147 additions & 10 deletions driver/driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
#include <miopen/logger.hpp>
#include <miopen/miopen.h>
#include <miopen/bfloat16.hpp>
#include <../test/tensor_holder.hpp>
#include "util_driver.hpp"
#include "rocrand_wrapper.hpp"
using half = half_float::half;
using hip_bfloat16 = bfloat16;
#include <hip_float8.hpp>
Expand Down Expand Up @@ -157,6 +160,140 @@ struct GPUMem
#endif
};

/// Pairs a host-side tensor with its GPU mirror buffer for MIOpenDriver runs
/// and keeps the two in sync. In "--gpualloc 1" mode the host copy is dropped
/// entirely and device buffers are filled via rocrand instead; validation is
/// not possible in that mode, only performance measurement.
template <typename Tgpu>
class GpumemTensor
{
    std::unique_ptr<GPUMem> dev; // device-side buffer (owned)
    tensor<Tgpu> host;           // host-side shadow copy (empty in gpualloc mode)
    bool is_gpualloc = false;    // true => "--gpualloc 1": no host data kept

public:
    void SetGpuallocMode(bool v) { is_gpualloc = v; }
    tensor<Tgpu>& GetTensor() { return host; }

    /// Allocates the host tensor with the shape described by \p t.
    /// In gpualloc mode the freshly created data vector is released at once,
    /// keeping only the descriptor.
    void AllocOnHost(miopenTensorDescriptor_t t)
    {
        host = tensor<Tgpu>(miopen::deref(t));
        if(is_gpualloc) // We do not need host data.
        {
            host.data.clear();
            host.data.shrink_to_fit(); // To free host memory.
        }
    }
    /// Convenience overload: allocate using another tensor's descriptor.
    /// NOTE(review): \p t is taken by value, which deep-copies t.data just to
    /// read its descriptor — consider passing by reference. TODO confirm no
    /// caller relies on the copy.
    template <typename T>
    void AllocOnHost(tensor<T> t)
    {
        AllocOnHost(&t.desc);
    }

    /// Host data accessor; must not be used in gpualloc mode because no host
    /// buffer exists there.
    std::vector<Tgpu>& GetVector()
    {
        if(is_gpualloc)
            MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in "
                         "'--gpualloc 1' mode");
        return host.data;
    }

    // Null/zero in gpualloc mode so callers can pass the results straight to
    // APIs that tolerate empty buffers.
    Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); }
    std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); }

    /// Fills the host buffer with \p sz values drawn from \p generator.
    void
    InitHostData(const size_t sz, //
                 const bool do_write, // If set to false, then only generate random data. This is
                                      // necessary to reproduce values in input buffers even if some
                                      // directions are skipped. For example, inputs for Backward
                                      // will be the same for both "-F 0" and "-F 2".
                 std::function<Tgpu()> generator)
    {
        if(is_gpualloc)
        {
            /// In gpualloc mode, we do not care about reproducibility of results, because
            /// validation is not used. Therefore, we do not have to always generate random value
            /// (\ref move_rand)
            return;
        }

        for(size_t i = 0; i < sz; ++i)
        {
            /// \anchor move_rand
            /// Generate random value, even if buffer is unused. This provides the same
            /// initialization of input buffers regardless of which kinds of
            /// convolutions are currently selected for testing (see the "-F" option).
            /// Verification cache would be broken otherwise.
            auto val = generator();
            if(do_write)
                GetVector()[i] = val;
        }
    }

    /// Allocates \p sz elements of Tgpu on the device. The stream parameter is
    /// unused (kept for signature symmetry with AllocOnDeviceAndInit).
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz)
    {
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(Tgpu));
        return STATUS_SUCCESS;
    }

    /// Allocates the device buffer and initializes it: from the host copy in
    /// normal mode, or with rocrand data in gpualloc mode.
    /// NOTE(review): the status returned by AllocOnDevice is discarded; today
    /// it can only be STATUS_SUCCESS, but propagate it if that ever changes.
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz)
    {
        AllocOnDevice(q, ctx, sz);
        if(is_gpualloc)
        {
            /// \anchor gpualloc_random_init
            /// In gpualloc mode, we do not want to leave input buffers uninitialized, because
            /// there could be NaNs and Infs, which may affect the performance (which we are
            /// interested to evaluate in this mode). Initialization with all 0's is not the
            /// best choice as well, because GPU HW may optimize out computations with 0's and
            /// that could affect performance of kernels too. That is why we are using
            /// rocrand to initialize input buffers.
            ///
            /// However we do not care about precision in gpualloc mode, because validation
            /// is not used. Therefore, range (0,1] is fine.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, GetVectorData());
    }

    /// Overload allocating the device buffer with element type T instead of
    /// Tgpu (e.g. float/int32 workspace next to a half tensor).
    template <typename T>
    status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector<T>&)
    {
        static_assert(std::is_same<T, float>::value //
                          || std::is_same<T, int32_t>::value, //
                      "Before enabling more types, check thoroughly.");
        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(T));
        return STATUS_SUCCESS;
    }

    template <typename T>
    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector<T>& init)
    {
        AllocOnDevice(q, ctx, sz, init);
        if(is_gpualloc)
        {
            /// \ref gpualloc_random_init
            // NOTE(review): the buffer above was sized with sizeof(T) but is
            // filled here as sz elements of Tgpu; if T != Tgpu the element
            // width mismatches — confirm this path is only hit when the
            // sizes agree. TODO verify.
            return gpumemrand::gen_0_1(static_cast<Tgpu*>(GetDevicePtr()), sz);
        }
        return dev->ToGPU(q, init.data());
    }

    /// Device-to-host copies below are no-ops in gpualloc mode (there is no
    /// host buffer to copy into).
    status_t CopyFromDeviceToHost(stream q)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, tensor<T>& t)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data());
    }

    template <typename T>
    status_t CopyFromDeviceToHost(stream q, std::vector<T>& v)
    {
        return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data());
    }

    auto GetDevicePtr() -> auto { return dev->GetMem(); }
};

inline void PadBufferSize(size_t& sz, int datatype_sz)
{
size_t page_sz = (2 * 1024 * 1024) / datatype_sz;
Expand Down Expand Up @@ -193,16 +330,16 @@ inline std::string ParseBaseArg(int argc, char* argv[])
if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" &&
arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" &&
arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" &&
arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" &&
arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" &&
arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" &&
arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" &&
arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" &&
arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" &&
arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" &&
arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" &&
arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" &&
arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" &&
arg != "bnormfp16" && arg != "bnormfp16fp32" && arg != "bnormbfp16fp32" && arg != "rnn" &&
arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" &&
arg != "gemmfp16" && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" &&
arg != "tensorop" && arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" &&
arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" &&
arg != "sumfp16" && arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" &&
arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" &&
arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" &&
arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" &&
arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" &&
arg != "reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" &&
arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" &&
arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" &&
Expand Down
16 changes: 15 additions & 1 deletion src/batch_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,28 @@ void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc,
/// Collapses a 5D (NCDHW/NDHWC) tensor descriptor into its equivalent 4D
/// (NCHW/NHWC) form by folding depth into height: NxCxDxHxW -> NxCx(D*H)xW.
/// Used so batch-norm paths that only understand 4D layouts can handle 3D
/// problems.
///
/// @param tDesc 5D descriptor in NCDHW or NDHWC layout.
/// @return 4D descriptor with the same element type and the matching 4D layout.
/// @throws miopen::Exception (miopenStatusBadParm) for any other layout.
TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& tDesc)
{
    auto dataType = tDesc.GetType();
    auto layout   = tDesc.GetLayout_t();

    // Map the 5D layout onto its 4D counterpart. Anything else is a caller
    // error; report it through the library's exception mechanism rather than
    // printing to stdout and terminating the whole process (a library must
    // not call exit()).
    if(layout == miopenTensorNCDHW)
    {
        layout = miopenTensorNCHW;
    }
    else if(layout == miopenTensorNDHWC)
    {
        layout = miopenTensorNHWC;
    }
    else
    {
        MIOPEN_THROW(miopenStatusBadParm,
                     "BuildReshaped4DTensorDescriptor: cannot handle layout");
    }

    std::vector<size_t> dims(tDesc.GetLengths());

    // NxCxDxHxW -> NxCx(D*H)xW
    dims[2] *= dims[3];
    dims[3] = dims[4];
    dims.pop_back();

    return {dataType, layout, dims};
}

void profileSequence(const Handle& handle, unsigned char select, float* ctime)
Expand Down
5 changes: 5 additions & 0 deletions src/driver_arguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc)
{
ss << "bnormfp16";
}
if(desc.GetType() == miopenBFloat16)
{
ss << "bnormbfp16";
}
else
{
ss << "bnorm";
Expand Down Expand Up @@ -248,6 +252,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc,
resultRunningVariance,
resultSaveMean,
resultSaveInvVariance);
ss << " --layout " << miopen::deref(xDesc).GetLayout_str();
}
return ss.str();
}
Expand Down
3 changes: 2 additions & 1 deletion src/solver/batchnorm/backward_ck.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ bool BnCKBwdBackward::IsApplicable(
return false;
if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward)
return false;

if(!bn_problem.Is2D())
return false;
switch(bn_problem.GetXDesc().GetType())
{
case miopenFloat: return CheckCKApplicability<F32, F32, F32, F32, F32, F32, F32>(bn_problem);
Expand Down
2 changes: 2 additions & 0 deletions src/solver/batchnorm/backward_per_activation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ namespace batchnorm {
/// This solver only handles 2D per-activation batch-norm backward problems.
bool BnBwdTrainingPerActivation::IsApplicable(
    const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const
{
    const bool is_backward = problem.GetDirection() == miopen::batchnorm::Direction::Backward;
    const bool is_per_activation = problem.GetMode() == miopenBNPerActivation;
    // Is2D() is checked first, mirroring the other batch-norm solvers.
    return problem.Is2D() && is_backward && is_per_activation;
}
Expand Down
Loading