Skip to content

Commit

Permalink
Fix tmp idx (#185)
Browse files Browse the repository at this point in the history
* fix tmp index

* limit the launch bounds with maximum number of blocks per sm
  • Loading branch information
cosunae authored and twicki committed Mar 22, 2019
1 parent 7eb0f7e commit a6007b6
Show file tree
Hide file tree
Showing 16 changed files with 165 additions and 98 deletions.
63 changes: 46 additions & 17 deletions src/dawn/CodeGen/Cuda/CodeGeneratorHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,31 +34,24 @@ std::string CodeGeneratorHelper::indexIteratorName(Array3i dims) {
return n_;
}

bool CodeGeneratorHelper::useNormalIteratorForTmp(const std::unique_ptr<iir::MultiStage>& ms) {
for(const auto& stage : ms->getChildren()) {
if(!stage->getExtents().isHorizontalPointwise()) {
return false;
}
}
return true;
}

std::string CodeGeneratorHelper::buildCudaKernelName(
const std::shared_ptr<iir::StencilInstantiation>& instantiation,
const std::unique_ptr<iir::MultiStage>& ms) {
return instantiation->getName() + "_stencil" + std::to_string(ms->getParent()->getStencilID()) +
"_ms" + std::to_string(ms->getID()) + "_kernel";
}

std::vector<std::string> CodeGeneratorHelper::generateStrideArguments(const IndexRange<const std::map<int, iir::Field> > &nonTempFields,
const IndexRange<const std::map<int, iir::Field> > &tempFields,
std::vector<std::string> CodeGeneratorHelper::generateStrideArguments(
const IndexRange<const std::map<int, iir::Field>>& nonTempFields,
const IndexRange<const std::map<int, iir::Field>>& tempFields,
const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation,
const std::unique_ptr<iir::MultiStage>& ms, CodeGeneratorHelper::FunctionArgType funArg) {

std::unordered_set<std::string> processedDims;
std::vector<std::string> strides;
for(auto field : nonTempFields) {
const auto fieldName = stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
const auto fieldName =
stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
Array3i dims{-1, -1, -1};
// TODO this is a hack, we need to have dimensions also at ms level
for(const auto& fieldInfo : ms->getParent()->getFields()) {
Expand Down Expand Up @@ -118,6 +111,42 @@ iir::Extents CodeGeneratorHelper::computeTempMaxWriteExtent(iir::Stencil const&
return maxExtents;
}

bool CodeGeneratorHelper::hasAccessIDMemAccess(const int accessID,
const std::unique_ptr<iir::Stencil>& stencil) {

for(const auto& ms : stencil->getChildren()) {
if(!ms->hasField(accessID))
continue;
if(!ms->isCached(accessID))
return true;
if(ms->getCache(accessID).getCacheType() == iir::Cache::CacheTypeKind::bypass) {
return true;
}
if(ms->getCache(accessID).getCacheIOPolicy() != iir::Cache::CacheIOPolicy::local) {
return true;
}
}
return false;
}

bool CodeGeneratorHelper::useTemporaries(
const std::unique_ptr<iir::Stencil>& stencil,
const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation) {

const auto& fields = stencil->getFields();
const bool containsMemTemporary =
(find_if(fields.begin(), fields.end(),
[&](const std::pair<int, iir::Stencil::FieldInfo>& field) {
const int accessID = field.second.field.getAccessID();
if(!stencilInstantiation->isTemporaryField(accessID))
return false;
// we dont need to use temporaries infrastructure for fields that are cached
return hasAccessIDMemAccess(accessID, stencil);
}) != fields.end());

return containsMemTemporary && stencil->containsRedundantComputations();
}

void CodeGeneratorHelper::generateFieldAccessDeref(
std::stringstream& ss, const std::unique_ptr<iir::MultiStage>& ms,
const std::shared_ptr<iir::StencilInstantiation>& instantiation, const int accessID,
Expand All @@ -126,15 +155,15 @@ void CodeGeneratorHelper::generateFieldAccessDeref(
bool isTemporary = instantiation->isTemporaryField(accessID);
DAWN_ASSERT(fieldIndexMap.count(accessID) || isTemporary);
const auto& field = ms->getField(accessID);
bool useTmpIndex_ = (isTemporary && !useNormalIteratorForTmp(ms));
std::string index = useTmpIndex_ ? "idx_tmp" : "idx" + CodeGeneratorHelper::indexIteratorName(
fieldIndexMap.at(accessID));
bool useTmpIndex = isTemporary && useTemporaries(ms->getParent(), instantiation);
std::string index = useTmpIndex ? "idx_tmp" : "idx" + CodeGeneratorHelper::indexIteratorName(
fieldIndexMap.at(accessID));

// temporaries have all 3 dimensions
Array3i iter = isTemporary ? Array3i{1, 1, 1} : fieldIndexMap.at(accessID);

std::string offsetStr = RangeToString("+", "", "", true)(
CodeGeneratorHelper::ijkfyOffset(offset, useTmpIndex_, iter));
std::string offsetStr =
RangeToString("+", "", "", true)(CodeGeneratorHelper::ijkfyOffset(offset, useTmpIndex, iter));
const bool readOnly = (field.getIntend() == iir::Field::IntendKind::IK_Input);
ss << (readOnly ? "__ldg(&(" : "") << accessName
<< (offsetStr.empty() ? "[" + index + "]" : ("[" + index + "+" + offsetStr + "]"))
Expand Down
27 changes: 18 additions & 9 deletions src/dawn/CodeGen/Cuda/CodeGeneratorHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
#include "dawn/IIR/StencilInstantiation.h"
#include "dawn/Support/Array.h"
#include "dawn/Support/IndexRange.h"
#include <string>
#include <map>
#include <string>

namespace dawn {
namespace codegen {
Expand All @@ -44,9 +44,9 @@ class CodeGeneratorHelper {
static std::array<std::string, 3> ijkfyOffset(const Array3i& offsets, bool isTemporary,
const Array3i iteratorDims);

/// @brief returns true if a normal ijk field iterator should be used for temporaries instead of a
/// custom iterator
static bool useNormalIteratorForTmp(const std::unique_ptr<iir::MultiStage>& ms);
/// @brief determines wheter an accessID will perform an access to main memory
static bool hasAccessIDMemAccess(const int accessID,
const std::unique_ptr<iir::Stencil>& stencil);

/// @brief return true if the ms can be solved in parallel (in the vertical dimension)
static bool solveKLoopInParallel(const std::unique_ptr<iir::MultiStage>& ms);
Expand All @@ -55,15 +55,24 @@ class CodeGeneratorHelper {
static std::vector<iir::Interval>
computePartitionOfIntervals(const std::unique_ptr<iir::MultiStage>& ms);

/// @brief determines whether for code generation, using temporaries will be required.
/// Even if the stencil contains temporaries, in some cases, like when they are local cached, they
/// are not required for code generation. Also in the case of no redundant computations,
/// temporaries will become normal fields
static bool
useTemporaries(const std::unique_ptr<iir::Stencil>& stencil,
const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation);

/// @brief computes the maximum extent required by all temporaries, which will be used for proper
/// allocation
static iir::Extents computeTempMaxWriteExtent(iir::Stencil const& stencil);

static std::vector<std::string> generateStrideArguments(
const IndexRange<const std::map<int, iir::Field>>& nonTempFields,
const IndexRange<const std::map<int, iir::Field>>& tempFields,
const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation,
const std::unique_ptr<iir::MultiStage>& ms, CodeGeneratorHelper::FunctionArgType funArg);
static std::vector<std::string>
generateStrideArguments(const IndexRange<const std::map<int, iir::Field>>& nonTempFields,
const IndexRange<const std::map<int, iir::Field>>& tempFields,
const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation,
const std::unique_ptr<iir::MultiStage>& ms,
CodeGeneratorHelper::FunctionArgType funArg);

/// @brief compose the cuda kernel name of a stencil instantiation
static std::string
Expand Down
10 changes: 5 additions & 5 deletions src/dawn/CodeGen/Cuda/CudaCodeGen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,8 +385,8 @@ void CudaCodeGen::generateStencilWrapperMembers(
stencilWrapperClass.addMember(c_gtc() + "meta_data_t", "m_meta_data");

for(int AccessID : stencilInstantiation->getAllocatedFieldAccessIDs())
stencilWrapperClass.addMember(c_gtc() + "storage_t",
"m_" + stencilInstantiation->getFieldNameFromAccessID(AccessID));
stencilWrapperClass.addMember(
c_gtc() + "storage_t", "m_" + stencilInstantiation->getFieldNameFromAccessID(AccessID));
}

if(!globalsMap.empty()) {
Expand Down Expand Up @@ -566,16 +566,16 @@ void CudaCodeGen::generateStencilRunMethod(
// in some cases (where there are no horizontal extents) we dont use the special tmp index
// iterator, but rather a normal 3d field index iterator. In that case we pass temporaries in
// the same manner as normal fields
if(CodeGeneratorHelper::useNormalIteratorForTmp(multiStagePtr)) {
if(!CodeGeneratorHelper::useTemporaries(multiStagePtr->getParent(), stencilInstantiation)) {
const auto fieldName =
stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());

args = args + ", (" + fieldName + ".data()+" + "m_" + fieldName +
".get_storage_info_ptr()->index(" + fieldName + ".begin<0>(), " + fieldName +
".begin<1>()," + fieldName + ".begin<2>()," + fieldName + ".begin<3>(), 0))";
} else {
args =
args + "," + stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
args = args + "," +
stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
}
}

Expand Down
91 changes: 34 additions & 57 deletions src/dawn/CodeGen/Cuda/MSCodeGen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,30 +30,12 @@ MSCodeGen::MSCodeGen(std::stringstream& ss, const std::unique_ptr<iir::MultiStag
const CacheProperties& cacheProperties)
: ss_(ss), ms_(ms), stencilInstantiation_(stencilInstantiation),
cacheProperties_(cacheProperties),
useCodeGenTemporaries_(
CodeGeneratorHelper::useTemporaries(ms->getParent(), stencilInstantiation) &&
ms->hasMemAccessTemporaries()),
cudaKernelName_(CodeGeneratorHelper::buildCudaKernelName(stencilInstantiation_, ms_)),
blockSize_(stencilInstantiation_->getIIR()->getBlockSize()),
solveKLoopInParallel_(CodeGeneratorHelper::solveKLoopInParallel(ms_)) {

// useTmpIndex_
const auto& fields = ms_->getFields();
const bool containsTemporary =
(find_if(fields.begin(), fields.end(), [&](const std::pair<int, iir::Field>& field) {
const int accessID = field.second.getAccessID();
if(!stencilInstantiation_->isTemporaryField(accessID))
return false;
// we dont need to initialize tmp indices for fields that are cached
if(!cacheProperties_.accessIsCached(accessID))
return true;
const auto& cache = ms_->getCache(accessID);
if(cache.getCacheIOPolicy() == iir::Cache::CacheIOPolicy::local) {
return false;
}
return true;
}) != fields.end());

useTmpIndex_ = containsTemporary && !CodeGeneratorHelper::useNormalIteratorForTmp(ms_);

cudaKernelName_ = CodeGeneratorHelper::buildCudaKernelName(stencilInstantiation_, ms_);
}
solveKLoopInParallel_(CodeGeneratorHelper::solveKLoopInParallel(ms_)) {}

void MSCodeGen::generateIJCacheDecl(MemberFunction& kernel) const {
for(const auto& cacheP : ms_->getCaches()) {
Expand Down Expand Up @@ -117,7 +99,7 @@ MSCodeGen::computeNextLevelToProcess(const iir::Interval& interval,

void MSCodeGen::generateTmpIndexInit(MemberFunction& kernel) const {

if(!useTmpIndex_)
if(!useCodeGenTemporaries_)
return;

auto maxExtentTmps = CodeGeneratorHelper::computeTempMaxWriteExtent(*(ms_->getParent()));
Expand Down Expand Up @@ -698,23 +680,14 @@ void MSCodeGen::generateCudaKernelCode() {
// of
// tmp storages (allocation, iterators, etc)
auto tempFieldsNonLocalCached =
makeRange(fields, std::function<bool(std::pair<int, iir::Field> const&)>([&](
std::pair<int, iir::Field> const& p) {
const int accessID = p.first;
if(!stencilInstantiation_->isTemporaryField(p.second.getAccessID()))
return false;
if(!cacheProperties_.accessIsCached(accessID))
return true;
if(ms_->getCache(accessID).getCacheIOPolicy() == iir::Cache::CacheIOPolicy::local)
return false;

return true;
}));

const bool containsTemporary = !tempFieldsNonLocalCached.empty();
makeRange(fields, std::function<bool(std::pair<int, iir::Field> const&)>(
[&](std::pair<int, iir::Field> const& p) {
const int accessID = p.first;
return ms_->isMemAccessTemporary(accessID);
}));

std::string fnDecl = "";
if(containsTemporary && useTmpIndex_)
if(useCodeGenTemporaries_)
fnDecl = "template<typename TmpStorage>";
fnDecl = fnDecl + "__global__ void";

Expand All @@ -723,9 +696,13 @@ void MSCodeGen::generateCudaKernelCode() {
(maxExtents[0].Minus < 0 ? 1 : 0) + (maxExtents[0].Plus > 0 ? 1 : 0));

int nSM = stencilInstantiation_->getOptimizerContext()->getOptions().nsms;
int maxBlocksPerSM = stencilInstantiation_->getOptimizerContext()->getOptions().maxBlocksPerSM;

std::string domain_size = stencilInstantiation_->getOptimizerContext()->getOptions().domain_size;
if(nSM > 0 && !domain_size.empty()) {
if(maxBlocksPerSM <= 0) {
throw std::runtime_error("--max-blocks-sm must be defined");
}
std::istringstream idomain_size(domain_size);
std::string arg;
getline(idomain_size, arg, ',');
Expand All @@ -739,7 +716,7 @@ void MSCodeGen::generateCudaKernelCode() {
minBlocksPerSM /= nSM;

fnDecl = fnDecl + " __launch_bounds__(" + std::to_string(maxThreadsPerBlock) + "," +
std::to_string(minBlocksPerSM) + ") ";
std::to_string(std::min(maxBlocksPerSM, minBlocksPerSM)) + ") ";
} else {
fnDecl = fnDecl + " __launch_bounds__(" + std::to_string(maxThreadsPerBlock) + ") ";
}
Expand All @@ -763,19 +740,21 @@ void MSCodeGen::generateCudaKernelCode() {

// first we construct non temporary field arguments
for(auto field : nonTempFields) {
cudaKernel.addArg("gridtools::clang::float_type * const " +
stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
cudaKernel.addArg(
"gridtools::clang::float_type * const " +
stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
}

// then the temporary field arguments
for(auto field : tempFieldsNonLocalCached) {
if(useTmpIndex_) {
cudaKernel.addArg(c_gt() + "data_view<TmpStorage>" +
stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()) +
"_dv");
if(useCodeGenTemporaries_) {
cudaKernel.addArg(
c_gt() + "data_view<TmpStorage>" +
stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()) + "_dv");
} else {
cudaKernel.addArg("gridtools::clang::float_type * const " +
stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
cudaKernel.addArg(
"gridtools::clang::float_type * const " +
stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
}
}

Expand All @@ -786,7 +765,7 @@ void MSCodeGen::generateCudaKernelCode() {
cudaKernel.addComment("Start kernel");

// extract raw pointers of temporaries from the data views
if(useTmpIndex_) {
if(useCodeGenTemporaries_) {
for(auto field : tempFieldsNonLocalCached) {
std::string fieldName =
stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID());
Expand Down Expand Up @@ -917,9 +896,7 @@ void MSCodeGen::generateCudaKernelCode() {
generateIJCacheIndexInit(cudaKernel);
}

if(containsTemporary) {
generateTmpIndexInit(cudaKernel);
}
generateTmpIndexInit(cudaKernel);

// compute the partition of the intervals
auto partitionIntervals = CodeGeneratorHelper::computePartitionOfIntervals(ms_);
Expand Down Expand Up @@ -959,7 +936,7 @@ void MSCodeGen::generateCudaKernelCode() {
intervalDiffToString(kmin, "ksize - 1") + ")");
}
}
if(useTmpIndex_ && !kmin.null() && !((solveKLoopInParallel_) && firstInterval)) {
if(useCodeGenTemporaries_ && !kmin.null() && !((solveKLoopInParallel_) && firstInterval)) {
cudaKernel.addComment("jump tmp iterators to match the beginning of next interval");
cudaKernel.addStatement("idx_tmp += kstride_tmp*(" +
intervalDiffToString(kmin, "ksize - 1") + ")");
Expand All @@ -986,12 +963,12 @@ void MSCodeGen::generateCudaKernelCode() {
}
}
}
if(useTmpIndex_) {
if(useCodeGenTemporaries_) {
cudaKernel.addComment("jump tmp iterators to match the intersection of beginning of next "
"interval and the parallel execution block ");
cudaKernel.addStatement("idx_tmp += max(" + intervalDiffToString(kmin, "ksize - 1") +
", kstride_tmp * blockIdx.z * " + std::to_string(blockSize_[2]) +
")");
", blockIdx.z * " + std::to_string(blockSize_[2]) +
") * kstride_tmp");
}
}

Expand Down Expand Up @@ -1075,7 +1052,7 @@ void MSCodeGen::generateCudaKernelCode() {
CodeGeneratorHelper::generateStrideName(2, index.second));
}
}
if(useTmpIndex_) {
if(useCodeGenTemporaries_) {
cudaKernel.addStatement("idx_tmp " + incStr + " kstride_tmp");
}
});
Expand Down
2 changes: 1 addition & 1 deletion src/dawn/CodeGen/Cuda/MSCodeGen.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class MSCodeGen {
const std::unique_ptr<iir::MultiStage>& ms_;
const std::shared_ptr<iir::StencilInstantiation> stencilInstantiation_;
const CacheProperties& cacheProperties_;
bool useTmpIndex_;
bool useCodeGenTemporaries_;
std::string cudaKernelName_;
Array3ui blockSize_;
const bool solveKLoopInParallel_;
Expand Down
2 changes: 2 additions & 0 deletions src/dawn/Compiler/Options.inc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ OPT(std::string, Backend, "gridtools", "backend", "",
"\n - cuda = optimized cuda", "<backend>", true, false)
OPT(int, nsms, 0, "nsms", "",
"Number of (CUDA) SMs", "<nsms>", true, false)
OPT(int, maxBlocksPerSM, 0, "max-blocks-sm", "",
"Maximum number of blocks that can be registered per SM", "<max-blocks-sm>", true, false)
OPT(std::string, domain_size, "", "domain-size", "",
"domain size for compiler optimization", "", true, false)
OPT(std::string, block_size, "", "block-size", "",
Expand Down
4 changes: 4 additions & 0 deletions src/dawn/IIR/Cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ Interval Cache::getWindowInterval(Interval::Bound bound) const {
return interval_->crop(bound, {window_->m_m, window_->m_p});
}

bool Cache::requiresMemMemoryAccess() const {
return (policy_ != CacheIOPolicy::local) || (type_ == CacheTypeKind::bypass);
}

json::json Cache::jsonDump() const {
json::json node;
node["accessid"] = AccessID_;
Expand Down
Loading

0 comments on commit a6007b6

Please sign in to comment.