Fix tmp idx (#185)

* fix tmp index * limit the launch bounds with maximum number of blocks per sm
MeteoSwiss-APN · Mar 22, 2019 · a6007b6 · a6007b6
1 parent 7eb0f7e
commit a6007b6
Show file tree

Hide file tree

Showing 16 changed files with 165 additions and 98 deletions.
diff --git a/src/dawn/CodeGen/Cuda/CodeGeneratorHelper.cpp b/src/dawn/CodeGen/Cuda/CodeGeneratorHelper.cpp
@@ -34,31 +34,24 @@ std::string CodeGeneratorHelper::indexIteratorName(Array3i dims) {
   return n_;
 }
 
-bool CodeGeneratorHelper::useNormalIteratorForTmp(const std::unique_ptr<iir::MultiStage>& ms) {
-  for(const auto& stage : ms->getChildren()) {
-    if(!stage->getExtents().isHorizontalPointwise()) {
-      return false;
-    }
-  }
-  return true;
-}
-
 std::string CodeGeneratorHelper::buildCudaKernelName(
     const std::shared_ptr<iir::StencilInstantiation>& instantiation,
     const std::unique_ptr<iir::MultiStage>& ms) {
   return instantiation->getName() + "_stencil" + std::to_string(ms->getParent()->getStencilID()) +
          "_ms" + std::to_string(ms->getID()) + "_kernel";
 }
 
-std::vector<std::string> CodeGeneratorHelper::generateStrideArguments(const IndexRange<const std::map<int, iir::Field> > &nonTempFields,
-    const IndexRange<const std::map<int, iir::Field> > &tempFields,
+std::vector<std::string> CodeGeneratorHelper::generateStrideArguments(
+    const IndexRange<const std::map<int, iir::Field>>& nonTempFields,
+    const IndexRange<const std::map<int, iir::Field>>& tempFields,
     const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation,
     const std::unique_ptr<iir::MultiStage>& ms, CodeGeneratorHelper::FunctionArgType funArg) {
 
   std::unordered_set<std::string> processedDims;
   std::vector<std::string> strides;
   for(auto field : nonTempFields) {
-    const auto fieldName = stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
+    const auto fieldName =
+        stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
     Array3i dims{-1, -1, -1};
     // TODO this is a hack, we need to have dimensions also at ms level
     for(const auto& fieldInfo : ms->getParent()->getFields()) {
@@ -118,6 +111,42 @@ iir::Extents CodeGeneratorHelper::computeTempMaxWriteExtent(iir::Stencil const&
   return maxExtents;
 }
 
+bool CodeGeneratorHelper::hasAccessIDMemAccess(const int accessID,
+                                               const std::unique_ptr<iir::Stencil>& stencil) {
+
+  for(const auto& ms : stencil->getChildren()) {
+    if(!ms->hasField(accessID))
+      continue;
+    if(!ms->isCached(accessID))
+      return true;
+    if(ms->getCache(accessID).getCacheType() == iir::Cache::CacheTypeKind::bypass) {
+      return true;
+    }
+    if(ms->getCache(accessID).getCacheIOPolicy() != iir::Cache::CacheIOPolicy::local) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CodeGeneratorHelper::useTemporaries(
+    const std::unique_ptr<iir::Stencil>& stencil,
+    const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation) {
+
+  const auto& fields = stencil->getFields();
+  const bool containsMemTemporary =
+      (find_if(fields.begin(), fields.end(),
+               [&](const std::pair<int, iir::Stencil::FieldInfo>& field) {
+                 const int accessID = field.second.field.getAccessID();
+                 if(!stencilInstantiation->isTemporaryField(accessID))
+                   return false;
+                 // we dont need to use temporaries infrastructure for fields that are cached
+                 return hasAccessIDMemAccess(accessID, stencil);
+               }) != fields.end());
+
+  return containsMemTemporary && stencil->containsRedundantComputations();
+}
+
 void CodeGeneratorHelper::generateFieldAccessDeref(
     std::stringstream& ss, const std::unique_ptr<iir::MultiStage>& ms,
     const std::shared_ptr<iir::StencilInstantiation>& instantiation, const int accessID,
@@ -126,15 +155,15 @@ void CodeGeneratorHelper::generateFieldAccessDeref(
   bool isTemporary = instantiation->isTemporaryField(accessID);
   DAWN_ASSERT(fieldIndexMap.count(accessID) || isTemporary);
   const auto& field = ms->getField(accessID);
-  bool useTmpIndex_ = (isTemporary && !useNormalIteratorForTmp(ms));
-  std::string index = useTmpIndex_ ? "idx_tmp" : "idx" + CodeGeneratorHelper::indexIteratorName(
-                                                             fieldIndexMap.at(accessID));
+  bool useTmpIndex = isTemporary && useTemporaries(ms->getParent(), instantiation);
+  std::string index = useTmpIndex ? "idx_tmp" : "idx" + CodeGeneratorHelper::indexIteratorName(
+                                                            fieldIndexMap.at(accessID));
 
   // temporaries have all 3 dimensions
   Array3i iter = isTemporary ? Array3i{1, 1, 1} : fieldIndexMap.at(accessID);
 
-  std::string offsetStr = RangeToString("+", "", "", true)(
-      CodeGeneratorHelper::ijkfyOffset(offset, useTmpIndex_, iter));
+  std::string offsetStr =
+      RangeToString("+", "", "", true)(CodeGeneratorHelper::ijkfyOffset(offset, useTmpIndex, iter));
   const bool readOnly = (field.getIntend() == iir::Field::IntendKind::IK_Input);
   ss << (readOnly ? "__ldg(&(" : "") << accessName
      << (offsetStr.empty() ? "[" + index + "]" : ("[" + index + "+" + offsetStr + "]"))

diff --git a/src/dawn/CodeGen/Cuda/CodeGeneratorHelper.h b/src/dawn/CodeGen/Cuda/CodeGeneratorHelper.h
@@ -19,8 +19,8 @@
 #include "dawn/IIR/StencilInstantiation.h"
 #include "dawn/Support/Array.h"
 #include "dawn/Support/IndexRange.h"
-#include <string>
 #include <map>
+#include <string>
 
 namespace dawn {
 namespace codegen {
@@ -44,9 +44,9 @@ class CodeGeneratorHelper {
   static std::array<std::string, 3> ijkfyOffset(const Array3i& offsets, bool isTemporary,
                                                 const Array3i iteratorDims);
 
-  /// @brief returns true if a normal ijk field iterator should be used for temporaries instead of a
-  /// custom iterator
-  static bool useNormalIteratorForTmp(const std::unique_ptr<iir::MultiStage>& ms);
+  /// @brief determines wheter an accessID will perform an access to main memory
+  static bool hasAccessIDMemAccess(const int accessID,
+                                   const std::unique_ptr<iir::Stencil>& stencil);
 
   /// @brief return true if the ms can be solved in parallel (in the vertical dimension)
   static bool solveKLoopInParallel(const std::unique_ptr<iir::MultiStage>& ms);
@@ -55,15 +55,24 @@ class CodeGeneratorHelper {
   static std::vector<iir::Interval>
   computePartitionOfIntervals(const std::unique_ptr<iir::MultiStage>& ms);
 
+  /// @brief determines whether for code generation, using temporaries will be required.
+  /// Even if the stencil contains temporaries, in some cases, like when they are local cached, they
+  /// are not required for code generation. Also in the case of no redundant computations,
+  /// temporaries will become normal fields
+  static bool
+  useTemporaries(const std::unique_ptr<iir::Stencil>& stencil,
+                 const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation);
+
   /// @brief computes the maximum extent required by all temporaries, which will be used for proper
   /// allocation
   static iir::Extents computeTempMaxWriteExtent(iir::Stencil const& stencil);
 
-  static std::vector<std::string> generateStrideArguments(
-      const IndexRange<const std::map<int, iir::Field>>& nonTempFields,
-      const IndexRange<const std::map<int, iir::Field>>& tempFields,
-      const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation,
-      const std::unique_ptr<iir::MultiStage>& ms, CodeGeneratorHelper::FunctionArgType funArg);
+  static std::vector<std::string>
+  generateStrideArguments(const IndexRange<const std::map<int, iir::Field>>& nonTempFields,
+                          const IndexRange<const std::map<int, iir::Field>>& tempFields,
+                          const std::shared_ptr<iir::StencilInstantiation>& stencilInstantiation,
+                          const std::unique_ptr<iir::MultiStage>& ms,
+                          CodeGeneratorHelper::FunctionArgType funArg);
 
   /// @brief compose the cuda kernel name of a stencil instantiation
   static std::string

diff --git a/src/dawn/CodeGen/Cuda/CudaCodeGen.cpp b/src/dawn/CodeGen/Cuda/CudaCodeGen.cpp
@@ -385,8 +385,8 @@ void CudaCodeGen::generateStencilWrapperMembers(
     stencilWrapperClass.addMember(c_gtc() + "meta_data_t", "m_meta_data");
 
     for(int AccessID : stencilInstantiation->getAllocatedFieldAccessIDs())
-      stencilWrapperClass.addMember(c_gtc() + "storage_t",
-                                    "m_" + stencilInstantiation->getFieldNameFromAccessID(AccessID));
+      stencilWrapperClass.addMember(
+          c_gtc() + "storage_t", "m_" + stencilInstantiation->getFieldNameFromAccessID(AccessID));
   }
 
   if(!globalsMap.empty()) {
@@ -566,16 +566,16 @@ void CudaCodeGen::generateStencilRunMethod(
       // in some cases (where there are no horizontal extents) we dont use the special tmp index
       // iterator, but rather a normal 3d field index iterator. In that case we pass temporaries in
       // the same manner as normal fields
-      if(CodeGeneratorHelper::useNormalIteratorForTmp(multiStagePtr)) {
+      if(!CodeGeneratorHelper::useTemporaries(multiStagePtr->getParent(), stencilInstantiation)) {
         const auto fieldName =
             stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
 
         args = args + ", (" + fieldName + ".data()+" + "m_" + fieldName +
                ".get_storage_info_ptr()->index(" + fieldName + ".begin<0>(), " + fieldName +
                ".begin<1>()," + fieldName + ".begin<2>()," + fieldName + ".begin<3>(), 0))";
       } else {
-        args =
-            args + "," + stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
+        args = args + "," +
+               stencilInstantiation->getFieldNameFromAccessID((*field).second.getAccessID());
       }
     }
 

diff --git a/src/dawn/CodeGen/Cuda/MSCodeGen.cpp b/src/dawn/CodeGen/Cuda/MSCodeGen.cpp
@@ -30,30 +30,12 @@ MSCodeGen::MSCodeGen(std::stringstream& ss, const std::unique_ptr<iir::MultiStag
                      const CacheProperties& cacheProperties)
     : ss_(ss), ms_(ms), stencilInstantiation_(stencilInstantiation),
       cacheProperties_(cacheProperties),
+      useCodeGenTemporaries_(
+          CodeGeneratorHelper::useTemporaries(ms->getParent(), stencilInstantiation) &&
+          ms->hasMemAccessTemporaries()),
+      cudaKernelName_(CodeGeneratorHelper::buildCudaKernelName(stencilInstantiation_, ms_)),
       blockSize_(stencilInstantiation_->getIIR()->getBlockSize()),
-      solveKLoopInParallel_(CodeGeneratorHelper::solveKLoopInParallel(ms_)) {
-
-  // useTmpIndex_
-  const auto& fields = ms_->getFields();
-  const bool containsTemporary =
-      (find_if(fields.begin(), fields.end(), [&](const std::pair<int, iir::Field>& field) {
-         const int accessID = field.second.getAccessID();
-         if(!stencilInstantiation_->isTemporaryField(accessID))
-           return false;
-         // we dont need to initialize tmp indices for fields that are cached
-         if(!cacheProperties_.accessIsCached(accessID))
-           return true;
-         const auto& cache = ms_->getCache(accessID);
-         if(cache.getCacheIOPolicy() == iir::Cache::CacheIOPolicy::local) {
-           return false;
-         }
-         return true;
-       }) != fields.end());
-
-  useTmpIndex_ = containsTemporary && !CodeGeneratorHelper::useNormalIteratorForTmp(ms_);
-
-  cudaKernelName_ = CodeGeneratorHelper::buildCudaKernelName(stencilInstantiation_, ms_);
-}
+      solveKLoopInParallel_(CodeGeneratorHelper::solveKLoopInParallel(ms_)) {}
 
 void MSCodeGen::generateIJCacheDecl(MemberFunction& kernel) const {
   for(const auto& cacheP : ms_->getCaches()) {
@@ -117,7 +99,7 @@ MSCodeGen::computeNextLevelToProcess(const iir::Interval& interval,
 
 void MSCodeGen::generateTmpIndexInit(MemberFunction& kernel) const {
 
-  if(!useTmpIndex_)
+  if(!useCodeGenTemporaries_)
     return;
 
   auto maxExtentTmps = CodeGeneratorHelper::computeTempMaxWriteExtent(*(ms_->getParent()));
@@ -698,23 +680,14 @@ void MSCodeGen::generateCudaKernelCode() {
   // of
   // tmp storages (allocation, iterators, etc)
   auto tempFieldsNonLocalCached =
-      makeRange(fields, std::function<bool(std::pair<int, iir::Field> const&)>([&](
-                            std::pair<int, iir::Field> const& p) {
-                  const int accessID = p.first;
-                  if(!stencilInstantiation_->isTemporaryField(p.second.getAccessID()))
-                    return false;
-                  if(!cacheProperties_.accessIsCached(accessID))
-                    return true;
-                  if(ms_->getCache(accessID).getCacheIOPolicy() == iir::Cache::CacheIOPolicy::local)
-                    return false;
-
-                  return true;
-                }));
-
-  const bool containsTemporary = !tempFieldsNonLocalCached.empty();
+      makeRange(fields, std::function<bool(std::pair<int, iir::Field> const&)>(
+                            [&](std::pair<int, iir::Field> const& p) {
+                              const int accessID = p.first;
+                              return ms_->isMemAccessTemporary(accessID);
+                            }));
 
   std::string fnDecl = "";
-  if(containsTemporary && useTmpIndex_)
+  if(useCodeGenTemporaries_)
     fnDecl = "template<typename TmpStorage>";
   fnDecl = fnDecl + "__global__ void";
 
@@ -723,9 +696,13 @@ void MSCodeGen::generateCudaKernelCode() {
                        (maxExtents[0].Minus < 0 ? 1 : 0) + (maxExtents[0].Plus > 0 ? 1 : 0));
 
   int nSM = stencilInstantiation_->getOptimizerContext()->getOptions().nsms;
+  int maxBlocksPerSM = stencilInstantiation_->getOptimizerContext()->getOptions().maxBlocksPerSM;
 
   std::string domain_size = stencilInstantiation_->getOptimizerContext()->getOptions().domain_size;
   if(nSM > 0 && !domain_size.empty()) {
+    if(maxBlocksPerSM <= 0) {
+      throw std::runtime_error("--max-blocks-sm must be defined");
+    }
     std::istringstream idomain_size(domain_size);
     std::string arg;
     getline(idomain_size, arg, ',');
@@ -739,7 +716,7 @@ void MSCodeGen::generateCudaKernelCode() {
     minBlocksPerSM /= nSM;
 
     fnDecl = fnDecl + " __launch_bounds__(" + std::to_string(maxThreadsPerBlock) + "," +
-             std::to_string(minBlocksPerSM) + ") ";
+             std::to_string(std::min(maxBlocksPerSM, minBlocksPerSM)) + ") ";
   } else {
     fnDecl = fnDecl + " __launch_bounds__(" + std::to_string(maxThreadsPerBlock) + ") ";
   }
@@ -763,19 +740,21 @@ void MSCodeGen::generateCudaKernelCode() {
 
   // first we construct non temporary field arguments
   for(auto field : nonTempFields) {
-    cudaKernel.addArg("gridtools::clang::float_type * const " +
-                      stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
+    cudaKernel.addArg(
+        "gridtools::clang::float_type * const " +
+        stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
   }
 
   // then the temporary field arguments
   for(auto field : tempFieldsNonLocalCached) {
-    if(useTmpIndex_) {
-      cudaKernel.addArg(c_gt() + "data_view<TmpStorage>" +
-                        stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()) +
-                        "_dv");
+    if(useCodeGenTemporaries_) {
+      cudaKernel.addArg(
+          c_gt() + "data_view<TmpStorage>" +
+          stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()) + "_dv");
     } else {
-      cudaKernel.addArg("gridtools::clang::float_type * const " +
-                        stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
+      cudaKernel.addArg(
+          "gridtools::clang::float_type * const " +
+          stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID()));
     }
   }
 
@@ -786,7 +765,7 @@ void MSCodeGen::generateCudaKernelCode() {
   cudaKernel.addComment("Start kernel");
 
   // extract raw pointers of temporaries from the data views
-  if(useTmpIndex_) {
+  if(useCodeGenTemporaries_) {
     for(auto field : tempFieldsNonLocalCached) {
       std::string fieldName =
           stencilInstantiation_->getFieldNameFromAccessID((*field).second.getAccessID());
@@ -917,9 +896,7 @@ void MSCodeGen::generateCudaKernelCode() {
     generateIJCacheIndexInit(cudaKernel);
   }
 
-  if(containsTemporary) {
-    generateTmpIndexInit(cudaKernel);
-  }
+  generateTmpIndexInit(cudaKernel);
 
   // compute the partition of the intervals
   auto partitionIntervals = CodeGeneratorHelper::computePartitionOfIntervals(ms_);
@@ -959,7 +936,7 @@ void MSCodeGen::generateCudaKernelCode() {
                                   intervalDiffToString(kmin, "ksize - 1") + ")");
         }
       }
-      if(useTmpIndex_ && !kmin.null() && !((solveKLoopInParallel_) && firstInterval)) {
+      if(useCodeGenTemporaries_ && !kmin.null() && !((solveKLoopInParallel_) && firstInterval)) {
         cudaKernel.addComment("jump tmp iterators to match the beginning of next interval");
         cudaKernel.addStatement("idx_tmp += kstride_tmp*(" +
                                 intervalDiffToString(kmin, "ksize - 1") + ")");
@@ -986,12 +963,12 @@ void MSCodeGen::generateCudaKernelCode() {
           }
         }
       }
-      if(useTmpIndex_) {
+      if(useCodeGenTemporaries_) {
         cudaKernel.addComment("jump tmp iterators to match the intersection of beginning of next "
                               "interval and the parallel execution block ");
         cudaKernel.addStatement("idx_tmp += max(" + intervalDiffToString(kmin, "ksize - 1") +
-                                ", kstride_tmp * blockIdx.z * " + std::to_string(blockSize_[2]) +
-                                ")");
+                                ", blockIdx.z * " + std::to_string(blockSize_[2]) +
+                                ") * kstride_tmp");
       }
     }
 
@@ -1075,7 +1052,7 @@ void MSCodeGen::generateCudaKernelCode() {
                                   CodeGeneratorHelper::generateStrideName(2, index.second));
         }
       }
-      if(useTmpIndex_) {
+      if(useCodeGenTemporaries_) {
         cudaKernel.addStatement("idx_tmp " + incStr + " kstride_tmp");
       }
     });

diff --git a/src/dawn/CodeGen/Cuda/MSCodeGen.hpp b/src/dawn/CodeGen/Cuda/MSCodeGen.hpp
@@ -49,7 +49,7 @@ class MSCodeGen {
   const std::unique_ptr<iir::MultiStage>& ms_;
   const std::shared_ptr<iir::StencilInstantiation> stencilInstantiation_;
   const CacheProperties& cacheProperties_;
-  bool useTmpIndex_;
+  bool useCodeGenTemporaries_;
   std::string cudaKernelName_;
   Array3ui blockSize_;
   const bool solveKLoopInParallel_;

diff --git a/src/dawn/Compiler/Options.inc b/src/dawn/Compiler/Options.inc
@@ -42,6 +42,8 @@ OPT(std::string, Backend, "gridtools", "backend", "",
     "\n - cuda        = optimized cuda", "<backend>", true, false)
 OPT(int, nsms, 0, "nsms", "",
     "Number of (CUDA) SMs", "<nsms>", true, false)
+OPT(int, maxBlocksPerSM, 0, "max-blocks-sm", "",
+    "Maximum number of blocks that can be registered per SM", "<max-blocks-sm>", true, false)
 OPT(std::string, domain_size, "", "domain-size", "",
     "domain size for compiler optimization", "", true, false)
 OPT(std::string, block_size, "", "block-size", "",

diff --git a/src/dawn/IIR/Cache.cpp b/src/dawn/IIR/Cache.cpp
@@ -67,6 +67,10 @@ Interval Cache::getWindowInterval(Interval::Bound bound) const {
   return interval_->crop(bound, {window_->m_m, window_->m_p});
 }
 
+bool Cache::requiresMemMemoryAccess() const {
+  return (policy_ != CacheIOPolicy::local) || (type_ == CacheTypeKind::bypass);
+}
+
 json::json Cache::jsonDump() const {
   json::json node;
   node["accessid"] = AccessID_;