Fix main GEMM matmul load balancing bottleneck. 0.6 -> 2 TFlops (#69)
* Higher sampling to allow proper warmup of GEMM bench

* Enforce a static schedule; Laser can reach 2.68 TFlops

* Default to Intel OpenMP for MKL (3.2 TFlops) and provide cfg files for GNU OpenMP and Intel TBB

* Don't test the full GEMM

* Reaching 2 TFlops! Parallelizing 3 loops (see the loop-nest sketch below)
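For orientation, a minimal loop-nest sketch of the BLIS-style GEMM structure these benchmarks follow (not taken from the commit: block sizes are placeholders, packing and edge handling are omitted). The hunks below parallelize the ic loop (OpenMP with a static schedule) and the ir loop (Weave's parallelForStrided); the third parallelized loop is not visible in the excerpts shown here.

# Editorial sketch: only the loop shape matters.
const
  nc = 256   # columns of C per outer block
  kc = 128   # depth of each packed panel
  mc = 64    # rows of C per block
  nr = 8     # micro-tile width
  mr = 6     # micro-tile height

proc gemmLoopNest(M, N, K: int) =
  for jc in countup(0, N-1, nc):            # 1. over column panels of C
    for pc in countup(0, K-1, kc):          # 2. over the K dimension (B is packed here)
      for ic in countup(0, M-1, mc):        # 3. over row panels of C    <- omp_for, schedule="static"
        for jr in countup(0, nc-1, nr):     # 4. over the packed B panel
          for ir in countup(0, mc-1, mr):   # 5. over the packed A panel <- parallelForStrided
            discard # micro-kernel: update an mr x nr tile of C

gemmLoopNest(1920, 1920, 1920)              # M = N = K = 16*6*20, as in the benchmark config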
mratsim authored on Dec 27, 2019 · 1 parent c898c41 · commit 3239bc1
Showing 12 changed files with 29 additions and 14 deletions.
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/gemm_bench_config.nim
@@ -7,7 +7,7 @@ const
   M* = 16*6*20
   K* = 16*6*20
   N* = 16*6*20
-  NbSamples* = 10 # This might stresss the allocator when packing if the matrices are big
+  NbSamples* = 300 # This might stresss the allocator when packing if the matrices are big
   CpuGhz = 3.5 # i9-9980XE OC All turbo 4.1GHz (AVX2 4.0GHz, AVX512 3.5GHz)
   NumCpuCores = 18
   VectorWidth = 16 # 8 float32 for AVX2, 16 for AVX512
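For context, a back-of-envelope check of what these constants imply (not part of the diff; it assumes the i9-9980XE's two AVX-512 FMA units per core):

# Peak estimate (assumption: 2 FMA units per core, each doing a fused
# multiply-add, i.e. 4 flops per float32 lane per cycle).
const
  CpuGhz        = 3.5
  NumCpuCores   = 18
  VectorWidth   = 16      # float32 lanes in an AVX-512 register
  FlopsPerCycle = 2 * 2   # 2 FMA ports x (1 mul + 1 add)

echo "Theoretical peak: ",
     CpuGhz * float(NumCpuCores * VectorWidth * FlopsPerCycle), " GFlop/s"
# Prints 4032.0: the commit's 2 TFlops is roughly half of peak,
# and MKL's 3.2 TFlops about 80%.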
@@ -160,7 +160,7 @@ proc gemm_impl[T; ukernel: static MicroKernel](
   omp_parallel_if(parallelize):
     # ####################################
     # 3. for ic = 0,...,m−1 in steps of mc
-    omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true):
+    omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true, schedule="static"):
       let packA = tiles.a + icb * tiles.upanelA_size
       prefetch(packA, Write, LowTemporalLocality)
       let ic = icb * tiles.mc
3 changes: 2 additions & 1 deletion benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim
@@ -81,7 +81,8 @@ proc gebp_mkernel*[T; ukernel: static MicroKernel](
 
     # ###################################
     # 5. for ir = 0,...,m−1 in steps of mr
-    for ir in countup(0, mc-1, MR):
+    parallelForStrided ir in 0 ..< mc, stride = MR:
+      captures: {nr, jr, mc, nc, kc, alpha, packA, packB, beta, mcncC}
       let mr = min(mc - ir, MR)
       let c_aux = mcncC.stride(ir, jr) # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr]
 
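For readers new to Weave, a minimal usage sketch of the same parallelForStrided/captures pattern (not from the repo; it assumes the Weave runtime has been started, e.g. with init(Weave), and buf, len, alpha, Chunk are made-up names):

import weave

proc scaleChunked(buf: ptr UncheckedArray[float32], len: int, alpha: float32) =
  const Chunk = 8
  # One task per stride-sized chunk; captured values are copied into each task.
  parallelForStrided i in 0 ..< len, stride = Chunk:
    captures: {buf, len, alpha}
    for j in i ..< min(i + Chunk, len):
      buf[j] *= alpha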
4 changes: 3 additions & 1 deletion benchmarks/matmul_gemm_blas/gemm_pure_nim/openmp.nim
@@ -127,6 +127,7 @@ template omp_for*(
       index: untyped,
       length: Natural,
       use_simd, nowait: static bool,
+      schedule: static string,
       body: untyped
     ) =
   ## OpenMP for loop (not parallel)
@@ -153,7 +154,8 @@
   const omp_annotation = block:
     "for " &
       (when use_simd: "simd " else: "") &
-      (when nowait: "nowait " else: "")
+      (when nowait: "nowait " else: "") &
+      "schedule(" & schedule & ')'
   for `index`{.inject.} in `||`(0, length-1, omp_annotation):
     block: body
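Roughly what the template produces: the annotation string is handed to Nim's built-in `||` OpenMP iterator, which forwards it into the #pragma omp emitted on the C loop. A standalone sketch (not from the repo; inside the GEMM the annotation is a bare worksharing "for ... nowait schedule(static)" because an enclosing omp parallel region already exists, so this sketch uses "parallel for" instead; it assumes compilation with OpenMP enabled, e.g. --passC:-fopenmp --passL:-fopenmp):

var data = newSeq[float32](10_000)

# schedule(static) hands each OpenMP thread a fixed, contiguous range of
# iterations instead of whatever the default schedule happens to be.
for i in `||`(0, data.len - 1, "parallel for schedule(static)"):
  data[i] = float32(i)

echo data[9_999]   # 9999.0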

2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/laser_omp_gemm.nim
@@ -41,7 +41,7 @@ when isMainModule:
   import std/[random, sequtils]
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("Laser (Pure Nim) + OpenMP", float32, (M, K), (K, N))
 
   block:
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/mkl_gemm.nim
@@ -47,7 +47,7 @@ when isMainModule:
   import std/[random, sequtils]
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("Intel MKL", float32, (M, K), (K, N))
 
   block:
4 changes: 2 additions & 2 deletions benchmarks/matmul_gemm_blas/mkl_gemm.nim.cfg
@@ -1,6 +1,6 @@
 clibdir:"/opt/intel/mkl/lib/intel64"
 passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a"
 passl:"-lmkl_core"
-passl:"-lmkl_gnu_thread"
-passl:"-lgomp"
+passl:"-lmkl_intel_thread"
+passl:"-liomp5"
 dynlibOverride:"mkl_intel_lp64"
6 changes: 6 additions & 0 deletions benchmarks/matmul_gemm_blas/mkl_gemm_gnu_omp.nim.cfg
@@ -0,0 +1,6 @@
+clibdir:"/opt/intel/mkl/lib/intel64"
+passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a"
+passl:"-lmkl_core"
+passl:"-lmkl_gnu_thread"
+passl:"-lgomp"
+dynlibOverride:"mkl_intel_lp64"
6 changes: 6 additions & 0 deletions benchmarks/matmul_gemm_blas/mkl_gemm_tbb.nim.cfg
@@ -0,0 +1,6 @@
+clibdir:"/opt/intel/mkl/lib/intel64"
+passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a"
+passl:"-lmkl_core"
+passl:"-lmkl_tbb_thread"
+passl:"-ltbb"
+dynlibOverride:"mkl_intel_lp64"
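Nim automatically reads a <module>.nim.cfg sitting next to the module it compiles, so mkl_gemm.nim now links MKL's Intel-OpenMP threading layer by default, while the two new cfg files record the GNU-OpenMP and Intel-TBB link lines; to benchmark one of those variants, its contents would have to be swapped into mkl_gemm.nim.cfg or passed on the command line. A typical invocation would look something like: nim c -d:danger -r benchmarks/matmul_gemm_blas/mkl_gemm.nim (the -d:danger flag is an assumption, not taken from this commit).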
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/openblas_gemm.nim
@@ -46,7 +46,7 @@ when isMainModule:
   import std/[random, sequtils]
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("OpenBLAS", float32, (M, K), (K, N))
 
   block:
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/weave_gemm.nim
@@ -42,7 +42,7 @@ when isMainModule:
   import ../../weave
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("Weave (Pure Nim)", float32, (M, K), (K, N))
 
   block:
8 changes: 4 additions & 4 deletions weave.nimble
@@ -61,8 +61,8 @@ task test, "Run Weave tests":
   test "", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
     test "", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
-  when not defined(windows): # This is too slow on Azure windows machines
-    test "", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
+  # This is too slow
+  # test "", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
 
   test "-d:WV_LazyFlowvar", "benchmarks/dfs/weave_dfs.nim"
   test "-d:WV_LazyFlowvar", "benchmarks/fibonacci/weave_fib.nim"
@@ -74,5 +74,5 @@ task test, "Run Weave tests":
   test "-d:WV_LazyFlowvar", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
     test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
-  when not defined(windows): # This is too slow on Azure windows machines
-    test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
+  # This is too slow on Azure windows machines
+  # test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
