Fix main GEMM matmul load balancing bottleneck. 0.6 -> 2 TFlops (#69)
* Higher sampling to allow proper warmup of GEMM bench

* Enforce a static schedule; Laser can reach 2.68 TFlops

* Default to Intel OpenMP for MKL (3.2 TFlops) and provide cfg files for GNU OpenMP and Intel TBB

* Don't test the full GEMM

* Reaching 2 TFlops! Parallelizing 3 loops (see the loop-nest sketch below)
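For orientation, a minimal loop-nest sketch of the BLIS-style GEMM structure these benchmarks follow (not taken from the commit: block sizes are placeholders, packing and edge handling are omitted). The hunks below parallelize the ic loop (OpenMP with a static schedule) and the ir loop (Weave's parallelForStrided); the third parallelized loop is not visible in the excerpts shown here.

# Editorial sketch: only the loop shape matters.
const
  nc = 256   # columns of C per outer block
  kc = 128   # depth of each packed panel
  mc = 64    # rows of C per block
  nr = 8     # micro-tile width
  mr = 6     # micro-tile height

proc gemmLoopNest(M, N, K: int) =
  for jc in countup(0, N-1, nc):            # 1. over column panels of C
    for pc in countup(0, K-1, kc):          # 2. over the K dimension (B is packed here)
      for ic in countup(0, M-1, mc):        # 3. over row panels of C    <- omp_for, schedule="static"
        for jr in countup(0, nc-1, nr):     # 4. over the packed B panel
          for ir in countup(0, mc-1, mr):   # 5. over the packed A panel <- parallelForStrided
            discard # micro-kernel: update an mr x nr tile of C

gemmLoopNest(1920, 1920, 1920)              # M = N = K = 16*6*20, as in the benchmark config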
mratsim authored on Dec 27, 2019 · 1 parent c898c41 · commit 3239bc1
Showing 12 changed files with 29 additions and 14 deletions.
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/gemm_bench_config.nim
@@ -7,7 +7,7 @@ const
   M* = 16*6*20
   K* = 16*6*20
   N* = 16*6*20
-  NbSamples* = 10 # This might stresss the allocator when packing if the matrices are big
+  NbSamples* = 300 # This might stresss the allocator when packing if the matrices are big
   CpuGhz = 3.5 # i9-9980XE OC All turbo 4.1GHz (AVX2 4.0GHz, AVX512 3.5GHz)
   NumCpuCores = 18
   VectorWidth = 16 # 8 float32 for AVX2, 16 for AVX512
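For context, a back-of-envelope check of what these constants imply (not part of the diff; it assumes the i9-9980XE's two AVX-512 FMA units per core):

# Peak estimate (assumption: 2 FMA units per core, each doing a fused
# multiply-add, i.e. 4 flops per float32 lane per cycle).
const
  CpuGhz        = 3.5
  NumCpuCores   = 18
  VectorWidth   = 16      # float32 lanes in an AVX-512 register
  FlopsPerCycle = 2 * 2   # 2 FMA ports x (1 mul + 1 add)

echo "Theoretical peak: ",
     CpuGhz * float(NumCpuCores * VectorWidth * FlopsPerCycle), " GFlop/s"
# Prints 4032.0: the commit's 2 TFlops is roughly half of peak,
# and MKL's 3.2 TFlops about 80%.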
@@ -160,7 +160,7 @@ proc gemm_impl[T; ukernel: static MicroKernel](
   omp_parallel_if(parallelize):
     # ####################################
     # 3. for ic = 0,...,m−1 in steps of mc
-    omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true):
+    omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true, schedule="static"):
       let packA = tiles.a + icb * tiles.upanelA_size
       prefetch(packA, Write, LowTemporalLocality)
       let ic = icb * tiles.mc
3 changes: 2 additions & 1 deletion benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim
@@ -81,7 +81,8 @@ proc gebp_mkernel*[T; ukernel: static MicroKernel](
 
     # ###################################
     # 5. for ir = 0,...,m−1 in steps of mr
-    for ir in countup(0, mc-1, MR):
+    parallelForStrided ir in 0 ..< mc, stride = MR:
+      captures: {nr, jr, mc, nc, kc, alpha, packA, packB, beta, mcncC}
       let mr = min(mc - ir, MR)
       let c_aux = mcncC.stride(ir, jr) # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr]
 
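For readers new to Weave, a minimal usage sketch of the same parallelForStrided/captures pattern (not from the repo; it assumes the Weave runtime has been started, e.g. with init(Weave), and buf, len, alpha, Chunk are made-up names):

import weave

proc scaleChunked(buf: ptr UncheckedArray[float32], len: int, alpha: float32) =
  const Chunk = 8
  # One task per stride-sized chunk; captured values are copied into each task.
  parallelForStrided i in 0 ..< len, stride = Chunk:
    captures: {buf, len, alpha}
    for j in i ..< min(i + Chunk, len):
      buf[j] *= alpha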
4 changes: 3 additions & 1 deletion benchmarks/matmul_gemm_blas/gemm_pure_nim/openmp.nim
@@ -127,6 +127,7 @@ template omp_for*(
       index: untyped,
       length: Natural,
       use_simd, nowait: static bool,
+      schedule: static string,
       body: untyped
     ) =
   ## OpenMP for loop (not parallel)
@@ -153,7 +154,8 @@
   const omp_annotation = block:
     "for " &
       (when use_simd: "simd " else: "") &
-      (when nowait: "nowait " else: "")
+      (when nowait: "nowait " else: "") &
+      "schedule(" & schedule & ')'
   for `index`{.inject.} in `||`(0, length-1, omp_annotation):
     block: body
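Roughly what the template produces: the annotation string is handed to Nim's built-in `||` OpenMP iterator, which forwards it into the #pragma omp emitted on the C loop. A standalone sketch (not from the repo; inside the GEMM the annotation is a bare worksharing "for ... nowait schedule(static)" because an enclosing omp parallel region already exists, so this sketch uses "parallel for" instead; it assumes compilation with OpenMP enabled, e.g. --passC:-fopenmp --passL:-fopenmp):

var data = newSeq[float32](10_000)

# schedule(static) hands each OpenMP thread a fixed, contiguous range of
# iterations instead of whatever the default schedule happens to be.
for i in `||`(0, data.len - 1, "parallel for schedule(static)"):
  data[i] = float32(i)

echo data[9_999]   # 9999.0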

2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/laser_omp_gemm.nim
@@ -41,7 +41,7 @@ when isMainModule:
   import std/[random, sequtils]
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("Laser (Pure Nim) + OpenMP", float32, (M, K), (K, N))
 
   block:
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/mkl_gemm.nim
@@ -47,7 +47,7 @@ when isMainModule:
   import std/[random, sequtils]
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("Intel MKL", float32, (M, K), (K, N))
 
   block:
4 changes: 2 additions & 2 deletions benchmarks/matmul_gemm_blas/mkl_gemm.nim.cfg
@@ -1,6 +1,6 @@
 clibdir:"/opt/intel/mkl/lib/intel64"
 passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a"
 passl:"-lmkl_core"
-passl:"-lmkl_gnu_thread"
-passl:"-lgomp"
+passl:"-lmkl_intel_thread"
+passl:"-liomp5"
 dynlibOverride:"mkl_intel_lp64"
6 changes: 6 additions & 0 deletions benchmarks/matmul_gemm_blas/mkl_gemm_gnu_omp.nim.cfg
@@ -0,0 +1,6 @@
+clibdir:"/opt/intel/mkl/lib/intel64"
+passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a"
+passl:"-lmkl_core"
+passl:"-lmkl_gnu_thread"
+passl:"-lgomp"
+dynlibOverride:"mkl_intel_lp64"
6 changes: 6 additions & 0 deletions benchmarks/matmul_gemm_blas/mkl_gemm_tbb.nim.cfg
@@ -0,0 +1,6 @@
+clibdir:"/opt/intel/mkl/lib/intel64"
+passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a"
+passl:"-lmkl_core"
+passl:"-lmkl_tbb_thread"
+passl:"-ltbb"
+dynlibOverride:"mkl_intel_lp64"
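Nim automatically reads a <module>.nim.cfg sitting next to the module it compiles, so mkl_gemm.nim now links MKL's Intel-OpenMP threading layer by default, while the two new cfg files record the GNU-OpenMP and Intel-TBB link lines; to benchmark one of those variants, its contents would have to be swapped into mkl_gemm.nim.cfg or passed on the command line. A typical invocation would look something like: nim c -d:danger -r benchmarks/matmul_gemm_blas/mkl_gemm.nim (the -d:danger flag is an assumption, not taken from this commit).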
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/openblas_gemm.nim
@@ -46,7 +46,7 @@ when isMainModule:
   import std/[random, sequtils]
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("OpenBLAS", float32, (M, K), (K, N))
 
   block:
2 changes: 1 addition & 1 deletion benchmarks/matmul_gemm_blas/weave_gemm.nim
@@ -42,7 +42,7 @@ when isMainModule:
   import ../../weave
 
   randomize(42) # FOr reproducibility
-  warmup()
+  # warmup()
   reportConfig("Weave (Pure Nim)", float32, (M, K), (K, N))
 
   block:
8 changes: 4 additions & 4 deletions weave.nimble
@@ -61,8 +61,8 @@ task test, "Run Weave tests":
   test "", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
     test "", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
-  when not defined(windows): # This is too slow on Azure windows machines
-    test "", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
+  # This is too slow
+  # test "", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
 
   test "-d:WV_LazyFlowvar", "benchmarks/dfs/weave_dfs.nim"
   test "-d:WV_LazyFlowvar", "benchmarks/fibonacci/weave_fib.nim"
@@ -74,5 +74,5 @@ task test, "Run Weave tests":
   test "-d:WV_LazyFlowvar", "benchmarks/bouncing_producer_consumer/weave_bpc.nim"
   when defined(i386) or defined(amd64):
     test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim"
-  when not defined(windows): # This is too slow on Azure windows machines
-    test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
+  # This is too slow on Azure windows machines
+  # test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/weave_gemm.nim"
