diff --git a/benchmarks/matmul_gemm_blas/gemm_bench_config.nim b/benchmarks/matmul_gemm_blas/gemm_bench_config.nim index afc22b1..3e4bb82 100644 --- a/benchmarks/matmul_gemm_blas/gemm_bench_config.nim +++ b/benchmarks/matmul_gemm_blas/gemm_bench_config.nim @@ -7,7 +7,7 @@ const M* = 16*6*20 K* = 16*6*20 N* = 16*6*20 - NbSamples* = 10 # This might stresss the allocator when packing if the matrices are big + NbSamples* = 300 # This might stress the allocator when packing if the matrices are big CpuGhz = 3.5 # i9-9980XE OC All turbo 4.1GHz (AVX2 4.0GHz, AVX512 3.5GHz) NumCpuCores = 18 VectorWidth = 16 # 8 float32 for AVX2, 16 for AVX512 diff --git a/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_laser_omp.nim b/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_laser_omp.nim index dbcfe4f..7a9bf55 100644 --- a/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_laser_omp.nim +++ b/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_laser_omp.nim @@ -160,7 +160,7 @@ proc gemm_impl[T; ukernel: static MicroKernel]( omp_parallel_if(parallelize): # #################################### # 3. for ic = 0,...,m−1 in steps of mc - omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true): + omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true, schedule="static"): let packA = tiles.a + icb * tiles.upanelA_size prefetch(packA, Write, LowTemporalLocality) let ic = icb * tiles.mc diff --git a/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim b/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim index 2099be8..eee82d3 100644 --- a/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim +++ b/benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim @@ -81,7 +81,8 @@ proc gebp_mkernel*[T; ukernel: static MicroKernel]( # ################################### # 5. 
for ir = 0,...,m−1 in steps of mr - for ir in countup(0, mc-1, MR): + parallelForStrided ir in 0 ..< mc, stride = MR: + captures: {nr, jr, mc, nc, kc, alpha, packA, packB, beta, mcncC} let mr = min(mc - ir, MR) let c_aux = mcncC.stride(ir, jr) # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr] diff --git a/benchmarks/matmul_gemm_blas/gemm_pure_nim/openmp.nim b/benchmarks/matmul_gemm_blas/gemm_pure_nim/openmp.nim index 21af751..789cd5f 100644 --- a/benchmarks/matmul_gemm_blas/gemm_pure_nim/openmp.nim +++ b/benchmarks/matmul_gemm_blas/gemm_pure_nim/openmp.nim @@ -127,6 +127,7 @@ template omp_for*( index: untyped, length: Natural, use_simd, nowait: static bool, + schedule: static string, body: untyped ) = ## OpenMP for loop (not parallel) @@ -153,7 +154,8 @@ template omp_for*( const omp_annotation = block: "for " & (when use_simd: "simd " else: "") & - (when nowait: "nowait " else: "") + (when nowait: "nowait " else: "") & + "schedule(" & schedule & ')' for `index`{.inject.} in `||`(0, length-1, omp_annotation): block: body diff --git a/benchmarks/matmul_gemm_blas/laser_omp_gemm.nim b/benchmarks/matmul_gemm_blas/laser_omp_gemm.nim index 4501701..c3cc5b7 100644 --- a/benchmarks/matmul_gemm_blas/laser_omp_gemm.nim +++ b/benchmarks/matmul_gemm_blas/laser_omp_gemm.nim @@ -41,7 +41,7 @@ when isMainModule: import std/[random, sequtils] randomize(42) # FOr reproducibility - warmup() + # warmup() reportConfig("Laser (Pure Nim) + OpenMP", float32, (M, K), (K, N)) block: diff --git a/benchmarks/matmul_gemm_blas/mkl_gemm.nim b/benchmarks/matmul_gemm_blas/mkl_gemm.nim index 1c8c923..1e721ba 100644 --- a/benchmarks/matmul_gemm_blas/mkl_gemm.nim +++ b/benchmarks/matmul_gemm_blas/mkl_gemm.nim @@ -47,7 +47,7 @@ when isMainModule: import std/[random, sequtils] randomize(42) # FOr reproducibility - warmup() + # warmup() reportConfig("Intel MKL", float32, (M, K), (K, N)) block: diff --git a/benchmarks/matmul_gemm_blas/mkl_gemm.nim.cfg b/benchmarks/matmul_gemm_blas/mkl_gemm.nim.cfg index 
c3fca60..2836b1a 100644 --- a/benchmarks/matmul_gemm_blas/mkl_gemm.nim.cfg +++ b/benchmarks/matmul_gemm_blas/mkl_gemm.nim.cfg @@ -1,6 +1,6 @@ clibdir:"/opt/intel/mkl/lib/intel64" passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a" passl:"-lmkl_core" -passl:"-lmkl_gnu_thread" -passl:"-lgomp" +passl:"-lmkl_intel_thread" +passl:"-liomp5" dynlibOverride:"mkl_intel_lp64" diff --git a/benchmarks/matmul_gemm_blas/mkl_gemm_gnu_omp.nim.cfg b/benchmarks/matmul_gemm_blas/mkl_gemm_gnu_omp.nim.cfg new file mode 100644 index 0000000..c3fca60 --- /dev/null +++ b/benchmarks/matmul_gemm_blas/mkl_gemm_gnu_omp.nim.cfg @@ -0,0 +1,6 @@ +clibdir:"/opt/intel/mkl/lib/intel64" +passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a" +passl:"-lmkl_core" +passl:"-lmkl_gnu_thread" +passl:"-lgomp" +dynlibOverride:"mkl_intel_lp64" diff --git a/benchmarks/matmul_gemm_blas/mkl_gemm_tbb.nim.cfg b/benchmarks/matmul_gemm_blas/mkl_gemm_tbb.nim.cfg new file mode 100644 index 0000000..8110053 --- /dev/null +++ b/benchmarks/matmul_gemm_blas/mkl_gemm_tbb.nim.cfg @@ -0,0 +1,6 @@ +clibdir:"/opt/intel/mkl/lib/intel64" +passl:"/opt/intel/mkl/lib/intel64/libmkl_intel_lp64.a" +passl:"-lmkl_core" +passl:"-lmkl_tbb_thread" +passl:"-ltbb" +dynlibOverride:"mkl_intel_lp64" diff --git a/benchmarks/matmul_gemm_blas/openblas_gemm.nim b/benchmarks/matmul_gemm_blas/openblas_gemm.nim index 0bb2913..56dfea2 100644 --- a/benchmarks/matmul_gemm_blas/openblas_gemm.nim +++ b/benchmarks/matmul_gemm_blas/openblas_gemm.nim @@ -46,7 +46,7 @@ when isMainModule: import std/[random, sequtils] randomize(42) # FOr reproducibility - warmup() + # warmup() reportConfig("OpenBLAS", float32, (M, K), (K, N)) block: diff --git a/benchmarks/matmul_gemm_blas/weave_gemm.nim b/benchmarks/matmul_gemm_blas/weave_gemm.nim index 5e7640d..a857ca7 100644 --- a/benchmarks/matmul_gemm_blas/weave_gemm.nim +++ b/benchmarks/matmul_gemm_blas/weave_gemm.nim @@ -42,7 +42,7 @@ when isMainModule: import ../../weave randomize(42) # FOr reproducibility - 
warmup() + # warmup() reportConfig("Weave (Pure Nim)", float32, (M, K), (K, N)) block: diff --git a/weave.nimble b/weave.nimble index e67d0eb..0da78b4 100644 --- a/weave.nimble +++ b/weave.nimble @@ -61,8 +61,8 @@ task test, "Run Weave tests": test "", "benchmarks/bouncing_producer_consumer/weave_bpc.nim" when defined(i386) or defined(amd64): test "", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim" - when not defined(windows): # This is too slow on Azure windows machines - test "", "benchmarks/matmul_gemm_blas/weave_gemm.nim" + # This is too slow + # test "", "benchmarks/matmul_gemm_blas/weave_gemm.nim" test "-d:WV_LazyFlowvar", "benchmarks/dfs/weave_dfs.nim" test "-d:WV_LazyFlowvar", "benchmarks/fibonacci/weave_fib.nim" @@ -74,5 +74,5 @@ task test, "Run Weave tests": test "-d:WV_LazyFlowvar", "benchmarks/bouncing_producer_consumer/weave_bpc.nim" when defined(i386) or defined(amd64): test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/gemm_pure_nim/gemm_weave.nim" - when not defined(windows): # This is too slow on Azure windows machines - test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/weave_gemm.nim" + # This is too slow + # test "-d:WV_LazyFlowvar", "benchmarks/matmul_gemm_blas/weave_gemm.nim"