Add SPC benchmark

mratsim committed Nov 30, 2019
1 parent 6186810 commit 91ecde4
Showing 6 changed files with 168 additions and 15 deletions.
6 changes: 3 additions & 3 deletions benchmarks/fibonacci/weave_fib.nim
@@ -11,12 +11,12 @@ proc fib(n: int): int =
   result = sync(x) + y

 proc main() =
-  init(Runtime)
+  init(Weave)

   let f = fib(40)

-  sync(Runtime)
-  exit(Runtime)
+  sync(Weave)
+  exit(Weave)

   echo f

7 changes: 7 additions & 0 deletions benchmarks/single_task_producer/README.md
@@ -0,0 +1,7 @@
# Simple single-producer multiple-consumer benchmarks

SPC: a Simple Producer-Consumer benchmark.

A single worker produces n tasks,
each running for t microseconds. This benchmark allows us to test how many
concurrent consumers a single producer can sustain.
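For example, `spc 1000 100` produces 1000 tasks of 100 µs each; when the optional third argument is omitted, the polling interval defaults to the task granularity, which effectively disables polling (see `print_usage` below).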
106 changes: 106 additions & 0 deletions benchmarks/single_task_producer/weave_spc.nim
@@ -0,0 +1,106 @@
import
  # STD lib
  os, strutils, system/ansi_c,
  # Library
  ../../weave,
  # bench
  ../wtime

var NumTasksTotal: int32
var TaskGranularity: int32 # microseconds
var PollInterval: float64  # microseconds

proc print_usage() =
  # If not specified, the polling interval is set to the value
  # of the task granularity, which effectively disables polling
  echo "Usage: spc <number of tasks> <task granularity (us)> " &
       "[polling interval (us)]"

var global_poll_elapsed {.threadvar.}: float64

template dummy_cpt(): untyped =
  # Dummy computation
  # Calculate fib(30) iteratively
  var
    fib = 0
    f2 = 0
    f1 = 1
  for i in 2 .. 30:
    fib = f1 + f2
    f2 = f1
    f1 = fib

proc spc_consume(usec: int32) =

  var pollElapsed = 0'f64

  let start = Wtime_usec()
  let stop = usec.float64
  global_poll_elapsed = PollInterval

  while true:
    # Time spent polling is subtracted from the task's running time
    var elapsed = Wtime_usec() - start
    elapsed = elapsed - pollElapsed
    if elapsed >= stop:
      break

    dummy_cpt()

    # Poll for load balancing roughly every PollInterval microseconds
    if elapsed >= global_poll_elapsed:
      let pollStart = Wtime_usec()
      loadBalance(Weave)
      pollElapsed += Wtime_usec() - pollStart
      global_poll_elapsed += PollInterval

  # c_printf("Elapsed: %.2lfus\n", elapsed)

proc spc_consume_nopoll(usec: int32) =

  let start = Wtime_usec()
  let stop = usec.float64

  while true:
    var elapsed = Wtime_usec() - start
    if elapsed >= stop:
      break

    dummy_cpt()

  # c_printf("Elapsed: %.2lfus\n", elapsed)

proc spc_produce(n: int32) =
  for i in 0 ..< n:
    spawn spc_consume(TaskGranularity)

proc spc_produce_seq(n: int32) =
  for i in 0 ..< n:
    spc_consume_nopoll(TaskGranularity)

proc main() =
  if paramCount() notin 2..3:
    print_usage()
    quit 0

  NumTasksTotal = paramStr(1).parseInt.int32
  TaskGranularity = paramStr(2).parseInt.int32
  PollInterval = if paramCount() == 3: paramStr(3).parseInt.float64
                 else: TaskGranularity.float64

  init(Weave)

  let start = Wtime_msec()

  # spc_produce_seq(NumTasksTotal)
  spc_produce(NumTasksTotal)
  sync(Weave)

  let stop = Wtime_msec()

  c_printf("Elapsed wall time: %.2lf ms (%d us per task)\n", stop-start, TaskGranularity)

  sync(Weave)
  exit(Weave)

  quit 0

main()
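A hypothetical way to build and run this benchmark from the repository root (assuming a standard Nim toolchain; the flags and arguments are illustrative, not part of this commit): `nim c -r -d:release benchmarks/single_task_producer/weave_spc.nim 1000 100 50`.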
5 changes: 3 additions & 2 deletions weave.nim
@@ -10,9 +10,10 @@ import
   weave/datatypes/flowvars

 export
-  Flowvar, Runtime,
+  Flowvar, Weave,
   spawn, sync,
-  init, exit
+  init, exit,
+  loadBalance

 # TODO, those are workaround for not binding symbols in spawn macro
 export
12 changes: 6 additions & 6 deletions weave/async.nim
@@ -149,13 +149,13 @@ when isMainModule:
     stdout.write(" - SUCCESS\n")

   proc main() =
-    init(Runtime)
+    init(Weave)

     spawn display_int(123456)
     spawn display_int(654321)

-    sync(Runtime)
-    exit(Runtime)
+    sync(Weave)
+    exit(Weave)

   # main()

@@ -173,12 +173,12 @@

   proc main2() =

-    init(Runtime)
+    init(Weave)

     let f = async_fib(20)

-    sync(Runtime)
-    exit(Runtime)
+    sync(Weave)
+    exit(Weave)

     echo f

47 changes: 43 additions & 4 deletions weave/runtime.nim
@@ -21,9 +21,9 @@ import
 # Runtime public routines
 # ----------------------------------------------------------------------------------

-type Runtime* = object
+type Weave* = object

-proc init*(_: type Runtime) =
+proc init*(_: type Weave) =
   # TODO detect Hyper-Threading and NUMA domain

   if existsEnv"WEAVE_NUM_THREADS":
@@ -84,7 +84,7 @@ proc globalCleanup() =
   metrics:
     log("+========================================+\n")

-proc exit*(_: type Runtime) =
+proc exit*(_: type Weave) =
   signalTerminate(nil)
   localCtx.signaledTerminate = true

@@ -97,7 +97,7 @@ proc exit*(_: type Runtime) =
   threadLocalCleanup()
   globalCleanup()

-proc sync*(_: type Runtime) =
+proc sync*(_: type Weave) =
   ## Global barrier for the Picasso runtime
   ## This is only valid in the root task
   Worker: return
@@ -178,3 +178,42 @@ proc sync*(_: type Runtime) =

   debugTermination:
     log(">>> Worker %2d leaves barrier <<<\n", myID())
+
+proc loadBalance*(_: type Weave) =
+  ## Ensures that the current thread shares work with other threads.
+  ##
+  ## This is done automatically at synchronization points
+  ## like task spawn and sync, but long-running unbalanced tasks
+  ## with no synchronization points need to call it explicitly.
+  ##
+  ## For example, this can be placed at the end of a for-loop.
+  #
+  # Design notes:
+  # This is a leaky abstraction of Weave's design: busy workers
+  # are the ones distributing tasks. However, in the IO world,
+  # adding poll() calls is widely used as well.
+  #
+  # It's arguably a much better way of handling:
+  # - load imbalance
+  # - the wide range of user CPUs
+  # - dynamic environment changes (maybe there are other programs)
+  # - generic functions on collections like map
+  #
+  # than asking the developer to guesstimate the task size and
+  # hardcode the task granularity, as some popular frameworks require.
+  #
+  # See: PyTorch troubles with OpenMP
+  #      "Should we use OpenMP when we have 1000 or 80000 items in an array?"
+  #      https://github.com/zy97140/omp-benchmark-for-pytorch
+  #      It depends on:
+  #      - is it a dual-core CPU,
+  #      - or a 4-socket, 100+ core server-grade CPU?
+  #      - are you doing addition,
+  #      - or exponentiation?
+
+  shareWork()
+
+  # Serve any pending steal requests from thieves
+  if myThieves().peek() != 0:
+    var req: StealRequest
+    while recv(req):
+      dispatchTasks(req)
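
To illustrate the intended usage, here is a minimal sketch (not part of this commit) of a long-running task that calls `loadBalance` periodically, mirroring the pattern of `spc_consume` above; `burnCpu` and the iteration counts are hypothetical:

import weave

proc burnCpu(iters: int) =
  # Long-running, unbalanced work with no spawn/sync inside
  var acc = 0
  for i in 0 ..< iters:
    acc += i
    if i mod 10_000 == 0:
      loadBalance(Weave) # give idle workers a chance to steal queued tasks

proc main() =
  init(Weave)
  for i in 0 ..< 100:
    spawn burnCpu(10_000_000)
  sync(Weave)
  exit(Weave)

main()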
