From 91ecde4b60d42b157d1926330985463ca3325b87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?=
Date: Sat, 30 Nov 2019 17:16:49 +0100
Subject: [PATCH] Add SPC benchmark

---
 benchmarks/fibonacci/weave_fib.nim            |   6 +-
 benchmarks/single_task_producer/README.md     |   7 ++
 benchmarks/single_task_producer/weave_spc.nim | 106 ++++++++++++++++++
 weave.nim                                     |   5 +-
 weave/async.nim                               |  12 +-
 weave/runtime.nim                             |  47 +++++++-
 6 files changed, 168 insertions(+), 15 deletions(-)
 create mode 100644 benchmarks/single_task_producer/README.md
 create mode 100644 benchmarks/single_task_producer/weave_spc.nim

diff --git a/benchmarks/fibonacci/weave_fib.nim b/benchmarks/fibonacci/weave_fib.nim
index 0eb587a..b25a51e 100644
--- a/benchmarks/fibonacci/weave_fib.nim
+++ b/benchmarks/fibonacci/weave_fib.nim
@@ -11,12 +11,12 @@ proc fib(n: int): int =
   result = sync(x) + y
 
 proc main() =
-  init(Runtime)
+  init(Weave)
 
   let f = fib(40)
 
-  sync(Runtime)
-  exit(Runtime)
+  sync(Weave)
+  exit(Weave)
 
   echo f
 
diff --git a/benchmarks/single_task_producer/README.md b/benchmarks/single_task_producer/README.md
new file mode 100644
index 0000000..6f3bab5
--- /dev/null
+++ b/benchmarks/single_task_producer/README.md
@@ -0,0 +1,7 @@
+# Simple single-producer multiple-consumers benchmarks
+
+SPC: a Simple Producer-Consumer benchmark.
+
+A single worker produces n tasks, each running for t microseconds.
+This benchmark allows us to test how many concurrent consumers
+a single producer can sustain.
diff --git a/benchmarks/single_task_producer/weave_spc.nim b/benchmarks/single_task_producer/weave_spc.nim
new file mode 100644
index 0000000..07ead5d
--- /dev/null
+++ b/benchmarks/single_task_producer/weave_spc.nim
@@ -0,0 +1,106 @@
+import
+  # STD lib
+  os, strutils, system/ansi_c,
+  # Library
+  ../../weave,
+  # bench
+  ../wtime
+
+var NumTasksTotal: int32
+var TaskGranularity: int32 # microseconds
+var PollInterval: float64  # microseconds
+
+proc print_usage() =
+  # If not specified, the polling interval is set to the value
+  # of the task granularity, which effectively disables polling.
+  echo "Usage: spc <number of tasks> <task granularity (us)> " &
+       "[polling interval (us)]"
+
+var global_poll_elapsed {.threadvar.}: float64
+
+template dummy_cpt(): untyped =
+  # Dummy computation
+  # Calculate fib(30) iteratively
+  var
+    fib = 0
+    f2 = 0
+    f1 = 1
+  for i in 2 .. 30:
+    fib = f1 + f2
+    f2 = f1
+    f1 = fib
+
+proc spc_consume(usec: int32) =
+
+  var pollElapsed = 0'f64
+
+  let start = Wtime_usec()
+  let stop = usec.float64
+  global_poll_elapsed = PollInterval
+
+  while true:
+    var elapsed = Wtime_usec() - start
+    elapsed = elapsed - pollElapsed
+    if elapsed >= stop:
+      break
+
+    dummy_cpt()
+
+    if elapsed >= global_poll_elapsed:
+      let pollStart = Wtime_usec()
+      loadBalance(Weave)
+      pollElapsed += Wtime_usec() - pollStart
+      global_poll_elapsed += PollInterval
+
+  # c_printf("Elapsed: %.2lfus\n", elapsed)
+
+proc spc_consume_nopoll(usec: int32) =
+
+  let start = Wtime_usec()
+  let stop = usec.float64
+
+  while true:
+    var elapsed = Wtime_usec() - start
+    if elapsed >= stop:
+      break
+
+    dummy_cpt()
+
+  # c_printf("Elapsed: %.2lfus\n", elapsed)
+
+proc spc_produce(n: int32) =
+  for i in 0 ..< n:
+    spawn spc_consume(TaskGranularity)
+
+proc spc_produce_seq(n: int32) =
+  for i in 0 ..< n:
+    spc_consume_nopoll(TaskGranularity)
+
+proc main() =
+  if paramCount() notin 2..3:
+    print_usage()
+    quit 0
+
+  NumTasksTotal = paramStr(1).parseInt.int32
+  TaskGranularity = paramStr(2).parseInt.int32
+  PollInterval = if paramCount() == 3: paramStr(3).parseInt.float64
+                 else: TaskGranularity.float64
+
+  init(Weave)
+
+  let start = Wtime_msec()
+
+  # spc_produce_seq(NumTasksTotal)
+  spc_produce(NumTasksTotal)
+  sync(Weave)
+
+  let stop = Wtime_msec()
+
+  c_printf("Elapsed wall time: %.2lf ms (%d us per task)\n", stop-start, TaskGranularity)
+
+  sync(Weave)
+  exit(Weave)
+
+  quit 0
+
+main()
diff --git a/weave.nim b/weave.nim
index 6e4ff49..6c75321 100644
--- a/weave.nim
+++ b/weave.nim
@@ -10,9 +10,10 @@ import
   weave/datatypes/flowvars
 
 export
-  Flowvar, Runtime,
+  Flowvar, Weave,
   spawn, sync,
-  init, exit
+  init, exit,
+  loadBalance
 
 # TODO, those are workaround for not binding symbols in spawn macro
 export
diff --git a/weave/async.nim b/weave/async.nim
index abc1011..33acd73 100644
--- a/weave/async.nim
+++ b/weave/async.nim
@@ -149,13 +149,13 @@ when isMainModule:
       stdout.write(" - SUCCESS\n")
 
   proc main() =
-    init(Runtime)
+    init(Weave)
 
     spawn display_int(123456)
     spawn display_int(654321)
 
-    sync(Runtime)
-    exit(Runtime)
+    sync(Weave)
+    exit(Weave)
 
   # main()
 
@@ -173,12 +173,12 @@
 
 
   proc main2() =
-    init(Runtime)
+    init(Weave)
 
     let f = async_fib(20)
 
-    sync(Runtime)
-    exit(Runtime)
+    sync(Weave)
+    exit(Weave)
 
     echo f
 
diff --git a/weave/runtime.nim b/weave/runtime.nim
index 4835d46..21482aa 100644
--- a/weave/runtime.nim
+++ b/weave/runtime.nim
@@ -21,9 +21,9 @@ import
 # Runtime public routines
 # ----------------------------------------------------------------------------------
 
-type Runtime* = object
+type Weave* = object
 
-proc init*(_: type Runtime) =
+proc init*(_: type Weave) =
   # TODO detect Hyper-Threading and NUMA domain
 
   if existsEnv"WEAVE_NUM_THREADS":
@@ -84,7 +84,7 @@ proc globalCleanup() =
   metrics:
     log("+========================================+\n")
 
-proc exit*(_: type Runtime) =
+proc exit*(_: type Weave) =
   signalTerminate(nil)
   localCtx.signaledTerminate = true
 
@@ -97,7 +97,7 @@ proc exit*(_: type Runtime) =
   threadLocalCleanup()
   globalCleanup()
 
-proc sync*(_: type Runtime) =
+proc sync*(_: type Weave) =
   ## Global barrier for the Picasso runtime
   ## This is only valid in the root task
   Worker: return
@@ -178,3 +178,42 @@
   debugTermination:
     log(">>> Worker %2d leaves barrier <<<\n", myID())
 
+
+proc loadBalance*(_: type Weave) =
+  ## Ensure that the current thread shares work with the other threads.
+  ##
+  ## This is done automatically at synchronization points
+  ## like task spawn and sync, but long-running, unbalanced tasks
+  ## with no synchronization point need to call it explicitly.
+  ##
+  ## For example, this can be placed at the end of a for-loop body.
+  #
+  # Design notes:
+  # this is a leaky abstraction of Weave's design: busy workers
+  # are the ones distributing tasks. However, in the IO world
+  # adding poll() calls is widely used as well.
+  #
+  # It's arguably a much better way of handling:
+  # - load imbalance
+  # - the wide range of user CPUs
+  # - dynamic environment changes (maybe there are other programs)
+  # - generic functions on collections like map
+  #
+  # than asking the developer to guesstimate the task size and
+  # hardcode the task granularity, as some popular frameworks require.
+  #
+  # See: PyTorch troubles with OpenMP
+  #      Should we use OpenMP when we have 1000 or 80000 items in an array?
+  #      https://github.com/zy97140/omp-benchmark-for-pytorch
+  #      It depends on:
+  #      - whether it runs on a dual-core CPU
+  #      - or on a 4-socket, 100+ core server-grade CPU
+  #      - whether you are doing addition
+  #      - or exponentiation
+
+  shareWork()
+
+  if myThieves().peek() != 0:
+    var req: StealRequest
+    while recv(req):
+      dispatchTasks(req)
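
Running the new benchmark: main() in weave_spc.nim expects the number of tasks,
the task granularity in microseconds and an optional polling interval in
microseconds, as documented by print_usage. For example (the compile flags are
only a plausible guess and are not specified by this patch; the positional
arguments follow print_usage):

  nim c -d:release --threads:on -o:weave_spc benchmarks/single_task_producer/weave_spc.nim
  ./weave_spc 1000000 100 10    # 1M tasks of 100 us each, poll every 10 us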
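
The loadBalance(Weave) call exported by this patch targets exactly the situation
spc_consume is in: a long-running task body that never spawns or syncs, so its
worker would otherwise never answer steal requests. A minimal sketch of such a
use outside the benchmark (burnCpu and the durations are illustrative, not part
of the patch; only init, spawn, sync, exit and loadBalance come from Weave):

  import weave, monotimes, times

  proc burnCpu(ms: int64) =
    # Illustrative long-running task: no spawn/sync inside,
    # so we poll the scheduler by hand.
    let start = getMonoTime()
    while (getMonoTime() - start).inMilliseconds < ms:
      loadBalance(Weave)   # let this worker answer pending steal requests

  proc main() =
    init(Weave)
    for i in 0 ..< 100:
      spawn burnCpu(10)
    sync(Weave)
    exit(Weave)

  main()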