Add SPC benchmark

mratsim committed Nov 30, 2019
1 parent 6186810 commit 91ecde4
Showing 6 changed files with 168 additions and 15 deletions.
6 changes: 3 additions & 3 deletions benchmarks/fibonacci/weave_fib.nim
@@ -11,12 +11,12 @@ proc fib(n: int): int =
   result = sync(x) + y

 proc main() =
-  init(Runtime)
+  init(Weave)

   let f = fib(40)

-  sync(Runtime)
-  exit(Runtime)
+  sync(Weave)
+  exit(Weave)

   echo f

7 changes: 7 additions & 0 deletions benchmarks/single_task_producer/README.md
@@ -0,0 +1,7 @@
# Simple single-producer multiple-consumer benchmarks

SPC: a Simple Producer-Consumer benchmark.

A single worker produces n tasks,
each running for t microseconds. This benchmark allows us to test how many
concurrent consumers a single producer can sustain.
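For example, `spc 1000 100` produces 1000 tasks of 100 µs each; when the optional third argument is omitted, the polling interval defaults to the task granularity, which effectively disables polling (see `print_usage` below).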
106 changes: 106 additions & 0 deletions benchmarks/single_task_producer/weave_spc.nim
@@ -0,0 +1,106 @@
import
  # STD lib
  os, strutils, system/ansi_c,
  # Library
  ../../weave,
  # bench
  ../wtime

var NumTasksTotal: int32
var TaskGranularity: int32 # microseconds
var PollInterval: float64  # microseconds

proc print_usage() =
  # If not specified, the polling interval is set to the value
  # of the task granularity, which effectively disables polling
  echo "Usage: spc <number of tasks> <task granularity (us)> " &
       "[polling interval (us)]"

var global_poll_elapsed {.threadvar.}: float64

template dummy_cpt(): untyped =
  # Dummy computation
  # Calculate fib(30) iteratively
  var
    fib = 0
    f2 = 0
    f1 = 1
  for i in 2 .. 30:
    fib = f1 + f2
    f2 = f1
    f1 = fib

proc spc_consume(usec: int32) =

  var pollElapsed = 0'f64

  let start = Wtime_usec()
  let stop = usec.float64
  global_poll_elapsed = PollInterval

  while true:
    # Time spent polling is subtracted from the task's running time
    var elapsed = Wtime_usec() - start
    elapsed = elapsed - pollElapsed
    if elapsed >= stop:
      break

    dummy_cpt()

    # Poll for load balancing roughly every PollInterval microseconds
    if elapsed >= global_poll_elapsed:
      let pollStart = Wtime_usec()
      loadBalance(Weave)
      pollElapsed += Wtime_usec() - pollStart
      global_poll_elapsed += PollInterval

  # c_printf("Elapsed: %.2lfus\n", elapsed)

proc spc_consume_nopoll(usec: int32) =

  let start = Wtime_usec()
  let stop = usec.float64

  while true:
    var elapsed = Wtime_usec() - start
    if elapsed >= stop:
      break

    dummy_cpt()

  # c_printf("Elapsed: %.2lfus\n", elapsed)

proc spc_produce(n: int32) =
  for i in 0 ..< n:
    spawn spc_consume(TaskGranularity)

proc spc_produce_seq(n: int32) =
  for i in 0 ..< n:
    spc_consume_nopoll(TaskGranularity)

proc main() =
  if paramCount() notin 2..3:
    print_usage()
    quit 0

  NumTasksTotal = paramStr(1).parseInt.int32
  TaskGranularity = paramStr(2).parseInt.int32
  PollInterval = if paramCount() == 3: paramStr(3).parseInt.float64
                 else: TaskGranularity.float64

  init(Weave)

  let start = Wtime_msec()

  # spc_produce_seq(NumTasksTotal)
  spc_produce(NumTasksTotal)
  sync(Weave)

  let stop = Wtime_msec()

  c_printf("Elapsed wall time: %.2lf ms (%d us per task)\n", stop-start, TaskGranularity)

  sync(Weave)
  exit(Weave)

  quit 0

main()
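A hypothetical way to build and run this benchmark from the repository root (assuming a standard Nim toolchain; the flags and arguments are illustrative, not part of this commit): `nim c -r -d:release benchmarks/single_task_producer/weave_spc.nim 1000 100 50`.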
5 changes: 3 additions & 2 deletions weave.nim
@@ -10,9 +10,10 @@ import
   weave/datatypes/flowvars

 export
-  Flowvar, Runtime,
+  Flowvar, Weave,
   spawn, sync,
-  init, exit
+  init, exit,
+  loadBalance

 # TODO, those are workaround for not binding symbols in spawn macro
 export
12 changes: 6 additions & 6 deletions weave/async.nim
@@ -149,13 +149,13 @@ when isMainModule:
     stdout.write(" - SUCCESS\n")

   proc main() =
-    init(Runtime)
+    init(Weave)

     spawn display_int(123456)
     spawn display_int(654321)

-    sync(Runtime)
-    exit(Runtime)
+    sync(Weave)
+    exit(Weave)

   # main()

@@ -173,12 +173,12 @@

   proc main2() =

-    init(Runtime)
+    init(Weave)

     let f = async_fib(20)

-    sync(Runtime)
-    exit(Runtime)
+    sync(Weave)
+    exit(Weave)

     echo f

47 changes: 43 additions & 4 deletions weave/runtime.nim
@@ -21,9 +21,9 @@ import
 # Runtime public routines
 # ----------------------------------------------------------------------------------

-type Runtime* = object
+type Weave* = object

-proc init*(_: type Runtime) =
+proc init*(_: type Weave) =
   # TODO detect Hyper-Threading and NUMA domain

   if existsEnv"WEAVE_NUM_THREADS":
@@ -84,7 +84,7 @@ proc globalCleanup() =
   metrics:
     log("+========================================+\n")

-proc exit*(_: type Runtime) =
+proc exit*(_: type Weave) =
   signalTerminate(nil)
   localCtx.signaledTerminate = true

@@ -97,7 +97,7 @@ proc exit*(_: type Runtime) =
   threadLocalCleanup()
   globalCleanup()

-proc sync*(_: type Runtime) =
+proc sync*(_: type Weave) =
   ## Global barrier for the Picasso runtime
   ## This is only valid in the root task
   Worker: return
@@ -178,3 +178,42 @@ proc sync*(_: type Runtime) =

   debugTermination:
     log(">>> Worker %2d leaves barrier <<<\n", myID())
+
+proc loadBalance*(_: type Weave) =
+  ## Ensures that the current thread shares work with other threads.
+  ##
+  ## This is done automatically at synchronization points
+  ## like task spawn and sync, but long-running unbalanced tasks
+  ## with no synchronization points need to call it explicitly.
+  ##
+  ## For example, this can be placed at the end of a for-loop.
+  #
+  # Design notes:
+  # This is a leaky abstraction of Weave's design: busy workers
+  # are the ones distributing tasks. However, in the IO world,
+  # adding poll() calls is widely used as well.
+  #
+  # It's arguably a much better way of handling:
+  # - load imbalance
+  # - the wide range of user CPUs
+  # - dynamic environment changes (maybe there are other programs)
+  # - generic functions on collections like map
+  #
+  # than asking the developer to guesstimate the task size and
+  # hardcode the task granularity, as some popular frameworks require.
+  #
+  # See: PyTorch troubles with OpenMP
+  #      "Should we use OpenMP when we have 1000 or 80000 items in an array?"
+  #      https://github.com/zy97140/omp-benchmark-for-pytorch
+  #      It depends on:
+  #      - is it a dual-core CPU,
+  #      - or a 4-socket, 100+ core server-grade CPU?
+  #      - are you doing addition,
+  #      - or exponentiation?
+
+  shareWork()
+
+  # Serve any pending steal requests from thieves
+  if myThieves().peek() != 0:
+    var req: StealRequest
+    while recv(req):
+      dispatchTasks(req)
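
To illustrate the intended usage, here is a minimal sketch (not part of this commit) of a long-running task that calls `loadBalance` periodically, mirroring the pattern of `spc_consume` above; `burnCpu` and the iteration counts are hypothetical:

import weave

proc burnCpu(iters: int) =
  # Long-running, unbalanced work with no spawn/sync inside
  var acc = 0
  for i in 0 ..< iters:
    acc += i
    if i mod 10_000 == 0:
      loadBalance(Weave) # give idle workers a chance to steal queued tasks

proc main() =
  init(Weave)
  for i in 0 ..< 100:
    spawn burnCpu(10_000_000)
  sync(Weave)
  exit(Weave)

main()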
