diff --git a/unused/channels/README.md b/unused/channels/README.md new file mode 100644 index 0000000..04415d4 --- /dev/null +++ b/unused/channels/README.md @@ -0,0 +1,26 @@ +# Unused channels implementations + +This directory tracks channels implementations or documentation/research +that has been superseded by more specialized and/or more efficient channels. + +Ultimately they should probably go in an external library. + +## Bounded SPSC channel research + +Thorough research has been done to avoid wasting memory on very small queues and channels. +An innovative scheme by rolling the front and back index at 2xCapacity instead of just 1xCapacity +allows avoiding allocating an extra always empty slot to differentiate between full and empty. +The usual alternative was to allocate a power-of-2 number of slots and use +bit-masking which is even more wasteful in the general case. + +Implementation is available in [./channels_mpsc_bounded_lock.nim](channels_mpsc_bounded_lock.nim) +and also used in the main source at [../../weave/datatypes/bounded_queues.nim](bounded_queues.nim) + +Turns out we only need SPSC channels of a single slot for tasks and futures/flowvars + +## Bounded MPSC channels + +Those were used for steal requests when they were stack objects however: +- using pointers (https://github.com/mratsim/weave/pull/13) reduced overhead by 20% combined with lazy futures (5% on normal futures) +- pointers enabled using intrusive list-based lockless channels for steal requests which requires no allocation + and improves overhead also by about 5% with lazy futures (https://github.com/mratsim/weave/pull/21) diff --git a/weave/channels/channels_mpsc_bounded_lock.nim b/unused/channels/channels_mpsc_bounded_lock.nim similarity index 100% rename from weave/channels/channels_mpsc_bounded_lock.nim rename to unused/channels/channels_mpsc_bounded_lock.nim diff --git a/weave/channels/channels_spsc.md b/unused/channels/channels_spsc.md similarity index 100% rename from
weave/channels/channels_spsc.md rename to unused/channels/channels_spsc.md diff --git a/weave/channels/README.md b/weave/channels/README.md index fcb6261..eeffd6f 100644 --- a/weave/channels/README.md +++ b/weave/channels/README.md @@ -17,6 +17,7 @@ kind: unbuf/bounded/unbounded: - unbuf: unbuffered channel (also called rendezvous). Blocking. + - single: channel can only buffer a single element - bounded: channel has a max pre-allocated capacity. Usually array-based. - unbounded: channel has no max capacity. List-based. @@ -34,7 +35,7 @@ between senders and receivers however it might be interesting to explore endpoints with only the recv and send proc implemented. Furthermore we can add a "can only be moved" restrictions for single consumer or single receiver endpoints. -It prevents misuse of channels however for Picasso: +It prevents misuse of channels however for Weave: - We store all channels in a context/global, with the sender/receiver distinction it will be twice the size (and potentially twice the cache misses). 
- It would require tracing for the channel buffer to be destroyed diff --git a/weave/channels/channels_legacy.nim b/weave/channels/channels_legacy.nim index 8e09a56..edf0e72 100644 --- a/weave/channels/channels_legacy.nim +++ b/weave/channels/channels_legacy.nim @@ -607,6 +607,24 @@ proc channel_receive*[T](chan: ChannelLegacy[T], data: ptr T, size: int32): bool ## (Remove the first item) recv_fn[chan.impl](chan, data, size) +# Weave API +# ---------------------------------------------------------------------------------- + +func trySend*[T](c: ChannelLegacy[T], src: sink T): bool {.inline.} = + channel_send(c, src, int32 sizeof(src)) + +func tryRecv*[T](c: ChannelLegacy[T], dst: var T): bool {.inline.} = + channel_receive(c, dst.addr, int32 sizeof(dst)) + +func peek*[T](c: ChannelLegacy[T]): int32 = + channel_peek(c) + +proc initialize*[T](c: var ChannelLegacy[T], size: int32) = + c = channel_alloc(int32 sizeof(T), size, Mpsc) + +proc delete*[T](c: var ChannelLegacy[T]) = + channel_free(c) + # Tests # ---------------------------------------------------------------------------------- diff --git a/weave/channels/channels_mpsc.md b/weave/channels/channels_mpsc.md index 834a8e4..735fa49 100644 --- a/weave/channels/channels_mpsc.md +++ b/weave/channels/channels_mpsc.md @@ -14,30 +14,54 @@ Channels are used for the following types: - SPSC channels - unbuffered (rendezvous / blocking) - StealRequest - - object of size 32 bytes + - object of size 32 bytes or pointers to heap allocated steal requests - MPSC channels - buffered - "MaxSteal * num_workers" or "2 * MaxSteal * num_workers" (Main thread) +- Memory Pool (pointer to memory blocks) + - pointer objects + - MPSC channel + - unbounded ## Requirements notes There are a lot of litterature regarding linked-list based MPSC channels. 
-Given that we do not need linearizability guarantee, the Vyukhov MPSC queue would probably be enough: -- http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue (Intrusive) + +Initially the MPSC channels were implemented following Microsoft's message-passing allocator [Snmalloc](https://github.com/microsoft/snmalloc) implementation which is itself heavily inspired by Pony language's actor message queue. + +However they: + +- Hold on the last item of the queue: unsuitable for steal requests +- Require memory management of a stub node: snmalloc overwrites it and never seems to reclaim its memory +- They never update the "back" pointer of the queue when dequeueing so the back pointer + still points to an unowned item, requiring the caller to do an emptiness check +- There is no way of estimating the number of enqueued items for the consumers which is necessary for both + steal requests (adaptive stealing) and memory pool (estimating free memory blocks) + +So the unbounded queue has been heavily modified with inspiration from the intrusive Vyukov queue at: +- http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue + +An alternative implementation using CAS is discussed here: +- https://groups.google.com/forum/#!topic/comp.programming.threads/M_ecdRRlgvM + +It may be possible to use the required "count" atomic field for further optimization as other queues don't need it +but the likelihood is low given that it's a contended variable by all threads accessing the data-structure +compared to "next" field. + +A non-intrusive approach would require some memory reclamation of the node which we don't want when implementing +a memory pool. The extra pointer indirection is probably also costly in terms of cache miss while the +"next" field will be in the same cache-line as fresh data for the producers or required data for the consumer.
- http://www.1024cores.net/home/lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue (non-intrusive) - Reviews: - https://int08h.com/post/ode-to-a-vyukov-queue/ - http://concurrencyfreaks.blogspot.com/2014/04/multi-producer-single-consumer-queue.html - http://psy-lob-saw.blogspot.com/2015/04/porting-dvyukov-mpsc.html -compared to something more complex but with wait-free and linearizability guarantees like -the MultiList queue: https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/multilist-2017.pdf which probably has more latency and/or less throughput. - -However the MPSC queue will be used for StealRequest. Linked list based queues will require to pass data as a pointer meaning allocation can happen in one thread and deallocation in another. -This requires multithreaded alloc/free scheme. As we probably would want to implement pool allocation to reduce allocator pressure complicating the pool allocator design. +Lastly something more complex but with wait-free and linearizability guarantees like +the MultiList queue: https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/multilist-2017.pdf probably has more latency and/or less throughput. -Another issue is that assuming we pool StealRequests, they will trigger access to the same cacheline from random threads so it's probably better if StealRequests were thread-local. - -Lastly, in the channel-based work-stealing, the thread that has work to do must also handle the steal requests (contrary to shared-memory design where the threads that have nothing to do handle the stealing). This means that that busy thread will also need to handle heap/pool deallocation. +Note that for steal requests as pointers we can actually guarantee that the thread that allocated them +can destroy/recycle them when no other thread is reading them, making list-based MPSC channels suitable. 
+Moving a pointer instead of 32 byte object resulted in a significant overhead reduction (25%) in https://github.com/mratsim/weave/pull/13. ## More notes @@ -45,12 +69,16 @@ Arguably there might be cache thrashing between the producers when writing the t As the producers have nothing to do anyway, a lock-based solution only on the producer side should be suitable. -Furthermore each producer is only allowed a limited number of steal requests +Furthermore each producer is only allowed a limited number of steal requests. + +Also Dmitry Vyukov presents "Memory Queues": https://groups.google.com/forum/#!topic/comp.programming.threads/4ILh6yY5fV4 +to be used with thread-safe memory allocator such as: https://groups.google.com/forum/embed/#!topic/comp.programming.threads/gkX0tIWR_Gk ## References There is a definite lack of papers on ring-buffer based MPSC queues +- https://github.com/dbittman/waitfree-mpsc-queue - https://github.com/rmind/ringbuf - https://github.com/cloudfoundry/go-diodes (can overwrite data) - Disruptor: https://github.com/LMAX-Exchange/disruptor/wiki/Blogs-And-Articles diff --git a/weave/contexts.nim b/weave/contexts.nim index c5a1b48..f937186 100644 --- a/weave/contexts.nim +++ b/weave/contexts.nim @@ -7,12 +7,11 @@ import ./datatypes/[context_global, context_thread_local, sync_types], - ./channels/[channels_spsc_single_ptr, channels_mpsc_bounded_lock, channels_mpsc_unbounded], + ./channels/[channels_spsc_single_ptr, channels_mpsc_unbounded], ./memory/[persistacks, intrusive_stacks], ./config, system/ansi_c, - ./primitives/barriers, - ./instrumentation/[contracts, profilers, loggers] + ./instrumentation/[profilers, loggers] # Contexts # ---------------------------------------------------------------------------------- @@ -47,24 +46,6 @@ proc newTaskFromCache*(): Task {.inline.} = template myTodoBoxes*: Persistack[WV_MaxConcurrentStealPerWorker, ChannelSpscSinglePtr[Task]] = globalCtx.com.tasks[localCtx.worker.ID] -# TODO: used to debug a recurrent
deadlock on trySend with 5 workers -import ./channels/channels_legacy - -func trySend*[T](c: ChannelLegacy[T], src: sink T): bool {.inline.} = - channel_send(c, src, int32 sizeof(src)) - -func tryRecv*[T](c: ChannelLegacy[T], dst: var T): bool {.inline.} = - channel_receive(c, dst.addr, int32 sizeof(dst)) - -func peek*[T](c: ChannelLegacy[T]): int32 = - channel_peek(c) - -proc initialize*[T](c: var ChannelLegacy[T], size: int32) = - c = channel_alloc(int32 sizeof(T), size, Mpsc) - -proc delete*[T](c: var ChannelLegacy[T]) = - channel_free(c) - template myThieves*: ChannelMpscUnbounded[StealRequest] = globalCtx.com.thefts[localCtx.worker.ID] diff --git a/weave/datatypes/context_global.nim b/weave/datatypes/context_global.nim index 3b50ff7..46e1d6f 100644 --- a/weave/datatypes/context_global.nim +++ b/weave/datatypes/context_global.nim @@ -6,7 +6,6 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ../channels/channels_mpsc_bounded_lock, ../channels/channels_mpsc_unbounded, ../channels/channels_spsc_single_ptr, ../memory/persistacks, @@ -17,9 +16,6 @@ import # Global / inter-thread communication channels # ---------------------------------------------------------------------------------- -# TODO: used to debug a recurrent deadlock on trySend with 5 workers -import ../channels/channels_legacy - type ComChannels* = object ## Communication channels @@ -36,7 +32,9 @@ type # would work but then it requires a pointer indirection # per channel # and a known max number of workers - thefts*: ptr UncheckedArray[ChannelMPSCunbounded[StealRequest]] + + # Theft channels is bounded to "NumWorkers * WV_MaxConcurrentStealPerWorker" + thefts*: ptr UncheckedArray[ChannelMpscUnbounded[StealRequest]] tasks*: ptr UncheckedArray[Persistack[WV_MaxConcurrentStealPerWorker, ChannelSpscSinglePtr[Task]]] GlobalContext* = object diff --git a/weave/datatypes/flowvars.nim b/weave/datatypes/flowvars.nim index c27bed4..63c6cdd 100644 
--- a/weave/datatypes/flowvars.nim +++ b/weave/datatypes/flowvars.nim @@ -9,7 +9,7 @@ import ../channels/[channels_spsc_single_ptr, channels_spsc_single_object], ../memory/allocs, ../instrumentation/contracts, - ../contexts, ../config + ../config # TODO for the Flowvar we need critically need a caching scheme for the channels # we use the legacy channels in the mean time diff --git a/weave/loop_splitting.nim b/weave/loop_splitting.nim index c83ed60..4a1aeec 100644 --- a/weave/loop_splitting.nim +++ b/weave/loop_splitting.nim @@ -6,7 +6,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ./config, ./contexts, + ./config, ./datatypes/sync_types, ./instrumentation/[contracts, loggers] diff --git a/weave/memory/multithreaded_memory_management.md b/weave/memory/multithreaded_memory_management.md index fecc89d..1d6842a 100644 --- a/weave/memory/multithreaded_memory_management.md +++ b/weave/memory/multithreaded_memory_management.md @@ -332,6 +332,12 @@ This article goes over the skeleton of a threadsafe growable pool allocator (but https://www.qt.io/blog/a-fast-and-thread-safe-pool-allocator-for-qt-part-1 +#### comp.programming.threads + +Lock-free slab allocator by Chris Thomasson: https://groups.google.com/forum/embed/#!topic/comp.programming.threads/gkX0tIWR_Gk + +Associated memory queue by Dmitry Vyukov: https://groups.google.com/forum/#!topic/comp.programming.threads/4ILh6yY5fV4 + ## False sharing keywords: cache ping-pong, false sharing, cache threashing, cache lines diff --git a/weave/runtime.nim b/weave/runtime.nim index 8cc4339..fdecacb 100644 --- a/weave/runtime.nim +++ b/weave/runtime.nim @@ -12,7 +12,7 @@ import ./instrumentation/[contracts, profilers, loggers], ./contexts, ./config, ./datatypes/[sync_types, prell_deques], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr, channels_mpsc_unbounded], + ./channels/[channels_spsc_single_ptr, channels_mpsc_unbounded], 
./memory/[persistacks, intrusive_stacks, allocs], ./scheduler, ./signals, ./workers, ./thieves, ./victims, # Low-level primitives @@ -23,9 +23,6 @@ import type Runtime* = object -# TODO: used to debug a recurrent deadlock on trySend with 5 workers -import ./channels/channels_legacy - proc init*(_: type Runtime) = # TODO detect Hyper-Threading and NUMA domain diff --git a/weave/scheduler.nim b/weave/scheduler.nim index ed3168d..ad5d16b 100644 --- a/weave/scheduler.nim +++ b/weave/scheduler.nim @@ -9,7 +9,7 @@ import ./instrumentation/[contracts, profilers, loggers], ./primitives/barriers, ./datatypes/[sync_types, prell_deques, context_thread_local, flowvars, sparsesets], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr, channels_spsc_single_object, channels_mpsc_unbounded], + ./channels/[channels_legacy, channels_spsc_single_ptr, channels_mpsc_unbounded], ./memory/[persistacks, intrusive_stacks, allocs], ./contexts, ./config, ./victims, ./loop_splitting, diff --git a/weave/targets.nim b/weave/targets.nim index f2c3c22..7383186 100644 --- a/weave/targets.nim +++ b/weave/targets.nim @@ -10,8 +10,7 @@ import ./contexts, ./random/rng, ./instrumentation/[contracts, loggers], - ./config, - ./memory/allocs + ./config # Victim selection # ---------------------------------------------------------------------------------- diff --git a/weave/victims.nim b/weave/victims.nim index 001b2e9..115ddae 100644 --- a/weave/victims.nim +++ b/weave/victims.nim @@ -10,7 +10,7 @@ import sparsesets, prell_deques, flowvars], ./contexts, ./config, ./instrumentation/[contracts, profilers, loggers], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr, channels_mpsc_unbounded], + ./channels/[channels_spsc_single_ptr, channels_mpsc_unbounded], ./thieves, ./loop_splitting # Victims - Adaptative task splitting diff --git a/weave/workers.nim b/weave/workers.nim index 35b8de4..e616715 100644 --- a/weave/workers.nim +++ b/weave/workers.nim @@ -6,10 +6,10 @@ # at your 
option. This file may not be copied, modified, or distributed except according to those terms. import - ./datatypes/[sync_types, context_thread_local, bounded_queues], + ./datatypes/[sync_types, context_thread_local], ./contexts, ./instrumentation/[contracts, profilers, loggers], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr], + ./channels/channels_spsc_single_ptr, ./memory/persistacks, ./config, ./thieves