diff --git a/unused/channels/README.md b/unused/channels/README.md new file mode 100644 index 0000000..04415d4 --- /dev/null +++ b/unused/channels/README.md @@ -0,0 +1,26 @@ +# Unused channels implementations + +This directory tracks channels implementations or documentation/research +that has been superseded by more specialized and/or more efficient channels. + +Ultimately they should probably go in an external library. + +## Bounded SPSC channel research + +Thorough research has been done to avoid wasting memory on very small queues and channels. +An innovative scheme by rolling the front and back index at 2xCapacity instead of just 1xCapacity +allows avoiding allocating an extra always empty slot to differentiate between full and empty. +The usual alternative was to allocate a power-of-2 number of slots and use +bit-masking which is even more wasteful in the general case. + +Implementation is available in [./channels_mpsc_bounded_lock.nim](channels_mpsc_bounded_lock.nim) +and also used in the main source at [../../weave/datatypes/bounded_queues.nim](bounded_queues.nim) + +Turns out we only need SPSC channels of a single slot for tasks and futures/flowvars + +## Bounded MPSC channels + +Those were used for steal requests when they were stack objects however: +- using pointers (https://github.com/mratsim/weave/pull/13) reduced overhead by 20% combined with lazy futures (5% on normal futures) +- pointers enabled using intrusive list-based lockless channels for steal requests which requires no allocation + and improves overhead also by about 5% with lazy futures (https://github.com/mratsim/weave/pull/21) diff --git a/weave/channels/channels_mpsc_bounded_lock.nim b/unused/channels/channels_mpsc_bounded_lock.nim similarity index 100% rename from weave/channels/channels_mpsc_bounded_lock.nim rename to unused/channels/channels_mpsc_bounded_lock.nim diff --git a/weave/channels/channels_spsc.md b/unused/channels/channels_spsc.md similarity index 100% rename from
weave/channels/channels_spsc.md rename to unused/channels/channels_spsc.md diff --git a/weave/channels/README.md b/weave/channels/README.md index fcb6261..eeffd6f 100644 --- a/weave/channels/README.md +++ b/weave/channels/README.md @@ -17,6 +17,7 @@ kind: unbuf/bounded/unbounded: - unbuf: unbuffered channel (also called rendezvous). Blocking. + - single: channel can only buffer a single element - bounded: channel has a max pre-allocated capacity. Usually array-based. - unbounded: channel has no max capacity. List-based. @@ -34,7 +35,7 @@ between senders and receivers however it might be interesting to explore endpoints with only the recv and send proc implemented. Furthermore we can add a "can only be moved" restrictions for single consumer or single receiver endpoints. -It prevents misuse of channels however for Picasso: +It prevents misuse of channels however for Weave: - We store all channels in a context/global, with the sender/receiver distinction it will be twice the size (and potentially twice the cache misses). 
- It would require tracing for the channel buffer to be destroyed diff --git a/weave/channels/channels_legacy.nim b/weave/channels/channels_legacy.nim index 8e09a56..edf0e72 100644 --- a/weave/channels/channels_legacy.nim +++ b/weave/channels/channels_legacy.nim @@ -607,6 +607,24 @@ proc channel_receive*[T](chan: ChannelLegacy[T], data: ptr T, size: int32): bool ## (Remove the first item) recv_fn[chan.impl](chan, data, size) +# Weave API +# ---------------------------------------------------------------------------------- + +func trySend*[T](c: ChannelLegacy[T], src: sink T): bool {.inline.} = + channel_send(c, src, int32 sizeof(src)) + +func tryRecv*[T](c: ChannelLegacy[T], dst: var T): bool {.inline.} = + channel_receive(c, dst.addr, int32 sizeof(dst)) + +func peek*[T](c: ChannelLegacy[T]): int32 = + channel_peek(c) + +proc initialize*[T](c: var ChannelLegacy[T], size: int32) = + c = channel_alloc(int32 sizeof(T), size, Mpsc) + +proc delete*[T](c: var ChannelLegacy[T]) = + channel_free(c) + # Tests # ---------------------------------------------------------------------------------- diff --git a/weave/channels/channels_mpsc.md b/weave/channels/channels_mpsc.md index 834a8e4..735fa49 100644 --- a/weave/channels/channels_mpsc.md +++ b/weave/channels/channels_mpsc.md @@ -14,30 +14,54 @@ Channels are used for the following types: - SPSC channels - unbuffered (rendezvous / blocking) - StealRequest - - object of size 32 bytes + - object of size 32 bytes or pointers to heap allocated steal requests - MPSC channels - buffered - "MaxSteal * num_workers" or "2 * MaxSteal * num_workers" (Main thread) +- Memory Pool (pointer to memory blocks) + - pointer objects + - MPSC channel + - unbounded ## Requirements notes There are a lot of litterature regarding linked-list based MPSC channels. 
-Given that we do not need linearizability guarantee, the Vyukhov MPSC queue would probably be enough: -- http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue (Intrusive) + +Initially the MPSC channels were implemented following Microsoft's message-passing allocator [Snmalloc](https://github.com/microsoft/snmalloc) implementation which is itself heavily inspired by Pony language's actor message queue. + +However they: + +- Hold on the last item of the queue: unsuitable for steal requests +- Require memory management of a stub node: snmalloc overwrites it and never seems to reclaim its memory +- They never update the "back" pointer of the queue when dequeueing so the back pointer + still points to an unowned item, requiring the caller to do an emptiness check +- There is no way of estimating the number of enqueued items for the consumers which is necessary for both + steal requests (adaptive stealing) and memory pool (estimating free memory blocks) + +So the unbounded queue has been heavily modified with inspiration from the intrusive Vyukov queue at: +- http://www.1024cores.net/home/lock-free-algorithms/queues/intrusive-mpsc-node-based-queue + +An alternative implementation using CAS is discussed here: +- https://groups.google.com/forum/#!topic/comp.programming.threads/M_ecdRRlgvM + +It may be possible to use the required "count" atomic field for further optimization as other queues don't need it +but the likelihood is low given that it's a contended variable by all threads accessing the data-structure +compared to "next" field. + +A non-intrusive approach would require some memory reclamation of the node which we don't want when implementing +a memory pool. The extra pointer indirection is probably also costly in terms of cache miss while the +"next" field will be in the same cache-line as fresh data for the producers or required data for the consumer.
- http://www.1024cores.net/home/lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue (non-intrusive) - Reviews: - https://int08h.com/post/ode-to-a-vyukov-queue/ - http://concurrencyfreaks.blogspot.com/2014/04/multi-producer-single-consumer-queue.html - http://psy-lob-saw.blogspot.com/2015/04/porting-dvyukov-mpsc.html -compared to something more complex but with wait-free and linearizability guarantees like -the MultiList queue: https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/multilist-2017.pdf which probably has more latency and/or less throughput. - -However the MPSC queue will be used for StealRequest. Linked list based queues will require to pass data as a pointer meaning allocation can happen in one thread and deallocation in another. -This requires multithreaded alloc/free scheme. As we probably would want to implement pool allocation to reduce allocator pressure complicating the pool allocator design. +Lastly something more complex but with wait-free and linearizability guarantees like +the MultiList queue: https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/multilist-2017.pdf probably has more latency and/or less throughput. -Another issue is that assuming we pool StealRequests, they will trigger access to the same cacheline from random threads so it's probably better if StealRequests were thread-local. - -Lastly, in the channel-based work-stealing, the thread that has work to do must also handle the steal requests (contrary to shared-memory design where the threads that have nothing to do handle the stealing). This means that that busy thread will also need to handle heap/pool deallocation. +Note that for steal requests as pointers we can actually guarantee that the thread that allocated them +can destroy/recycle them when no other thread is reading them, making list-based MPSC channels suitable. 
+Moving a pointer instead of 32 byte object resulted in a significant overhead reduction (25%) in https://github.com/mratsim/weave/pull/13. ## More notes @@ -45,12 +69,16 @@ Arguably there might be cache thrashing between the producers when writing the t As the producers have nothing to do anyway, a lock-based solution only on the producer side should be suitable. -Furthermore each producer is only allowed a limited number of steal requests +Furthermore each producer is only allowed a limited number of steal requests. + +Also Dmitry Vyukov presents "Memory Queues": https://groups.google.com/forum/#!topic/comp.programming.threads/4ILh6yY5fV4 +to be used with thread-safe memory allocator such as: https://groups.google.com/forum/embed/#!topic/comp.programming.threads/gkX0tIWR_Gk ## References There is a definite lack of papers on ring-buffer based MPSC queues +- https://github.com/dbittman/waitfree-mpsc-queue - https://github.com/rmind/ringbuf - https://github.com/cloudfoundry/go-diodes (can overwrite data) - Disruptor: https://github.com/LMAX-Exchange/disruptor/wiki/Blogs-And-Articles diff --git a/weave/contexts.nim b/weave/contexts.nim index c5a1b48..f937186 100644 --- a/weave/contexts.nim +++ b/weave/contexts.nim @@ -7,12 +7,11 @@ import ./datatypes/[context_global, context_thread_local, sync_types], - ./channels/[channels_spsc_single_ptr, channels_mpsc_bounded_lock, channels_mpsc_unbounded], + ./channels/[channels_spsc_single_ptr, channels_mpsc_unbounded], ./memory/[persistacks, intrusive_stacks], ./config, system/ansi_c, - ./primitives/barriers, - ./instrumentation/[contracts, profilers, loggers] + ./instrumentation/[profilers, loggers] # Contexts # ---------------------------------------------------------------------------------- @@ -47,24 +46,6 @@ proc newTaskFromCache*(): Task {.inline.} = template myTodoBoxes*: Persistack[WV_MaxConcurrentStealPerWorker, ChannelSpscSinglePtr[Task]] = globalCtx.com.tasks[localCtx.worker.ID] -# TODO: used to debug a recurrent
deadlock on trySend with 5 workers -import ./channels/channels_legacy - -func trySend*[T](c: ChannelLegacy[T], src: sink T): bool {.inline.} = - channel_send(c, src, int32 sizeof(src)) - -func tryRecv*[T](c: ChannelLegacy[T], dst: var T): bool {.inline.} = - channel_receive(c, dst.addr, int32 sizeof(dst)) - -func peek*[T](c: ChannelLegacy[T]): int32 = - channel_peek(c) - -proc initialize*[T](c: var ChannelLegacy[T], size: int32) = - c = channel_alloc(int32 sizeof(T), size, Mpsc) - -proc delete*[T](c: var ChannelLegacy[T]) = - channel_free(c) - template myThieves*: ChannelMpscUnbounded[StealRequest] = globalCtx.com.thefts[localCtx.worker.ID] diff --git a/weave/datatypes/context_global.nim b/weave/datatypes/context_global.nim index 3b50ff7..46e1d6f 100644 --- a/weave/datatypes/context_global.nim +++ b/weave/datatypes/context_global.nim @@ -6,7 +6,6 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ../channels/channels_mpsc_bounded_lock, ../channels/channels_mpsc_unbounded, ../channels/channels_spsc_single_ptr, ../memory/persistacks, @@ -17,9 +16,6 @@ import # Global / inter-thread communication channels # ---------------------------------------------------------------------------------- -# TODO: used to debug a recurrent deadlock on trySend with 5 workers -import ../channels/channels_legacy - type ComChannels* = object ## Communication channels @@ -36,7 +32,9 @@ type # would work but then it requires a pointer indirection # per channel # and a known max number of workers - thefts*: ptr UncheckedArray[ChannelMPSCunbounded[StealRequest]] + + # Theft channels is bounded to "NumWorkers * WV_MaxConcurrentStealPerWorker" + thefts*: ptr UncheckedArray[ChannelMpscUnbounded[StealRequest]] tasks*: ptr UncheckedArray[Persistack[WV_MaxConcurrentStealPerWorker, ChannelSpscSinglePtr[Task]]] GlobalContext* = object diff --git a/weave/datatypes/flowvars.nim b/weave/datatypes/flowvars.nim index c27bed4..63c6cdd 100644 
--- a/weave/datatypes/flowvars.nim +++ b/weave/datatypes/flowvars.nim @@ -9,7 +9,7 @@ import ../channels/[channels_spsc_single_ptr, channels_spsc_single_object], ../memory/allocs, ../instrumentation/contracts, - ../contexts, ../config + ../config # TODO for the Flowvar we need critically need a caching scheme for the channels # we use the legacy channels in the mean time diff --git a/weave/loop_splitting.nim b/weave/loop_splitting.nim index c83ed60..4a1aeec 100644 --- a/weave/loop_splitting.nim +++ b/weave/loop_splitting.nim @@ -6,7 +6,7 @@ # at your option. This file may not be copied, modified, or distributed except according to those terms. import - ./config, ./contexts, + ./config, ./datatypes/sync_types, ./instrumentation/[contracts, loggers] diff --git a/weave/memory/multithreaded_memory_management.md b/weave/memory/multithreaded_memory_management.md index fecc89d..1d6842a 100644 --- a/weave/memory/multithreaded_memory_management.md +++ b/weave/memory/multithreaded_memory_management.md @@ -332,6 +332,12 @@ This article goes over the skeleton of a threadsafe growable pool allocator (but https://www.qt.io/blog/a-fast-and-thread-safe-pool-allocator-for-qt-part-1 +#### comp.programming.threads + +Lock-free slab allocator by Chris Thomasson: https://groups.google.com/forum/embed/#!topic/comp.programming.threads/gkX0tIWR_Gk + +Associated memory queue by Dmitry Vyukov: https://groups.google.com/forum/#!topic/comp.programming.threads/4ILh6yY5fV4 + ## False sharing keywords: cache ping-pong, false sharing, cache threashing, cache lines diff --git a/weave/runtime.nim b/weave/runtime.nim index 8cc4339..fdecacb 100644 --- a/weave/runtime.nim +++ b/weave/runtime.nim @@ -12,7 +12,7 @@ import ./instrumentation/[contracts, profilers, loggers], ./contexts, ./config, ./datatypes/[sync_types, prell_deques], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr, channels_mpsc_unbounded], + ./channels/[channels_spsc_single_ptr, channels_mpsc_unbounded], 
./memory/[persistacks, intrusive_stacks, allocs], ./scheduler, ./signals, ./workers, ./thieves, ./victims, # Low-level primitives @@ -23,9 +23,6 @@ import type Runtime* = object -# TODO: used to debug a recurrent deadlock on trySend with 5 workers -import ./channels/channels_legacy - proc init*(_: type Runtime) = # TODO detect Hyper-Threading and NUMA domain diff --git a/weave/scheduler.nim b/weave/scheduler.nim index ed3168d..ad5d16b 100644 --- a/weave/scheduler.nim +++ b/weave/scheduler.nim @@ -9,7 +9,7 @@ import ./instrumentation/[contracts, profilers, loggers], ./primitives/barriers, ./datatypes/[sync_types, prell_deques, context_thread_local, flowvars, sparsesets], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr, channels_spsc_single_object, channels_mpsc_unbounded], + ./channels/[channels_legacy, channels_spsc_single_ptr, channels_mpsc_unbounded], ./memory/[persistacks, intrusive_stacks, allocs], ./contexts, ./config, ./victims, ./loop_splitting, diff --git a/weave/targets.nim b/weave/targets.nim index f2c3c22..7383186 100644 --- a/weave/targets.nim +++ b/weave/targets.nim @@ -10,8 +10,7 @@ import ./contexts, ./random/rng, ./instrumentation/[contracts, loggers], - ./config, - ./memory/allocs + ./config # Victim selection # ---------------------------------------------------------------------------------- diff --git a/weave/victims.nim b/weave/victims.nim index 001b2e9..115ddae 100644 --- a/weave/victims.nim +++ b/weave/victims.nim @@ -10,7 +10,7 @@ import sparsesets, prell_deques, flowvars], ./contexts, ./config, ./instrumentation/[contracts, profilers, loggers], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr, channels_mpsc_unbounded], + ./channels/[channels_spsc_single_ptr, channels_mpsc_unbounded], ./thieves, ./loop_splitting # Victims - Adaptative task splitting diff --git a/weave/workers.nim b/weave/workers.nim index 35b8de4..e616715 100644 --- a/weave/workers.nim +++ b/weave/workers.nim @@ -6,10 +6,10 @@ # at your 
option. This file may not be copied, modified, or distributed except according to those terms. import - ./datatypes/[sync_types, context_thread_local, bounded_queues], + ./datatypes/[sync_types, context_thread_local], ./contexts, ./instrumentation/[contracts, profilers, loggers], - ./channels/[channels_mpsc_bounded_lock, channels_spsc_single_ptr], + ./channels/channels_spsc_single_ptr, ./memory/persistacks, ./config, ./thieves