diff --git a/Project.toml b/Project.toml
index bd8f0b7d29..f43f7da517 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,6 +12,7 @@ Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34"
 Tensors = "48a634ad-e948-5137-8d70-aa71f2a747f4"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 
@@ -31,6 +32,7 @@ NearestNeighbors = "0.4"
 OrderedCollections = "1"
 Preferences = "1"
 Reexport = "1"
+TaskLocalValues = "0.1"
 Tensors = "1.14"
 WriteVTK = "1.13"
 julia = "1.9"
diff --git a/docs/Manifest.toml b/docs/Manifest.toml
index 4b67ef2b1f..4f814b904c 100644
--- a/docs/Manifest.toml
+++ b/docs/Manifest.toml
@@ -542,7 +542,7 @@ uuid = "29a986be-02c6-4525-aec4-84b980013641"
 version = "2.0.4"
 
 [[deps.Ferrite]]
-deps = ["EnumX", "ForwardDiff", "LinearAlgebra", "NearestNeighbors", "OrderedCollections", "Preferences", "Reexport", "SparseArrays", "StaticArrays", "Tensors", "WriteVTK"]
+deps = ["EnumX", "ForwardDiff", "LinearAlgebra", "NearestNeighbors", "OrderedCollections", "Preferences", "Reexport", "SparseArrays", "StaticArrays", "TaskLocalValues", "Tensors", "WriteVTK"]
 path = ".."
 uuid = "c061ca5d-56c9-439f-9c0e-210fe06d3992"
 version = "1.0.0"
@@ -2104,6 +2104,11 @@ deps = ["ArgTools", "SHA"]
 uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
 version = "1.10.0"
 
+[[deps.TaskLocalValues]]
+git-tree-sha1 = "d155450e6dff2a8bc2fcb81dcb194bd98b0aeb46"
+uuid = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34"
+version = "0.1.2"
+
 [[deps.TensorCore]]
 deps = ["LinearAlgebra"]
 git-tree-sha1 = "1feb45f88d133a655e001435632f019a9a1bcdb6"
diff --git a/docs/src/literate-howto/threaded_assembly.jl b/docs/src/literate-howto/threaded_assembly.jl
index fa0526fa9f..332989efcd 100644
--- a/docs/src/literate-howto/threaded_assembly.jl
+++ b/docs/src/literate-howto/threaded_assembly.jl
@@ -73,7 +73,7 @@ end;
 #
 # ScratchValues is a thread-local collection of data that each thread needs to own,
 # since we need to be able to mutate the data in the threads independently
-struct ScratchValues{T, CV <: CellValues, FV <: FacetValues, TT <: AbstractTensor, dim, AT}
+struct ScratchValues{T, CV <: CellValues, FV <: FacetValues, TT <: AbstractTensor, dim}
     Ke::Matrix{T}
     fe::Vector{T}
     cellvalues::CV
@@ -81,7 +81,6 @@ struct ScratchValues{T, CV <: CellValues, FV <: FacetValues, TT <: AbstractTenso
     global_dofs::Vector{Int}
     ɛ::Vector{TT}
     coordinates::Vector{Vec{dim, T}}
-    assembler::AT
 end;
 
 # Each thread need its own CellValues and FacetValues (although, for this example we don't use
@@ -96,9 +95,8 @@ function create_values(interpolation_space::Interpolation{refshape}, qr_order::I
 end;
 
 # Create a `ScratchValues` for each thread with the thread local data
-function create_scratchvalues(K, f, dh::DofHandler{dim}, ip) where {dim}
+function create_scratchvalues(dh::DofHandler{dim}, ip) where {dim}
     nthreads = Threads.nthreads()
-    assemblers = [start_assemble(K, f) for i in 1:nthreads]
     cellvalues, facetvalues = create_values(ip, 2)
 
     n_basefuncs = getnbasefunctions(cellvalues[1])
@@ -112,7 +110,7 @@ function create_scratchvalues(K, f, dh::DofHandler{dim}, ip) where {dim}
     coordinates = [[zero(Vec{dim}) for i in 1:length(dh.grid.cells[1].nodes)] for i in 1:nthreads]
 
     return [ScratchValues(Kes[i], fes[i], cellvalues[i], facetvalues[i], global_dofs[i],
-                          ɛs[i], coordinates[i], assemblers[i]) for i in 1:nthreads]
+                          ɛs[i], coordinates[i]) for i in 1:nthreads]
 end;
 
 # ## Threaded assemble
@@ -121,13 +119,15 @@ function doassemble(K::SparseMatrixCSC, colors, grid::Grid, dh::DofHandler,
                     C::SymmetricTensor{4, dim}, ip) where {dim}
 
     f = zeros(ndofs(dh))
-    scratches = create_scratchvalues(K, f, dh, ip)
+    assembler = start_assemble(K, f)
+
+    scratches = create_scratchvalues(dh, ip)
     b = Vec{3}((0.0, 0.0, 0.0)) # Body force
 
     for color in colors
         ## Each color is safe to assemble threaded
         Threads.@threads :static for i in 1:length(color)
-            assemble_cell!(scratches[Threads.threadid()], color[i], K, grid, dh, C, b)
+            assemble_cell!(scratches[Threads.threadid()], color[i], assembler, grid, dh, C, b)
         end
     end
 
@@ -136,13 +136,13 @@ end
 
 # The cell assembly function is written the same way as if it was a single threaded example.
 # The only difference is that we unpack the variables from our `scratch`.
-function assemble_cell!(scratch::ScratchValues, cell::Int, K::SparseMatrixCSC,
+function assemble_cell!(scratch::ScratchValues, cell::Int, assembler,
                         grid::Grid, dh::DofHandler, C::SymmetricTensor{4, dim}, b::Vec{dim}) where {dim}
 
     ## Unpack our stuff from the scratch
-    Ke, fe, cellvalues, facetvalues, global_dofs, ɛ, coordinates, assembler =
+    Ke, fe, cellvalues, facetvalues, global_dofs, ɛ, coordinates =
         scratch.Ke, scratch.fe, scratch.cellvalues, scratch.facetvalues,
-        scratch.global_dofs, scratch.ɛ, scratch.coordinates, scratch.assembler
+        scratch.global_dofs, scratch.ɛ, scratch.coordinates
 
     fill!(Ke, 0)
     fill!(fe, 0)
diff --git a/src/Ferrite.jl b/src/Ferrite.jl
index 2738904d83..ceb13fa33c 100644
--- a/src/Ferrite.jl
+++ b/src/Ferrite.jl
@@ -25,6 +25,8 @@ using Tensors:
     rotation_tensor, symmetric, tovoigt!, hessian, otimesu
 using ForwardDiff:
     ForwardDiff
+using TaskLocalValues:
+    TaskLocalValue
 
 include("CollectionsOfViews.jl")
 using .CollectionsOfViews:
diff --git a/src/assembler.jl b/src/assembler.jl
index e79521dc4b..de034df7af 100644
--- a/src/assembler.jl
+++ b/src/assembler.jl
@@ -115,8 +115,8 @@ Assembler for sparse matrix with CSC storage type.
 struct CSCAssembler{Tv,Ti,MT<:AbstractSparseMatrixCSC{Tv,Ti}} <: AbstractCSCAssembler
     K::MT
     f::Vector{Tv}
-    permutation::Vector{Int}
-    sorteddofs::Vector{Int}
+    permutation::TaskLocalValue{Vector{Int}}
+    sorteddofs::TaskLocalValue{Vector{Int}}
 end
 
 """
@@ -125,8 +125,8 @@ Assembler for symmetric sparse matrix with CSC storage type.
 struct SymmetricCSCAssembler{Tv,Ti, MT <: Symmetric{Tv,<:AbstractSparseMatrixCSC{Tv,Ti}}} <: AbstractCSCAssembler
     K::MT
     f::Vector{Tv}
-    permutation::Vector{Int}
-    sorteddofs::Vector{Int}
+    permutation::TaskLocalValue{Vector{Int}}
+    sorteddofs::TaskLocalValue{Vector{Int}}
 end
 
 function Base.show(io::IO, ::MIME"text/plain", a::Union{CSCAssembler, SymmetricCSCAssembler})
@@ -165,11 +165,15 @@ start_assemble(K::Union{AbstractSparseMatrixCSC, Symmetric{<:Any,<:AbstractSpars
 function start_assemble(K::AbstractSparseMatrixCSC{T}, f::Vector=T[]; fillzero::Bool=true, maxcelldofs_hint::Int=0) where {T}
     fillzero && (fillzero!(K); fillzero!(f))
-    return CSCAssembler(K, f, zeros(Int,maxcelldofs_hint), zeros(Int,maxcelldofs_hint))
+    permutation = TaskLocalValue{Vector{Int}}(() -> Vector{Int}(undef, maxcelldofs_hint))
+    sorteddofs = TaskLocalValue{Vector{Int}}(() -> Vector{Int}(undef, maxcelldofs_hint))
+    return CSCAssembler(K, f, permutation, sorteddofs)
 end
 
 function start_assemble(K::Symmetric{T,<:SparseMatrixCSC}, f::Vector=T[]; fillzero::Bool=true, maxcelldofs_hint::Int=0) where T
     fillzero && (fillzero!(K); fillzero!(f))
-    return SymmetricCSCAssembler(K, f, zeros(Int,maxcelldofs_hint), zeros(Int,maxcelldofs_hint))
+    permutation = TaskLocalValue{Vector{Int}}(() -> Vector{Int}(undef, maxcelldofs_hint))
+    sorteddofs = TaskLocalValue{Vector{Int}}(() -> Vector{Int}(undef, maxcelldofs_hint))
+    return SymmetricCSCAssembler(K, f, permutation, sorteddofs)
 end
 
 function finish_assemble(a::Union{CSCAssembler, SymmetricCSCAssembler})
@@ -226,7 +230,7 @@ end
     # We assume that the input dofs are not sorted, because the cells need the dofs in
     # a specific order, which might not be the sorted order. Hence we sort them.
     # Note that we are not allowed to mutate `dofs` in the process.
-    sorteddofs, permutation = _sortdofs_for_assembly!(A.permutation, A.sorteddofs, dofs)
+    sorteddofs, permutation = _sortdofs_for_assembly!(A.permutation[], A.sorteddofs[], dofs)
     current_col = 1
     @inbounds for Kcol in sorteddofs
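Note (outside the patch): the core of this change is that `CSCAssembler` and `SymmetricCSCAssembler` now keep their `permutation` and `sorteddofs` scratch buffers behind a `TaskLocalValue`, so one assembler object can be shared by several tasks while each task lazily materializes its own buffers through `tlv[]`. Below is a minimal sketch of that pattern using only TaskLocalValues.jl and Base.Threads; the `sort_into_scratch!` helper and the toy data are illustrative and not part of Ferrite.

using TaskLocalValues: TaskLocalValue

# One shared handle; each task that calls `scratch[]` lazily gets its own
# Vector{Int}, so concurrently running tasks never mutate the same buffer.
scratch = TaskLocalValue{Vector{Int}}(() -> Vector{Int}(undef, 0))

# Illustrative helper: sort `dofs` into the current task's buffer without
# mutating `dofs` itself (loosely mirroring what `_sortdofs_for_assembly!` needs).
function sort_into_scratch!(scratch, dofs)
    buf = scratch[]                 # buffer owned by the *current* task
    resize!(buf, length(dofs))
    copyto!(buf, dofs)
    return sort!(buf)
end

tasks = [Threads.@spawn sort_into_scratch!(scratch, rand(1:100, 8)) for _ in 1:4]
foreach(wait, tasks)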
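As I read the diff, the motivation is this: previously `permutation` and `sorteddofs` were single plain vectors, so one assembler could not be used from more than one task at a time, and the howto example worked around that by creating one assembler per thread and indexing with `Threads.threadid()`. Since Julia 1.7 tasks may migrate between threads at yield points, so `threadid()`-based indexing is only safe under `:static` scheduling, whereas a `TaskLocalValue` ties each buffer to the task that uses it. That is why a single `start_assemble(K, f)` result can now be shared across the color loop in the example and the per-thread assemblers are dropped.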