From 754aa9670bc73c38af3769be2ff8110fb475c450 Mon Sep 17 00:00:00 2001 From: Lukas <37111893+lkdvos@users.noreply.github.com> Date: Fri, 12 Jul 2024 14:37:59 +0200 Subject: [PATCH] Taking allocators seriously (#182) * Start allocator implementations * Replace `istemp` with Val in allocators * Add tensorfree calls in base implementations * Fix typo * Stricter typing in allocation functions * Refactor allocations * slightly modify BaseCopy implementation * Fix some missing Vals * Add @no_escape block in blas_contract * Move Bumper to package extension * Specify unexported macro for julia 1.8 * Update docs * Remove extraneous arguments * Fix docstring * Link TBLIS in docs * add ncon backend support and tests * improved backend tests * address first set of comments * fix cuda issue * update ncon and improve extra macros; more tests part 1 * extensions on julia v1.8 * Formatter [no ci] * fix and test cuTENSORExt * add bumper tests * fix and test zero-dimensional bumper allocations --------- Co-authored-by: Jutho --- Project.toml | 7 + docs/make.jl | 1 + docs/src/index.md | 4 +- docs/src/man/backends.md | 131 ++++++++++++++ ext/TensorOperationsBumperExt.jl | 47 +++++ ext/TensorOperationscuTENSORExt.jl | 35 +++- src/TensorOperations.jl | 3 +- src/implementation/allocator.jl | 54 ++++-- src/implementation/base.jl | 45 ++--- src/implementation/functions.jl | 149 +++++++++------- src/implementation/ncon.jl | 35 ++-- src/implementation/strided.jl | 6 +- src/indexnotation/instantiators.jl | 4 +- src/indexnotation/tensormacros.jl | 39 ++-- test/butensor.jl | 86 +++++++++ test/cutensor.jl | 51 +++++- test/methods.jl | 123 ++++++------- test/runtests.jl | 15 +- test/tensor.jl | 275 +++++++++++++++-------------- 19 files changed, 757 insertions(+), 353 deletions(-) create mode 100644 docs/src/man/backends.md create mode 100644 ext/TensorOperationsBumperExt.jl create mode 100644 test/butensor.jl diff --git a/Project.toml b/Project.toml index bd932d9b..8c8d281c 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PackageExtensionCompat = "65ce6f38-6b18-4e1d-a461-8949797d7930" +PtrArrays = "43287f4e-b6f4-7ad1-bb20-aadabca52c3d" Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67" StridedViews = "4db3bf67-4bd7-4b4e-b153-31dc3fb37143" TupleTools = "9d95972d-f1c8-5527-a6e0-b4b365fa01f6" @@ -20,16 +21,19 @@ VectorInterface = "409d34a3-91d5-4945-b6ec-7529ddf182d8" cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [weakdeps] +Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [extensions] +TensorOperationsBumperExt = "Bumper" TensorOperationsChainRulesCoreExt = "ChainRulesCore" TensorOperationscuTENSORExt = ["cuTENSOR", "CUDA"] [compat] Aqua = "0.6, 0.7, 0.8" +Bumper = "0.6" CUDA = "5.4.0" ChainRulesCore = "1" ChainRulesTestUtils = "1" @@ -38,6 +42,7 @@ LRUCache = "1" LinearAlgebra = "1.6" Logging = "1.6" PackageExtensionCompat = "1" +PtrArrays = "1.2" Random = "1" Strided = "2.0.4" StridedViews = "0.3" @@ -49,6 +54,7 @@ julia = "1.8" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" DynamicPolynomials = "7c1d4256-1411-5781-91ec-d7bc3513ac07" @@ 
-67,4 +73,5 @@ test = [ "cuTENSOR", "Aqua", "Logging", + "Bumper", ] diff --git a/docs/make.jl b/docs/make.jl index 7441b3ef..f6670bae 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -9,6 +9,7 @@ makedocs(; modules=[TensorOperations], "Manual" => ["man/indexnotation.md", "man/functions.md", "man/interface.md", + "man/backends.md", "man/autodiff.md", "man/implementation.md"], "Index" => "index/index.md"]) diff --git a/docs/src/index.md b/docs/src/index.md index 6ec22608..08e5ea4d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -5,7 +5,7 @@ ## Table of contents ```@contents -Pages = ["index.md", "man/indexnotation.md", "man/functions.md", "man/autodiff.md", "man/interface.md", "man/implementation.md"] +Pages = ["index.md", "man/indexnotation.md", "man/functions.md", "man/interface.md", "man/backends.md", "man/autodiff.md", "man/implementation.md"] Depth = 4 ``` @@ -82,7 +82,5 @@ complicated tensor expression is deconstructed. ## To do list - - Add more backends, e.g. using pure Julia Base functionality, or using - [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl) - Make it easier to modify the contraction order algorithm or its cost function (e.g. to optimize based on memory footprint) or to splice in runtime information. diff --git a/docs/src/man/backends.md b/docs/src/man/backends.md new file mode 100644 index 00000000..a32403ca --- /dev/null +++ b/docs/src/man/backends.md @@ -0,0 +1,131 @@ +# Backends and Allocators + +The `TensorOperations` package is designed to provide powerful tools for performing tensor computations efficiently. +In advanced use cases, it can be desirable to squeeze the last drops of performance out of the library, by experimenting with either different micro-optimized implementations of the same operation, or by altering the memory management system. +Here, we detail how to access these functionalities. + +## Backends + +### Backend Selection + +`TensorOperations` supports multiple backends for tensor contractions, allowing users to choose different implementations based on their specific needs. +While special care is taken to ensure good defaults, we also provide the flexibility to select a backend manually. +This can be achieved in a variety of ways: + +1. **Global setting**: The default backend can be set globally on a per-type basis, as well as a per-function basis. This is achieved by hooking into the implementation of the default backend selection procedure. In particular, this procedure ends up calling [`select_backend`](@ref)`, which can be overloaded to return a different backend. + +2. **Local setting**: Alternatively, the backend can be set locally for a specific call to either [`@tensor`](@ref), [`ncon`](@ref) or the function-based interface. Both `@tensor` and `ncon` accept a keyword argument `backend`, which will locally override the default backend selection mechanism. The result is that the specified backend will be inserted as a final argument to all calls of the primitive tensor operations. This is also how this can be achieved in the function-based interface. 
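For the first of these options, the sketch below illustrates what such an overload could look like. This is only a minimal sketch and not part of the package: it assumes that `select_backend` is called with the primitive tensor function and its tensor arguments (check the interface documentation for the exact signature), and `MyArray` is a hypothetical array type introduced purely for illustration.

```julia
using TensorOperations
using TensorOperations: StridedNative

# hypothetical array type for which we always want the StridedNative backend
struct MyArray{T,N} <: AbstractArray{T,N}
    data::Array{T,N}
end
Base.size(A::MyArray) = size(A.data)
Base.getindex(A::MyArray, I::Vararg{Int}) = A.data[I...]

# assumed hook: `select_backend(tensorfun, tensors...)` returns the backend to use;
# here it is overloaded for `tensoradd!` calls whose arguments are `MyArray`s
function TensorOperations.select_backend(::typeof(TensorOperations.tensoradd!),
                                         C::MyArray, A::MyArray)
    return StridedNative()
end
```

The second, local option is illustrated below, where the chosen backend is passed explicitly at the call site.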
+

```julia
using TensorOperations
mybackend = StridedNative()

# inserting a backend into the @tensor macro
@tensor backend = mybackend A[i,j] := B[i,k] * C[k,j]

# inserting a backend into the ncon function
D = ncon([A, B, C], [[1, 2], [2, 3], [3, 1]]; backend=mybackend)

# inserting a backend into the function-based interface
tensoradd(A, pA, conjA, B, pB, conjB, α, β, mybackend)
```

### Available Backends

`TensorOperations` provides some options for backends out of the box.
In particular, the following backends are available:

```@docs
DefaultBackend
BaseCopy
BaseView
StridedNative
StridedBLAS
cuTENSORBackend
```

Here, arrays that are strided are typically handled most efficiently by the `Strided.jl`-based backends.
By default, the `StridedBLAS` backend is used for element types that support BLAS operations, as it seems that the performance gains from using BLAS outweigh the overhead of sometimes having to allocate intermediate permuted arrays.

On the other hand, the `BaseCopy` and `BaseView` backends are used for arrays that are not strided.
These are designed to be as general as possible, and as a result are not as performant as specific implementations.
Nevertheless, they can be useful for debugging purposes or for working with custom tensor types that have limited support for methods outside of `Base`.

Finally, we also provide a `cuTENSORBackend` for use with the `cuTENSOR.jl` library, which is an NVIDIA GPU-accelerated tensor contraction library.
This backend is only available through a package extension for `cuTENSOR`.

### Custom Backends

Users can also define their own backends, to facilitate experimentation with new implementations.
This can be done by defining a new type that is a subtype of `AbstractBackend`, and dispatching on this type in the implementation of the primitive tensor operations.
In particular, the only methods that need to be implemented are [`tensoradd!`](@ref), [`tensortrace!`](@ref), and [`tensorcontract!`](@ref).

For example, [`TensorOperationsTBLIS`](https://github.com/lkdvos/TensorOperationsTBLIS.jl) is a wrapper that provides a backend for tensor contractions using the [TBLIS](https://github.com/devinamatthews/tblis) library.

## Allocators

Evaluating complex tensor networks is typically done most efficiently by pairwise operations.
As a result, this procedure often requires the allocation of many temporary arrays, which can affect performance for certain operations.
To mitigate this, `TensorOperations` exposes an allocator system, which allows users to more finely control the allocation of both output tensors and temporary tensors.

In particular, the allocator system is used in multiple ways:
As mentioned before, it can be used to allocate and free the intermediate tensors that are required to evaluate a tensor network in a pairwise fashion.
Additionally, it can also be used to allocate and free temporary objects that arise when reshaping and permuting input tensors, for example when making them compatible with BLAS instructions.

### Allocator Selection

The allocator system can only be accessed *locally*, by passing an allocator to the `@tensor` macro, the `ncon` function, or the function-based interface.
+

```julia
using TensorOperations
myallocator = ManualAllocator()

# inserting an allocator into the @tensor macro
@tensor allocator = myallocator A[i,j] := B[i,k] * C[k,j]

# inserting an allocator into the ncon function
D = ncon([A, B, C], [[1, 2], [2, 3], [3, 1]]; allocator=myallocator)

# inserting an allocator into the function-based interface
tensoradd(A, pA, conjA, B, pB, conjB, α, β, DefaultBackend(), myallocator)
```

Note that the backend system is prioritized over the allocator system: the backend is selected **first**, and only then is the allocator inserted as the final argument.

### Available Allocators

`TensorOperations` also provides some options for allocators out of the box.

```@docs
DefaultAllocator
ManualAllocator
```

By default, the `DefaultAllocator` is used, which uses Julia's built-in memory management system.
Optionally, it can be useful to use the `ManualAllocator`, as the manual memory management reduces the pressure on the garbage collector.
In particular in multi-threaded applications, this can sometimes lead to a significant performance improvement.

Finally, users can also opt to use the `Bumper.jl` system, which pre-allocates a slab of memory that can be re-used afterwards.
This is available through a package extension for `Bumper`.
Here, the `allocator` object is simply the provided buffer, which is then used to store the intermediate tensors.

```julia
using TensorOperations, Bumper
buf = Bumper.default_buffer()
@no_escape buf begin
    @tensor allocator = buf A[i,j] := B[i,k] * C[k,j]
end
```

For convenience, the construction above is also provided in a specialized macro form which is fully equivalent:

```@docs
@butensor
```

### Custom Allocators

Users can also define their own allocators, to facilitate experimentation with new implementations.
Here, no restriction is made on the type of the allocator, and any object can be passed as an allocator.
The required methods to implement are [`tensoralloc`](@ref) and [`tensorfree!`](@ref).

diff --git a/ext/TensorOperationsBumperExt.jl b/ext/TensorOperationsBumperExt.jl
new file mode 100644
index 00000000..44c37195
--- /dev/null
+++ b/ext/TensorOperationsBumperExt.jl
@@ -0,0 +1,47 @@
+module TensorOperationsBumperExt
+
+using TensorOperations
+using Bumper
+
+function TensorOperations.tensoralloc(::Type{A}, structure, ::Val{istemp},
+                                      buf::Union{SlabBuffer,AllocBuffer}) where {A<:AbstractArray,
+                                                                                 istemp}
+    # TODO: remove the `ndims` check if this is fixed in Bumper / StrideArraysCore
+    if istemp & ndims(A) > 0
+        return Bumper.alloc!(buf, eltype(A), structure...)
+    else
+        return TensorOperations.tensoralloc(A, structure, Val(istemp))
+    end
+end
+
+function TensorOperations.blas_contract!(C, A, pA, conjA, B, pB, conjB, pAB, α, β,
+                                         allocator::Union{SlabBuffer,AllocBuffer})
+    @no_escape allocator begin
+        C = Base.@invoke TensorOperations.blas_contract!(C::Any, A::Any, pA::Any,
+                                                         conjA::Any,
+                                                         B::Any, pB::Any,
+                                                         conjB::Any, pAB::Any, α::Any,
+                                                         β::Any,
+                                                         allocator::Any)
+    end
+    return C
+end
+
+function TensorOperations._butensor(src, ex...)
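+    # expand `@butensor ex...` into a block that grabs Bumper's default buffer, saves a
+    # checkpoint, runs `@tensor` with `allocator = buffer` added in front of the given
+    # expressions, and restores the checkpoint before returning the result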
+ buf_sym = gensym("buffer") + cp_sym = gensym("checkpoint") + res_sym = gensym("result") + + # TODO: there is no check for doubled tensor kwargs + newex = quote + $buf_sym = $(Expr(:call, GlobalRef(Bumper, :default_buffer))) + $cp_sym = $(Expr(:call, GlobalRef(Bumper, :checkpoint_save), buf_sym)) + $res_sym = $(Expr(:macrocall, GlobalRef(TensorOperations, Symbol("@tensor")), + src, :(allocator = $buf_sym), ex...)) + $(Expr(:call, GlobalRef(Bumper, :checkpoint_restore!), cp_sym)) + $res_sym + end + return return Base.remove_linenums!(newex) +end + +end diff --git a/ext/TensorOperationscuTENSORExt.jl b/ext/TensorOperationscuTENSORExt.jl index b5cb8d40..6fe63995 100644 --- a/ext/TensorOperationscuTENSORExt.jl +++ b/ext/TensorOperationscuTENSORExt.jl @@ -29,9 +29,27 @@ using CUDA.Adapt: adapt using Strided using TupleTools: TupleTools as TT -StridedViewsCUDAExt = Base.get_extension(Strided.StridedViews, :StridedViewsCUDAExt) +const StridedViewsCUDAExt = @static if isdefined(Base, :get_extension) + Base.get_extension(Strided.StridedViews, :StridedViewsCUDAExt) +else + Strided.StridedViews.StridedViewsCUDAExt +end isnothing(StridedViewsCUDAExt) && error("StridedViewsCUDAExt not found") +#------------------------------------------------------------------------------------------- +# @cutensor macro +#------------------------------------------------------------------------------------------- +function TensorOperations._cutensor(src, ex...) + # TODO: there is no check for doubled tensor kwargs + return Expr(:macrocall, GlobalRef(TensorOperations, Symbol("@tensor")), + src, + Expr(:(=), :backend, + Expr(:call, GlobalRef(TensorOperations, :cuTENSORBackend))), + Expr(:(=), :allocator, + Expr(:call, GlobalRef(TensorOperations, :CUDAAllocator))), + ex...) +end + #------------------------------------------------------------------------------------------- # Backend selection and passing #------------------------------------------------------------------------------------------- @@ -63,7 +81,7 @@ function TO.tensoradd!(C::AbstractArray, C_cuda, isview = _custrided(C, allocator) A_cuda, = _custrided(A, allocator) tensoradd!(C_cuda, A_cuda, pA, conjA, α, β, backend, allocator) - isview || copy!(C, C_cuda) + isview || copy!(C, C_cuda.parent) return C end function TO.tensorcontract!(C::AbstractArray, @@ -77,7 +95,7 @@ function TO.tensorcontract!(C::AbstractArray, B_cuda, = _custrided(B, allocator) tensorcontract!(C_cuda, A_cuda, pA, conjA, B_cuda, pB, conjB, pAB, α, β, backend, allocator) - isview || copy!(C, C_cuda) + isview || copy!(C, C_cuda.parent) return C end function TO.tensortrace!(C::AbstractArray, @@ -87,7 +105,7 @@ function TO.tensortrace!(C::AbstractArray, C_cuda, isview = _custrided(C, allocator) A_cuda, = _custrided(A, allocator) tensortrace!(C_cuda, A_cuda, p, q, conjA, α, β, backend, allocator) - isview || copy!(C, C_cuda) + isview || copy!(C, C_cuda.parent) return C end @@ -125,7 +143,7 @@ function CUDAAllocator() end function TO.tensoralloc_add(TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool, - istemp::Bool, + istemp::Val, allocator::CUDAAllocator) ttype = CuArray{TC,TO.numind(pA)} structure = TO.tensoradd_structure(A, pA, conjA) @@ -136,7 +154,7 @@ function TO.tensoralloc_contract(TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool, B::AbstractArray, pB::Index2Tuple, conjB::Bool, pAB::Index2Tuple, - istemp::Bool, + istemp::Val, allocator::CUDAAllocator) ttype = CuArray{TC,TO.numind(pAB)} structure = TO.tensorcontract_structure(A, pA, conjA, B, pB, conjB, pAB) @@ -150,8 +168,9 @@ end # 
NOTE: the general implementation in the `DefaultAllocator` case works just fine, without # selecting an explicit memory model -function TO.tensoralloc(::Type{CuArray{T,N}}, structure, istemp::Bool, - allocator::CUDAAllocator{Mout,Min,Mtemp}) where {T,N,Mout,Min,Mtemp} +function TO.tensoralloc(::Type{CuArray{T,N}}, structure, ::Val{istemp}, + allocator::CUDAAllocator{Mout,Min,Mtemp}) where {T,N,istemp,Mout, + Min,Mtemp} M = istemp ? Mtemp : Mout return CuArray{T,N,M}(undef, structure) end diff --git a/src/TensorOperations.jl b/src/TensorOperations.jl index a15f7bca..d3e6ff7d 100644 --- a/src/TensorOperations.jl +++ b/src/TensorOperations.jl @@ -7,6 +7,7 @@ using LinearAlgebra using LinearAlgebra: mul!, BlasFloat using Strided using StridedViews: isstrided +using PtrArrays using LRUCache using Base.Meta: isexpr @@ -15,7 +16,7 @@ using Base.Meta: isexpr #--------- # export macro API export @tensor, @tensoropt, @tensoropt_verbose, @optimalcontractiontree, @notensor, @ncon -export @cutensor +export @cutensor, @butensor # export function based API export ncon diff --git a/src/implementation/allocator.jl b/src/implementation/allocator.jl index 6d774706..fdd6595a 100644 --- a/src/implementation/allocator.jl +++ b/src/implementation/allocator.jl @@ -1,5 +1,5 @@ # ------------------------------------------------------------------------------------------ -# DefaultAllocator +# Allocator backends # ------------------------------------------------------------------------------------------ """ DefaultAllocator() @@ -30,6 +30,17 @@ parameters `Min`, `Mout`, `Mtemp`` can be any of the CUDA.jl memory types, i.e. """ struct CUDAAllocator{Mout,Min,Mtemp} end +""" + ManualAllocator() + +Allocator that bypasses Julia's memory management for temporary tensors by leveraging `Libc.malloc` +and `Libc.free` directly. This can be useful for reducing the pressure on the garbage collector. +This backend will allocate using `DefaultAllocator` for output tensors that escape the `@tensor` +block, which will thus still be managed using Julia's GC. The other tensors will be backed by +`PtrArray` instances, from `PtrArrays.jl`, thus requiring compatibility with that interface. +""" +struct ManualAllocator end + # ------------------------------------------------------------------------------------------ # Generic implementation # ------------------------------------------------------------------------------------------ @@ -49,7 +60,7 @@ Promote the scalar types of a tensor addition to a common type. promote_add(args...) = Base.promote_op(+, args...) """ - tensoralloc_add(TC, A, pA, conjA, [istemp=false, allocator]) + tensoralloc_add(TC, A, pA, conjA, [istemp=Val(false), allocator]) Allocate a tensor `C` of scalar type `TC` that would be the result of @@ -61,15 +72,15 @@ used to implement different allocation strategies. See also [`tensoralloc`](@ref) and [`tensorfree!`](@ref). 
""" -function tensoralloc_add(TC, A, pA::Index2Tuple, conjA::Bool, istemp::Bool=false, +function tensoralloc_add(TC, A, pA::Index2Tuple, conjA::Bool, istemp::Val=Val(false), allocator=DefaultAllocator()) ttype = tensoradd_type(TC, A, pA, conjA) structure = tensoradd_structure(A, pA, conjA) - return tensoralloc(ttype, structure, istemp, allocator)::ttype + return tensoralloc(ttype, structure, istemp, allocator) end """ - tensoralloc_contract(TC, A, pA, conjA, B, pB, conjB, pAB, [istemp=false, allocator]) + tensoralloc_contract(TC, A, pA, conjA, B, pB, conjB, pAB, [istemp=Val(false), allocator]) Allocate a tensor `C` of scalar type `TC` that would be the result of @@ -84,11 +95,11 @@ See also [`tensoralloc`](@ref) and [`tensorfree!`](@ref). function tensoralloc_contract(TC, A, pA::Index2Tuple, conjA::Bool, B, pB::Index2Tuple, conjB::Bool, - pAB::Index2Tuple, istemp::Bool=false, + pAB::Index2Tuple, istemp::Val=Val(false), allocator=DefaultAllocator()) ttype = tensorcontract_type(TC, A, pA, conjA, B, pB, conjB, pAB) structure = tensorcontract_structure(A, pA, conjA, B, pB, conjB, pAB) - return tensoralloc(ttype, structure, istemp, allocator)::ttype + return tensoralloc(ttype, structure, istemp, allocator) end # ------------------------------------------------------------------------------------------ @@ -118,8 +129,9 @@ function tensoradd_structure(A::AbstractArray, pA::Index2Tuple, conjA::Bool) return size.(Ref(A), linearize(pA)) end -function tensorcontract_type(TC, A::AbstractArray, pA, conjA, - B::AbstractArray, pB, conjB, pAB) +function tensorcontract_type(TC, A::AbstractArray, pA::Index2Tuple, conjA::Bool, + B::AbstractArray, pB::Index2Tuple, conjB::Bool, + pAB::Index2Tuple) T1 = tensoradd_type(TC, A, pAB, conjA) T2 = tensoradd_type(TC, B, pAB, conjB) if T1 == T2 @@ -129,14 +141,15 @@ function tensorcontract_type(TC, A::AbstractArray, pA, conjA, end end -function tensorcontract_structure(A::AbstractArray, pA, conjA, - B::AbstractArray, pB, conjB, pAB) +function tensorcontract_structure(A::AbstractArray, pA::Index2Tuple, conjA::Bool, + B::AbstractArray, pB::Index2Tuple, conjB::Bool, + pAB::Index2Tuple) return let lA = length(pA[1]) map(n -> n <= lA ? size(A, pA[1][n]) : size(B, pB[2][n - lA]), linearize(pAB)) end end -function tensoralloc(ttype, structure, istemp=false, allocator=DefaultAllocator()) +function tensoralloc(ttype, structure, ::Val=Val(false), allocator=DefaultAllocator()) C = ttype(undef, structure) # fix an issue with undefined references for strided arrays if !isbitstype(scalartype(ttype)) @@ -145,6 +158,21 @@ function tensoralloc(ttype, structure, istemp=false, allocator=DefaultAllocator( return C end -function tensorfree!(C, allocator=DefaultAllocator()) +tensorfree!(C, allocator=DefaultAllocator()) = nothing + +# ------------------------------------------------------------------------------------------ +# ManualAllocator implementation +# ------------------------------------------------------------------------------------------ +function tensoralloc(::Type{A}, structure, ::Val{istemp}, + ::ManualAllocator) where {A<:AbstractArray,istemp} + if istemp + return malloc(eltype(A), structure...) 
+ else + return tensoralloc(A, structure, Val(istemp)) + end +end + +function tensorfree!(C::PtrArray, ::ManualAllocator) + free(C) return nothing end diff --git a/src/implementation/base.jl b/src/implementation/base.jl index 94f45bf7..499ea806 100644 --- a/src/implementation/base.jl +++ b/src/implementation/base.jl @@ -36,7 +36,7 @@ function tensoradd!(C::AbstractArray, # can we assume that C is mutable? # is there more functionality in base that we can use? - Atemp = tensoralloc_add(eltype(A), A, pA, conjA, true, allocator) + Atemp = tensoralloc_add(eltype(A), A, pA, conjA, Val(true), allocator) Ã = permutedims!(Atemp, A, linearize(pA)) if conjA if iszero(β) @@ -51,6 +51,7 @@ function tensoradd!(C::AbstractArray, C .= β .* C .+ α .* Ã end end + tensorfree!(Atemp, allocator) return C end @@ -100,9 +101,8 @@ function tensortrace!(C::AbstractArray, so = TupleTools.getindices(szA, linearize(p)) st = prod(TupleTools.getindices(szA, q[1])) perm = (linearize(p)..., linearize(q)...) - Atemp = tensoralloc_add(eltype(A), A, (perm, ()), conjA, true, allocator) + Atemp = tensoralloc_add(eltype(A), A, (perm, ()), conjA, Val(true), allocator) Ã = reshape(permutedims!(Atemp, A, perm), (prod(so), st, st)) - if conjA if iszero(β) C .= α .* conj.(reshape(view(Ã, :, 1, 1), so)) @@ -122,6 +122,7 @@ function tensortrace!(C::AbstractArray, C .+= α .* reshape(view(Ã, :, i, i), so) end end + tensorfree!(Atemp, allocator) return C end @@ -183,37 +184,37 @@ function tensorcontract!(C::AbstractArray, sc1 = prod(sc) AB = tensoralloc_contract(eltype(C), A, pA, conjA, B, pB, conjB, - trivialpermutation(pAB), true, allocator) + trivialpermutation(pAB), Val(true), allocator) ÃB̃ = reshape(AB, (soA1, soB1)) if conjA && conjB - Atemp = tensoralloc_add(eltype(C), A, reverse(pA), conjA, true, allocator) - Btemp = tensoralloc_add(eltype(C), B, reverse(pB), conjB, true, allocator) - Ã = reshape(permutedims!(Atemp, A, linearize(reverse(pA))), (sc1, soA1)) - B̃ = reshape(permutedims!(Btemp, B, linearize(reverse(pB))), (soB1, sc1)) - mul!(ÃB̃, adjoint(Ã), adjoint(B̃)) + Atemp = tensoralloc_add(eltype(C), A, reverse(pA), conjA, Val(true), allocator) + Btemp = tensoralloc_add(eltype(C), B, reverse(pB), conjB, Val(true), allocator) + Ã = adjoint(reshape(permutedims!(Atemp, A, linearize(reverse(pA))), (sc1, soA1))) + B̃ = adjoint(reshape(permutedims!(Btemp, B, linearize(reverse(pB))), (soB1, sc1))) elseif conjA - Atemp = tensoralloc_add(eltype(C), A, reverse(pA), conjA, true, allocator) - Btemp = tensoralloc_add(eltype(C), B, pB, conjB, true, allocator) - Ã = reshape(permutedims!(Atemp, A, linearize(reverse(pA))), (sc1, soA1)) + Atemp = tensoralloc_add(eltype(C), A, reverse(pA), conjA, Val(true), allocator) + Btemp = tensoralloc_add(eltype(C), B, pB, conjB, Val(true), allocator) + Ã = adjoint(reshape(permutedims!(Atemp, A, linearize(reverse(pA))), (sc1, soA1))) B̃ = reshape(permutedims!(Btemp, B, linearize(pB)), (sc1, soB1)) - mul!(ÃB̃, adjoint(Ã), B̃) elseif conjB - Atemp = tensoralloc_add(eltype(C), A, pA, conjA, true, allocator) - Btemp = tensoralloc_add(eltype(C), B, reverse(pB), conjB, true, allocator) + Atemp = tensoralloc_add(eltype(C), A, pA, conjA, Val(true), allocator) + Btemp = tensoralloc_add(eltype(C), B, reverse(pB), conjB, Val(true), allocator) Ã = reshape(permutedims!(Atemp, A, linearize(pA)), (soA1, sc1)) - B̃ = reshape(permutedims!(Btemp, B, linearize(reverse(pB))), (soB1, sc1)) - mul!(ÃB̃, Ã, adjoint(B̃)) + B̃ = adjoint(reshape(permutedims!(Btemp, B, linearize(reverse(pB))), (soB1, sc1))) else - Atemp = 
tensoralloc_add(eltype(C), A, pA, conjA, true, allocator) - Btemp = tensoralloc_add(eltype(C), B, pB, conjB, true, allocator) + Atemp = tensoralloc_add(eltype(C), A, pA, conjA, Val(true), allocator) + Btemp = tensoralloc_add(eltype(C), B, pB, conjB, Val(true), allocator) Ã = reshape(permutedims!(Atemp, A, linearize(pA)), (soA1, sc1)) B̃ = reshape(permutedims!(Btemp, B, linearize(pB)), (sc1, soB1)) - mul!(ÃB̃, Ã, B̃) end + mul!(ÃB̃, Ã, B̃) + tensorfree!(Btemp, allocator) + tensorfree!(Atemp, allocator) + if istrivialpermutation(linearize(pAB)) pAB = AB else - pABtemp = tensoralloc_add(eltype(C), AB, pAB, false, true, allocator) + pABtemp = tensoralloc_add(eltype(C), AB, pAB, false, Val(true), allocator) pAB = permutedims!(pABtemp, AB, linearize(pAB)) end if iszero(β) @@ -221,5 +222,7 @@ function tensorcontract!(C::AbstractArray, else C .= β .* C .+ α .* pAB end + pAB === AB || tensorfree!(pABtemp, allocator) + tensorfree!(AB, allocator) return C end diff --git a/src/implementation/functions.jl b/src/implementation/functions.jl index d60c12e7..7444b0bd 100644 --- a/src/implementation/functions.jl +++ b/src/implementation/functions.jl @@ -1,12 +1,31 @@ # methods/simple.jl # # Method-based access to tensor operations using simple definitions. + +# utility function +function _kwargs2args(; kwargs...) + for k in keys(kwargs) + if !(k in (:backend, :allocator)) + throw(ArgumentError("unknown keyword argument: $k")) + end + end + if haskey(kwargs, :backend) && haskey(kwargs, :allocator) + return (kwargs[:backend], kwargs[:allocator]) + elseif haskey(kwargs, :allocator) + return (DefaultBackend(), kwargs[:allocator]) + elseif haskey(kwargs, :backend) + return (kwargs[:backend],) + else + return () + end +end + # ------------------------------------------------------------------------------------------ # tensorcopy # ------------------------------------------------------------------------------------------ """ - tensorcopy([IC=IA], A, IA, [conjA=false, [α=1]]) - tensorcopy(A, pA::Index2Tuple, conjA, α, [backend]) # expert mode + tensorcopy([IC=IA], A, IA, [conjA=false, [α=1]]; [backend=..., allocator=...]) + tensorcopy(A, pA::Index2Tuple, conjA, α, [backend, allocator]) # expert mode Create a copy of `A`, where the dimensions of `A` are assigned indices from the iterable `IA` and the indices of the copy are contained in `IC`. Both iterables @@ -16,31 +35,34 @@ The result of this method is equivalent to `α * permutedims(A, pA)` where `pA` permutation such that `IC = IA[pA]`. The implementation of `tensorcopy` is however more efficient on average, especially if `Threads.nthreads() > 1`. -Optionally, the flag `conjA` can be used to specify whether the input tensor should be -conjugated (`true`) or not (`false`). +The optional argument `conjA` can be used to specify whether the input tensor should be +conjugated (`true`) or not (`false`), whereas `α` can be used to scale the result. It is +also optional to specify a backend implementation to use, and an allocator to be used if +temporary tensor objects are needed. See also [`tensorcopy!`](@ref). """ function tensorcopy end -function tensorcopy(IC::Labels, A, IA::Labels, conjA::Bool=false, α::Number=One()) +function tensorcopy(IC::Labels, A, IA::Labels, conjA::Bool=false, α::Number=One(); + kwargs...) pA = add_indices(IA, IC) - return tensorcopy(A, pA, conjA, α) + return tensorcopy(A, pA, conjA, α, _kwargs2args(; kwargs...)...) 
end # default `IC` -function tensorcopy(A, IA::Labels, conjA::Bool=false, α::Number=One()) - return tensorcopy(IA, A, IA, conjA, α) +function tensorcopy(A, IA::Labels, conjA::Bool=false, α::Number=One(); kwargs...) + return tensorcopy(IA, A, IA, conjA, α; kwargs...) end # expert mode function tensorcopy(A, pA::Index2Tuple, conjA::Bool=false, α::Number=One(), backend=DefaultBackend(), allocator=DefaultAllocator()) TC = promote_add(scalartype(A), scalartype(α)) - C = tensoralloc_add(TC, A, pA, conjA) + C = tensoralloc_add(TC, A, pA, conjA, Val(false), allocator) return tensorcopy!(C, A, pA, conjA, α, backend, allocator) end """ - tensorcopy!(C, A, pA::Index2Tuple, conjA=false, α=1, [backend]) + tensorcopy!(C, A, pA::Index2Tuple, conjA=false, α=1, [backend, allocator]) Copy the contents of tensor `A` into `C`, where the dimensions `A` are permuted according to the permutation and repartition `pA`. @@ -48,7 +70,7 @@ the permutation and repartition `pA`. The result of this method is equivalent to `α * permutedims!(C, A, pA)`. Optionally, the flag `conjA` can be used to specify whether the input tensor should be -conjugated (`true`) or not (`false`). +conjugated (`true`) or not (`false`). !!! warning The object `C` must not be aliased with `A`. @@ -64,8 +86,8 @@ end # tensoradd # ------------------------------------------------------------------------------------------ """ - tensoradd([IC=IA], A, IA, [conjA], B, IB, [conjB], [α=1, [β=1]]) - tensoradd(A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, pB::Index2Tuple, conjB, α=1, β=1, [backend]) # expert mode + tensoradd([IC=IA], A, IA, [conjA], B, IB, [conjB], [α=1, [β=1]]; [backend=..., allocator=...]) + tensoradd(A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, conjB, α=1, β=1, [backend, allocator]) # expert mode Return the result of adding arrays `A` and `B` where the iterables `IA` and `IB` denote how the array data should be permuted in order to be added. More specifically, @@ -81,23 +103,26 @@ See also [`tensoradd!`](@ref). """ function tensoradd end -function tensoradd(IC::Labels, A, IA::Labels, conjA::Bool, B, IB::Labels, - conjB::Bool, α::Number=One(), β::Number=One()) - return tensoradd(A, add_indices(IA, IC), conjA, B, add_indices(IB, IC), conjB, α, β) +function tensoradd(IC::Labels, A, IA::Labels, conjA::Bool, B, IB::Labels, conjB::Bool, + α::Number=One(), β::Number=One(); kwargs...) + pA = add_indices(IA, IC) + pB = add_indices(IB, IC) + return tensoradd(A, pA, conjA, B, pB, conjB, α, β, _kwargs2args(; kwargs...)...) end # default `IC` function tensoradd(A, IA::Labels, conjA::Bool, B, IB::Labels, conjB::Bool, - α::Number=One(), β::Number=One()) - return tensoradd(IA, A, IA, conjA, B, IB, conjB, α, β) + α::Number=One(), β::Number=One(); kwargs...) + return tensoradd(IA, A, IA, conjA, B, IB, conjB, α, β; kwargs...) end # default `conjA` and `conjB` -function tensoradd(IC::Labels, A, IA::Labels, B, IB::Labels, α::Number=One(), - β::Number=One()) +function tensoradd(IC::Labels, A, IA::Labels, B, IB::Labels, + α::Number=One(), β::Number=One(); kwargs...) return tensoradd(IC, A, IA, false, B, IB, false, α, β) end # default `IC`, `conjA` and `conjB` -function tensoradd(A, IA::Labels, B, IB::Labels, α::Number=One(), β::Number=One()) - return tensoradd(IA, A, IA, B, IB, α, β) +function tensoradd(A, IA::Labels, B, IB::Labels, α::Number=One(), β::Number=One(); + kwargs...) + return tensoradd(IA, A, IA, B, IB, α, β; kwargs...) 
end # expert mode function tensoradd(A, pA::Index2Tuple, conjA::Bool, @@ -105,7 +130,7 @@ function tensoradd(A, pA::Index2Tuple, conjA::Bool, α::Number=One(), β::Number=One(), backend=DefaultBackend(), allocator=DefaultAllocator()) TC = promote_add(scalartype(A), scalartype(B), scalartype(α), scalartype(β)) - C = tensoralloc_add(TC, A, pA, conjA, false, allocator) + C = tensoralloc_add(TC, A, pA, conjA, Val(false), allocator) C = tensorcopy!(C, A, pA, conjA, α, backend, allocator) return tensoradd!(C, B, pB, conjB, β, One(), backend, allocator) end @@ -114,8 +139,8 @@ end # tensortrace # ------------------------------------------------------------------------------------------ """ - tensortrace([IC], A, IA, [conjA], [α=1]) - tensortrace(A, p::Index2Tuple, q::Index2Tuple, conjA, α=1, [backend]) # expert mode + tensortrace([IC], A, IA, [conjA], [α=1]; [backend=..., allocator=...]) + tensortrace(A, p::Index2Tuple, q::Index2Tuple, conjA, α=1, [backend, allocator]) # expert mode Trace or contract pairs of indices of tensor `A`, by assigning them identical indices in the iterable `IA`. The untraced indices, which are assigned a unique index, can be reordered @@ -130,38 +155,36 @@ See also [`tensortrace!`](@ref). """ function tensortrace end +function tensortrace(IC::Labels, A, IA::Labels, conjA::Bool, α::Number=One(); kwargs...) + p, q = trace_indices(IA, IC) + return tensortrace(A, p, q, conjA, α, _kwargs2args(; kwargs...)...) +end # default `IC` -function tensortrace(A, IA::Labels, conjA::Bool, α::Number=One()) - return tensortrace(unique2(IA), A, IA, conjA, α) +function tensortrace(A, IA::Labels, conjA::Bool, α::Number=One(); kwargs...) + return tensortrace(unique2(IA), A, IA, conjA, α; kwargs...) end # default `conjA` -function tensortrace(IC::Labels, A, IA::Labels, α::Number=One()) - return tensortrace(IC, A, IA, false, α) +function tensortrace(IC::Labels, A, IA::Labels, α::Number=One(); kwargs...) + return tensortrace(IC, A, IA, false, α; kwargs...) end # default `IC` and `conjA` -function tensortrace(A, IA::Labels, α::Number=One()) - return tensortrace(unique2(IA), A, IA, false, α) -end -# labels to indices -function tensortrace(IC::Labels, A, IA::Labels, conjA::Bool, α::Number=One()) - p, q = trace_indices(IA, IC) - return tensortrace(A, p, q, conjA, α) +function tensortrace(A, IA::Labels, α::Number=One(); kwargs...) + return tensortrace(unique2(IA), A, IA, false, α; kwargs...) 
end # expert mode function tensortrace(A, p::Index2Tuple, q::Index2Tuple, conjA::Bool, α::Number=One(), backend=DefaultBackend(), allocator=DefaultAllocator()) TC = promote_contract(scalartype(A), scalartype(α)) - C = tensoralloc_add(TC, A, p, conjA, false, allocator) + C = tensoralloc_add(TC, A, p, conjA, Val(false), allocator) return tensortrace!(C, A, p, q, conjA, α, Zero(), backend, allocator) end # ------------------------------------------------------------------------------------------ # tensorcontract # ------------------------------------------------------------------------------------------ - """ - tensorcontract([IC], A, IA, [conjA], B, IB, [conjB], [α=1]) - tensorcontract(A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, conjB, pAB::Index2Tuple, α=1, [backend]) # expert mode + tensorcontract([IC], A, IA, [conjA], B, IB, [conjB], [α=1]; [backend=..., allocator=...]) + tensorcontract(A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, conjB, pAB::Index2Tuple, α=1, [backend, allocator]) # expert mode Contract indices of tensor `A` with corresponding indices in tensor `B` by assigning them identical labels in the iterables `IA` and `IB`. The indices of the resulting @@ -180,22 +203,23 @@ See also [`tensorcontract!`](@ref). function tensorcontract end function tensorcontract(IC::Labels, A, IA::Labels, conjA::Bool, B, IB::Labels, conjB::Bool, - α::Number=One()) + α::Number=One(); kwargs...) pA, pB, pAB = contract_indices(IA, IB, IC) - return tensorcontract(A, pA, conjA, B, pB, conjB, pAB, α) + return tensorcontract(A, pA, conjA, B, pB, conjB, pAB, α, _kwargs2args(; kwargs...)...) end # default `IC` function tensorcontract(A, IA::Labels, conjA::Bool, B, IB::Labels, conjB::Bool, - α::Number=One()) - return tensorcontract(symdiff(IA, IB), A, IA, conjA, B, IB, conjB, α) + α::Number=One(); kwargs...) + return tensorcontract(symdiff(IA, IB), A, IA, conjA, B, IB, conjB, α; kwargs...) end # default `conjA` and `conjB` -function tensorcontract(IC::Labels, A, IA::Labels, B, IB::Labels, α::Number=One()) - return tensorcontract(IC, A, IA, false, B, IB, false, α) +function tensorcontract(IC::Labels, A, IA::Labels, B, IB::Labels, α::Number=One(); + kwargs...) + return tensorcontract(IC, A, IA, false, B, IB, false, α; kwargs...) end # default `IC`, `conjA` and `conjB` -function tensorcontract(A, IA::Labels, B, IB::Labels, α::Number=One()) - return tensorcontract(symdiff(IA, IB), A, IA, false, B, IB, false, α) +function tensorcontract(A, IA::Labels, B, IB::Labels, α::Number=One(); kwargs...) + return tensorcontract(symdiff(IA, IB), A, IA, false, B, IB, false, α; kwargs...) 
end # expert mode function tensorcontract(A, pA::Index2Tuple, conjA::Bool, @@ -203,7 +227,7 @@ function tensorcontract(A, pA::Index2Tuple, conjA::Bool, pAB::Index2Tuple, α::Number=One(), backend=DefaultBackend(), allocator=DefaultAllocator()) TC = promote_contract(scalartype(A), scalartype(B), scalartype(α)) - C = tensoralloc_contract(TC, A, pA, conjA, B, pB, conjB, pAB, false, allocator) + C = tensoralloc_contract(TC, A, pA, conjA, B, pB, conjB, pAB, Val(false), allocator) return tensorcontract!(C, A, pA, conjA, B, pB, conjB, pAB, α, Zero(), backend, allocator) end @@ -213,8 +237,8 @@ end # ------------------------------------------------------------------------------------------ """ - tensorproduct([IC], A, IA, [conjA], B, IB, [conjB], [α=1]) - tensorproduct(A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, conjB, pAB::Index2Tuple, α=1, [backend]) # expert mode + tensorproduct([IC], A, IA, [conjA], B, IB, [conjB], [α=1]; [backend=..., allocator=...]) + tensorproduct(A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, conjB, pAB::Index2Tuple, α=1, [backend, allocator]) # expert mode Compute the tensor product (outer product) of two tensors `A` and `B`, i.e. returns a new tensor `C` with `ndims(C) = ndims(A) + ndims(B)`. The indices of the output tensor are @@ -231,22 +255,22 @@ See also [`tensorproduct!`](@ref) and [`tensorcontract`](@ref). function tensorproduct end function tensorproduct(IC::Labels, A, IA::Labels, conjA::Bool, B, IB::Labels, conjB::Bool, - α::Number=One()) + α::Number=One(); kwargs...) pA, pB, pAB = contract_indices(IA, IB, IC) - return tensorproduct(A, pA, conjA, B, pB, conjB, pAB, α) + return tensorproduct(A, pA, conjA, B, pB, conjB, pAB, α, _kwargs2args(; kwargs...)...) end # default `IC` function tensorproduct(A, IA::Labels, conjA::Bool, B, IB::Labels, conjB::Bool, - α::Number=One()) - return tensorproduct(vcat(IA, IB), A, IA, conjA, B, IB, conjB, α) + α::Number=One(); kwargs...) + return tensorproduct(vcat(IA, IB), A, IA, conjA, B, IB, conjB, α; kwargs...) end # default `conjA` and `conjB` -function tensorproduct(IC::Labels, A, IA::Labels, B, IB::Labels, α::Number=One()) - return tensorproduct(IC, A, IA, false, B, IB, false, α) +function tensorproduct(IC::Labels, A, IA::Labels, B, IB::Labels, α::Number=One(); kwargs...) + return tensorproduct(IC, A, IA, false, B, IB, false, α; kwargs...) end # default `IC`, `conjA` and `conjB` -function tensorproduct(A, IA::Labels, B, IB::Labels, α::Number=One()) - return tensorproduct(vcat(IA, IB), A, IA, false, B, IB, false, α) +function tensorproduct(A, IA::Labels, B, IB::Labels, α::Number=One(); kwargs...) + return tensorproduct(vcat(IA, IB), A, IA, false, B, IB, false, α; kwargs...) end # expert mode function tensorproduct(A, pA::Index2Tuple, conjA::Bool, @@ -259,11 +283,14 @@ function tensorproduct(A, pA::Index2Tuple, conjA::Bool, end """ - tensorproduct!(C, A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, conjB, pAB::Index2Tuple, α=1, β=0) + tensorproduct!(C, A, pA::Index2Tuple, conjA, B, pB::Index2Tuple, conjB, pAB::Index2Tuple, α=1, β=0, [backend, allocator]) Compute the tensor product (outer product) of two tensors `A` and `B`, i.e. a wrapper of [`tensorcontract!`](@ref) with no indices being contracted over. This method checks whether -the indices indeed specify a tensor product instead of a genuine contraction. +the indices indeed specify a tensor product instead of a genuine contraction. Optionally +specify a backend implementation to use, and an allocator to be used if temporary tensor +objects are needed. + !!! 
warning The object `C` must not be aliased with `A` or `B`. diff --git a/src/implementation/ncon.jl b/src/implementation/ncon.jl index f77256b2..414b2601 100644 --- a/src/implementation/ncon.jl +++ b/src/implementation/ncon.jl @@ -1,5 +1,5 @@ """ - ncon(tensorlist, indexlist, [conjlist, sym]; order = ..., output = ...) + ncon(tensorlist, indexlist, [conjlist, sym]; order = ..., output = ..., backend = ..., allocator = ...) Contract the tensors in `tensorlist` (of type `Vector` or `Tuple`) according to the network as specified by `indexlist`. Here, `indexlist` is a list (i.e. a `Vector` or `Tuple`) with @@ -24,7 +24,7 @@ See also the macro version [`@ncon`](@ref). """ function ncon(tensors, network, conjlist=fill(false, length(tensors)); - order=nothing, output=nothing) + order=nothing, output=nothing, kwargs...) length(tensors) == length(network) == length(conjlist) || throw(ArgumentError("number of tensors and of index lists should be the same")) isnconstyle(network) || throw(ArgumentError("invalid NCON network: $network")) @@ -32,31 +32,42 @@ function ncon(tensors, network, if length(tensors) == 1 if length(output′) == length(network[1]) - return tensorcopy(output′, tensors[1], network[1], conjlist[1]) + return tensorcopy(output′, tensors[1], network[1], conjlist[1]; kwargs...) else - return tensortrace(output′, tensors[1], network[1], conjlist[1]) + return tensortrace(output′, tensors[1], network[1], conjlist[1]; kwargs...) end end (tensors, network) = resolve_traces(tensors, network) tree = order === nothing ? ncontree(network) : indexordertree(network, order) - A, IA, CA = contracttree(tensors, network, conjlist, tree[1]) - B, IB, CB = contracttree(tensors, network, conjlist, tree[2]) + A, IA, conjA = contracttree(tensors, network, conjlist, tree[1]; kwargs...) + B, IB, conjB = contracttree(tensors, network, conjlist, tree[2]; kwargs...) IC = tuple(output′...) - - return tensorcontract(IC, A, IA, CA, B, IB, CB) + C = tensorcontract(IC, A, IA, conjA, B, IB, conjB; kwargs...) + allocator = haskey(kwargs, :allocator) ? kwargs[:allocator] : DefaultAllocator() + tree[1] isa Int || tensorfree!(A, allocator) + tree[2] isa Int || tensorfree!(B, allocator) + return C end -function contracttree(tensors, network, conjlist, tree) +function contracttree(tensors, network, conjlist, tree; kwargs...) @nospecialize if tree isa Int return tensors[tree], tuple(network[tree]...), (conjlist[tree]) end - A, IA, CA = contracttree(tensors, network, conjlist, tree[1]) - B, IB, CB = contracttree(tensors, network, conjlist, tree[2]) + A, IA, conjA = contracttree(tensors, network, conjlist, tree[1]; kwargs...) + B, IB, conjB = contracttree(tensors, network, conjlist, tree[2]; kwargs...) IC = tuple(symdiff(IA, IB)...) - C = tensorcontract(IC, A, IA, CA, B, IB, CB) + pA, pB, pAB = contract_indices(IA, IB, IC) + TC = promote_contract(scalartype(A), scalartype(B)) + allocator = haskey(kwargs, :allocator) ? kwargs[:allocator] : DefaultAllocator() + backend = haskey(kwargs, :backend) ? 
kwargs[:backend] : DefaultBackend() + C = tensoralloc_contract(TC, A, pA, conjA, B, pB, conjB, pAB, Val(true), allocator) + C = tensorcontract!(C, A, pA, conjA, B, pB, conjB, pAB, One(), Zero(), backend, + allocator) + tree[1] isa Int || tensorfree!(A, allocator) + tree[2] isa Int || tensorfree!(B, allocator) return C, IC, false end diff --git a/src/implementation/strided.jl b/src/implementation/strided.jl index 132ea4a8..51cf9c91 100644 --- a/src/implementation/strided.jl +++ b/src/implementation/strided.jl @@ -223,8 +223,7 @@ function blas_contract!(C, A, pA, conjA, B, pB, conjB, pAB, α, β, allocator) C_ = C _unsafe_blas_contract!(C_, A_, pA, B_, pB, ipAB, α, β) else - C_ = StridedView(TensorOperations.tensoralloc_add(TC, C, ipAB, false, true, - allocator)) + C_ = StridedView(tensoralloc_add(TC, C, ipAB, false, Val(true), allocator)) _unsafe_blas_contract!(C_, A_, pA, B_, pB, trivialpermutation(ipAB), one(TC), zero(TC)) tensoradd!(C, C_, pAB, false, α, β) @@ -257,8 +256,7 @@ end @inline function makeblascontractable(A, pA, TC, allocator) flagA = isblascontractable(A, pA) && eltype(A) == TC if !flagA - A_ = StridedView(TensorOperations.tensoralloc_add(TC, A, pA, false, true, - allocator)) + A_ = StridedView(tensoralloc_add(TC, A, pA, false, Val(true), allocator)) A = tensoradd!(A_, A, pA, false, One(), Zero()) pA = trivialpermutation(pA) end diff --git a/src/indexnotation/instantiators.jl b/src/indexnotation/instantiators.jl index e34170f1..a7527a4b 100644 --- a/src/indexnotation/instantiators.jl +++ b/src/indexnotation/instantiators.jl @@ -113,7 +113,7 @@ function instantiate_generaltensor(dst, β, ex::Expr, α, leftind::Vector{Any}, end if alloc ∈ (NewTensor, TemporaryTensor) TC = gensym("T_" * string(dst)) - istemporary = (alloc === TemporaryTensor) + istemporary = Val(alloc === TemporaryTensor) if scaltype === nothing TCval = α === One() ? instantiate_scalartype(src) : instantiate_scalartype(Expr(:call, :*, α, src)) @@ -260,7 +260,7 @@ function instantiate_contraction(dst, β, ex::Expr, α, leftind::Vector{Any}, else TCval = scaltype end - istemporary = alloc === TemporaryTensor + istemporary = Val(alloc === TemporaryTensor) initC = Expr(:block, Expr(:(=), TCsym, TCval), :($dst = tensoralloc_contract($TCsym, $A, $pA, $conjA, $B, $pB, $conjB, $pAB, $istemporary))) diff --git a/src/indexnotation/tensormacros.jl b/src/indexnotation/tensormacros.jl index b1f7b8bd..678aa983 100644 --- a/src/indexnotation/tensormacros.jl +++ b/src/indexnotation/tensormacros.jl @@ -304,15 +304,32 @@ transfer it back. This macro is equivalent to This macro requires the cuTENSOR library to be installed and loaded. This can be achieved by running `using cuTENSOR` or `import cuTENSOR` before using the macro. """ -macro cutensor(ex::Expr) - haskey(Base.loaded_modules, Base.identify_package("cuTENSOR")) || - throw(ArgumentError("cuTENSOR not loaded: add `using cuTENSOR` or `import cuTENSOR` before using `@cutensor`")) - parser = TensorParser() - push!(parser.postprocessors, - ex -> insertbackend(ex, - Expr(:call, GlobalRef(TensorOperations, :cuTENSORBackend)))) - push!(parser.postprocessors, - ex -> insertallocator(ex, - Expr(:call, GlobalRef(TensorOperations, :CUDAAllocator)))) - return esc(parser(ex)) +macro cutensor(ex...) 
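+    # `_cutensor` only gains a method once the cuTENSOR package extension is loaded,
+    # so an empty method table signals that the extension (and thus cuTENSOR) is missing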
+ if length(methods(_cutensor)) == 0 + throw(ArgumentError("cuTENSOR.jl not loaded: add `using cuTENSOR` or `import cuTENSOR` before using `@cutensor`")) + end + return esc(_cutensor(__source__, ex...)) +end + +function _cutensor end +""" + @butensor tensor_expr + +Use Bumper.jl to handle allocation of temporary tensors. This macro will use the default +buffer and automatically reset it after the tensor expression has been evaluated. This macro +is equivalent to `@no_escape @tensor tensor_expr` with all temporary allocations handled by +Bumper.jl. + +!!! note + This macro requires Bumper.jl to be installed and loaded. This can be achieved by running + `using Bumper` or `import Bumper` before using the macro. +""" +macro butensor(ex...) + if length(methods(_butensor)) == 0 + throw(ArgumentError("Bumper.jl not loaded: add `using Bumper` or `import Bumper` before using `@butensor`")) + end + return esc(_butensor(__source__, ex...)) end + +# construction to move implementation to a package extension +function _butensor end diff --git a/test/butensor.jl b/test/butensor.jl new file mode 100644 index 00000000..02cb3d0d --- /dev/null +++ b/test/butensor.jl @@ -0,0 +1,86 @@ +@testset "@butensor dependency check" begin + @test_throws ArgumentError begin + ex = :(@butensor A[a, b, c, d] := B[a, b, c, d]) + macroexpand(Main, ex) + end +end + +using Bumper +@testset "Bumper tests with eltype $T" for T in (Float32, ComplexF64) + D1, D2, D3 = 30, 40, 20 + d1, d2 = 2, 3 + + A1 = randn(T, D1, d1, D2) + A2 = randn(T, D2, d2, D3) + ρₗ = randn(T, D1, D1) + ρᵣ = randn(T, D3, D3) + H = randn(T, d1, d2, d1, d2) + + @tensor begin + HRAA1[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + ρᵣ[c', c] * + H[s1, s2, t1, t2] + E1 = ρₗ[a', a] * A1[a, s, b] * A2[b, s', c] * ρᵣ[c, c'] * H[t, t', s, s'] * + conj(A1[a', t, b']) * conj(A2[b', t', c']) + end + + # butensor implementation + @butensor begin + HRAA2[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + ρᵣ[c', c] * + H[s1, s2, t1, t2] + E2 = ρₗ[a', a] * A1[a, s, b] * A2[b, s', c] * ρᵣ[c, c'] * H[t, t', s, s'] * + conj(A1[a', t, b']) * conj(A2[b', t', c']) + end + @test HRAA2 isa Array{T,4} + @test E2 isa T + @test HRAA2 ≈ HRAA1 + @test E2 ≈ E1 + + # manual equivalent + @no_escape @tensor allocator = default_buffer() begin + HRAA3[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + ρᵣ[c', c] * + H[s1, s2, t1, t2] + E3 = ρₗ[a', a] * A1[a, s, b] * A2[b, s', c] * ρᵣ[c, c'] * H[t, t', s, s'] * + conj(A1[a', t, b']) * conj(A2[b', t', c']) + end + @test HRAA3 isa Array{T,4} + @test E3 isa T + @test HRAA3 ≈ HRAA1 + @test E3 ≈ E1 + + # new buffer / completely manual + slabsize = typeof(default_buffer()).parameters[1] >> 1 # take slab size half as big + slabbuf = SlabBuffer{slabsize}() + begin + local cp = Bumper.checkpoint_save(slabbuf) + @tensor allocator = slabbuf begin + HRAA4[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + ρᵣ[c', c] * + H[s1, s2, t1, t2] + E4 = ρₗ[a', a] * A1[a, s, b] * A2[b, s', c] * ρᵣ[c, c'] * H[t, t', s, s'] * + conj(A1[a', t, b']) * conj(A2[b', t', c']) + end + bufferlength = slabsize * length(slabbuf.slabs) + Bumper.checkpoint_restore!(cp) + end + @test HRAA4 isa Array{T,4} + @test E4 isa T + @test HRAA4 ≈ HRAA1 + @test E4 ≈ E1 + + # allocbuffer + allocbuf = AllocBuffer(bufferlength) + @no_escape allocbuf @tensor allocator = allocbuf begin + HRAA5[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + ρᵣ[c', c] * + H[s1, s2, t1, t2] + E5 = ρₗ[a', a] * A1[a, s, b] * A2[b, s', c] * ρᵣ[c, c'] * 
H[t, t', s, s'] * + conj(A1[a', t, b']) * conj(A2[b', t', c']) + end + @test HRAA5 isa Array{T,4} + @test E5 isa T + @test HRAA5 ≈ HRAA1 + @test E5 ≈ E1 +end diff --git a/test/cutensor.jl b/test/cutensor.jl index 61ee3e79..ccf1bf98 100644 --- a/test/cutensor.jl +++ b/test/cutensor.jl @@ -1,4 +1,4 @@ -@testset "cuTENSOR dependency check" begin +@testset "@cutensor dependency check" begin @test_throws ArgumentError begin ex = :(@cutensor A[a, b, c, d] := B[a, b, c, d]) macroexpand(Main, ex) @@ -10,12 +10,14 @@ if cuTENSOR.has_cutensor() using CUDA using LinearAlgebra: norm using TensorOperations: IndexError + using TensorOperations: cuTENSORBackend, CUDAAllocator @testset "elementary operations" verbose = true begin @testset "tensorcopy" begin A = randn(Float32, (3, 5, 4, 6)) @tensor C1[4, 1, 3, 2] := A[1, 2, 3, 4] @tensor C2[4, 1, 3, 2] := CuArray(A)[1, 2, 3, 4] + @test C2 isa CuArray @test collect(C2) ≈ C1 end @@ -122,16 +124,43 @@ if cuTENSOR.has_cutensor() H = randn(T, d1, d2, d1, d2) @tensor begin - HrA12[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + HRAA1[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * ρᵣ[c', c] * H[s1, s2, t1, t2] end @tensor begin - HrA12′[a, s1, s2, c] := CuArray(ρₗ)[a, a'] * CuArray(A1)[a', t1, b] * - CuArray(A2)[b, t2, c'] * CuArray(ρᵣ)[c', c] * - CuArray(H)[s1, s2, t1, t2] + HRAA2[a, s1, s2, c] := CuArray(ρₗ)[a, a'] * CuArray(A1)[a', t1, b] * + CuArray(A2)[b, t2, c'] * CuArray(ρᵣ)[c', c] * + CuArray(H)[s1, s2, t1, t2] + end + @test HRAA2 isa CuArray{T} + @test collect(HRAA2) ≈ HRAA1 + + cumemtypes = (CUDA.DeviceMemory, CUDA.UnifiedMemory, CUDA.HostMemory) + for Mout in cumemtypes + Min = CUDA.DeviceMemory + Mtemp = CUDA.DeviceMemory + allocator = CUDAAllocator{Mout,Min,Mtemp}() + @tensor backend = cuTENSORBackend() allocator = allocator begin + HRAA3[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + ρᵣ[c', c] * + H[s1, s2, t1, t2] + end + @test HRAA3 isa CuArray{T,4,Mout} + @test collect(HRAA3) ≈ HRAA1 + end + for Min in cumemtypes + Mout = CUDA.UnifiedMemory + Mtemp = CUDA.UnifiedMemory + allocator = CUDAAllocator{Mout,Min,Mtemp}() + @tensor backend = cuTENSORBackend() allocator = allocator begin + HRAA3[a, s1, s2, c] := ρₗ[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * + ρᵣ[c', c] * + H[s1, s2, t1, t2] + end + @test HRAA3 isa CuArray{T,4,Mout} + @test collect(HRAA3) ≈ HRAA1 end - @test collect(HrA12′) ≈ HrA12 @tensor begin E1 = ρₗ[a', a] * A1[a, s, b] * A2[b, s', c] * ρᵣ[c, c'] * H[t, t', s, s'] * @@ -140,7 +169,11 @@ if cuTENSOR.has_cutensor() CuArray(ρᵣ)[c, c'] * CuArray(H)[t, t', s, s'] * conj(CuArray(A1)[a', t, b']) * conj(CuArray(A2)[b', t', c']) end - @test E2 ≈ E1 + @tensor backend = cuTENSORBackend() allocator = CUDAAllocator() begin + E3 = ρₗ[a', a] * A1[a, s, b] * A2[b, s', c] * ρᵣ[c, c'] * H[t, t', s, s'] * + conj(A1[a', t, b']) * conj(A2[b', t', c']) + end + @test E1 ≈ E2 ≈ E3 end end @@ -150,10 +183,10 @@ if cuTENSOR.has_cutensor() @tensor C1[4, 1, 3, 2] := A[1, 2, 3, 4] @cutensor C2[4, 1, 3, 2] := A[1, 2, 3, 4] @test C1 ≈ collect(C2) - @test_throws TensorOperations.IndexError begin + @test_throws IndexError begin @cutensor C[1, 2, 3, 4] := A[1, 2, 3] end - @test_throws TensorOperations.IndexError begin + @test_throws IndexError begin @cutensor C[1, 2, 3, 4] := A[1, 2, 2, 4] end diff --git a/test/methods.jl b/test/methods.jl index c73dcbf3..37c6af38 100644 --- a/test/methods.jl +++ b/test/methods.jl @@ -1,20 +1,16 @@ -using TensorOperations -using Test -using TensorOperations: IndexError -using TensorOperations: 
BaseCopy, BaseView, StridedNative, StridedBLAS +backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) # test simple methods #--------------------- -@testset "simple methods" begin +@testset "simple methods with backend = $b" for b in backendlist @testset "tensorcopy" begin A = randn(Float64, (3, 5, 4, 6)) p = (3, 1, 4, 2) C1 = permutedims(A, p) - C2 = @inferred tensorcopy((p...,), A, (1:4...,)) - C3 = @inferred tensorcopy(A, (p, ()), false, 1, StridedNative()) - C4 = @inferred tensorcopy(A, (p, ()), false, 1, BaseView()) - C5 = @inferred tensorcopy(A, (p, ()), false, 1, BaseCopy()) - @test C1 ≈ C2 ≈ C3 ≈ C4 ≈ C5 + C2 = @inferred tensorcopy((p...,), A, (1:4...,); backend=b) + C3 = @inferred tensorcopy(A, (p, ()), false, 1, b) + @test C1 ≈ C2 + @test C2 == C3 @test C1 ≈ ncon(Any[A], Any[[-2, -4, -1, -3]]) @test_throws IndexError tensorcopy(1:4, A, 1:3) @test_throws IndexError tensorcopy(1:4, A, [1, 2, 2, 4]) @@ -24,75 +20,61 @@ using TensorOperations: BaseCopy, BaseView, StridedNative, StridedBLAS A = randn(Float64, (3, 5, 4, 6)) B = randn(Float64, (5, 6, 3, 4)) p = (3, 1, 4, 2) - C1 = @inferred tensoradd(A, p, B, (1:4...,)) - C2 = A + permutedims(B, p) + C1 = A + permutedims(B, p) + C2 = @inferred tensoradd(A, p, B, (1:4...,); backend=b) + C3 = @inferred tensoradd(A, ((1:4...,), ()), false, B, (p, ()), false, 1.0, 1.0, b) @test C1 ≈ C2 - @test C1 ≈ A + ncon(Any[B], Any[[-2, -4, -1, -3]]) - @test C1 ≈ - @inferred tensoradd(A, ((1:4...,), ()), false, B, (p, ()), false, 1.0, 1.0, - StridedNative()) - @test C1 ≈ - @inferred tensoradd(A, ((1:4...,), ()), false, B, (p, ()), false, 1.0, 1.0, - BaseView()) - @test C1 ≈ - @inferred tensoradd(A, ((1:4...,), ()), false, B, (p, ()), false, 1.0, 1.0, - BaseCopy()) - + @test C2 == C3 + @test C1 ≈ A + ncon(Any[B], Any[[-2, -4, -1, -3]]; backend=b) @test_throws DimensionMismatch tensoradd(A, 1:4, B, 1:4) end @testset "tensortrace" begin A = randn(Float64, (50, 100, 100)) - C1 = tensortrace(A, [:a, :b, :b]) - C2 = zeros(50) + C1 = zeros(50) for i in 1:50 for j in 1:100 - C2[i] += A[i, j, j] + C1[i] += A[i, j, j] end end + C2 = tensortrace(A, [:a, :b, :b]; backend=b) + C3 = ncon(Any[A], Any[[-1, 1, 1]]; backend=b) @test C1 ≈ C2 - @test C1 ≈ ncon(Any[A], Any[[-1, 1, 1]]) + @test C2 == C3 A = randn(Float64, (3, 20, 5, 3, 20, 4, 5)) - C1 = @inferred tensortrace((:e, :a, :d), A, (:a, :b, :c, :d, :b, :e, :c)) - C2 = zeros(4, 3, 3) + C1 = zeros(4, 3, 3) for i1 in 1:4, i2 in 1:3, i3 in 1:3 for j1 in 1:20, j2 in 1:5 - C2[i1, i2, i3] += A[i2, j1, j2, i3, j1, i1, j2] + C1[i1, i2, i3] += A[i2, j1, j2, i3, j1, i1, j2] end end + C2 = @inferred tensortrace((:e, :a, :d), A, (:a, :b, :c, :d, :b, :e, :c); backend=b) + C3 = @inferred tensortrace(A, ((6, 1, 4), ()), ((2, 3), (5, 7)), false, 1.0, b) + C4 = ncon(Any[A], Any[[-2, 1, 2, -3, 1, -1, 2]]; backend=b) @test C1 ≈ C2 - C1 ≈ @inferred tensortrace(A, ((6, 1, 4), ()), ((2, 3), (5, 7)), false, 1.0, - StridedNative()) - C1 ≈ @inferred tensortrace(A, ((6, 1, 4), ()), ((2, 3), (5, 7)), false, 1.0, - BaseView()) - C1 ≈ @inferred tensortrace(A, ((6, 1, 4), ()), ((2, 3), (5, 7)), false, 1.0, - BaseCopy()) - @test C1 ≈ ncon(Any[A], Any[[-2, 1, 2, -3, 1, -1, 2]]) + @test C2 == C3 == C4 end @testset "tensorcontract" begin A = randn(Float64, (3, 20, 5, 3, 4)) B = randn(Float64, (5, 6, 20, 3)) - C1 = @inferred tensorcontract((:a, :g, :e, :d, :f), - A, (:a, :b, :c, :d, :e), B, (:c, :f, :b, :g)) - C2 = zeros(3, 3, 4, 3, 6) + C1 = zeros(3, 3, 4, 3, 6) for a in 1:3, b in 1:20, c in 1:5, d in 1:3, e in 1:4, f in 1:6, g in 
1:3 - C2[a, g, e, d, f] += A[a, b, c, d, e] * B[c, f, b, g] + C1[a, g, e, d, f] += A[a, b, c, d, e] * B[c, f, b, g] end + C2 = @inferred tensorcontract((:a, :g, :e, :d, :f), + A, (:a, :b, :c, :d, :e), B, (:c, :f, :b, :g); + backend=b) + C3 = @inferred tensorcontract(A, ((1, 4, 5), (2, 3)), false, B, ((3, 1), (2, 4)), + false, ((1, 5, 3, 2, 4), ()), 1.0, b) + C4 = @inferred tensorcontract(A, ((1, 4, 5), (2, 3)), false, B, ((3, 1), (2, 4)), + false, ((1, 5, 3, 2, 4), ()), 1.0, b, + ManualAllocator()) + C5 = ncon(Any[A, B], Any[[-1, 1, 2, -4, -3], [2, -5, 1, -2]]; backend=b, + allocator=ManualAllocator()) + @test C1 ≈ C2 - @test C1 ≈ - @inferred tensorcontract(A, ((1, 4, 5), (2, 3)), false, B, ((3, 1), (2, 4)), - false, ((1, 5, 3, 2, 4), ()), 1.0, StridedNative()) - @test C1 ≈ - @inferred tensorcontract(A, ((1, 4, 5), (2, 3)), false, B, ((3, 1), (2, 4)), - false, ((1, 5, 3, 2, 4), ()), 1.0, StridedBLAS()) - @test C1 ≈ - @inferred tensorcontract(A, ((1, 4, 5), (2, 3)), false, B, ((3, 1), (2, 4)), - false, ((1, 5, 3, 2, 4), ()), 1.0, BaseView()) - @test C1 ≈ - @inferred tensorcontract(A, ((1, 4, 5), (2, 3)), false, B, ((3, 1), (2, 4)), - false, ((1, 5, 3, 2, 4), ()), 1.0, BaseCopy()) - @test C1 ≈ ncon(Any[A, B], Any[[-1, 1, 2, -4, -3], [2, -5, 1, -2]]) + @test C2 == C3 == C4 == C5 @test_throws IndexError tensorcontract(A, [:a, :b, :c, :d], B, [:c, :f, :b, :g]) @test_throws IndexError tensorcontract(A, [:a, :b, :c, :a, :e], B, [:c, :f, :b, :g]) end @@ -100,10 +82,10 @@ using TensorOperations: BaseCopy, BaseView, StridedNative, StridedBLAS @testset "tensorproduct" begin A = randn(Float64, (5, 5, 5, 5)) B = rand(ComplexF64, (5, 5, 5, 5)) - C1 = reshape((@inferred tensorproduct((1, 2, 5, 6, 3, 4, 7, 8), - A, (1, 2, 3, 4), B, (5, 6, 7, 8))), + C1 = kron(reshape(B, (25, 25)), reshape(A, (25, 25))) + C2 = reshape((@inferred tensorproduct((1, 2, 5, 6, 3, 4, 7, 8), + A, (1, 2, 3, 4), B, (5, 6, 7, 8); backend=b)), (5 * 5 * 5 * 5, 5 * 5 * 5 * 5)) - C2 = kron(reshape(B, (25, 25)), reshape(A, (25, 25))) @test C1 ≈ C2 @test_throws IndexError tensorproduct(A, [:a, :b, :c, :d], B, [:d, :e, :f, :g]) @@ -112,18 +94,18 @@ using TensorOperations: BaseCopy, BaseView, StridedNative, StridedBLAS A = rand(1, 2) B = rand(4, 5) - C1 = tensorcontract((-1, -2, -3, -4), A, (-3, -1), false, B, (-2, -4), false) - C2 = zeros(2, 4, 1, 5) - for i in axes(C2, 1), j in axes(C2, 2), k in axes(C2, 3), l in axes(C2, 4) - C2[i, j, k, l] = A[k, i] * B[j, l] + C1 = zeros(2, 4, 1, 5) + for i in axes(C1, 1), j in axes(C1, 2), k in axes(C1, 3), l in axes(C1, 4) + C1[i, j, k, l] = A[k, i] * B[j, l] end + C2 = tensorcontract((-1, -2, -3, -4), A, (-3, -1), false, B, (-2, -4), false; + backend=b) + C3 = tensorproduct(A, ((1, 2), ()), false, B, ((), (1, 2)), false, + ((2, 3, 1, 4), ()), 1.0, b) + C4 = tensorproduct(A, ((1, 2), ()), false, B, ((), (1, 2)), false, + ((2, 3, 1, 4), ()), 1.0, b, ManualAllocator()) @test C1 ≈ C2 - @test C1 ≈ @inferred tensorproduct(A, ((1, 2), ()), false, B, ((), (1, 2)), false, - ((2, 3, 1, 4), ()), 1.0, StridedNative()) - @test C1 ≈ @inferred tensorproduct(A, ((1, 2), ()), false, B, ((), (1, 2)), false, - ((2, 3, 1, 4), ()), 1.0, BaseView()) - @test C1 ≈ @inferred tensorproduct(A, ((1, 2), ()), false, B, ((), (1, 2)), false, - ((2, 3, 1, 4), ()), 1.0, BaseCopy()) + @test C2 == C3 == C4 end end @@ -131,7 +113,6 @@ end #----------------------- # test different versions of in-place methods, # with changing element type and with nontrivial strides -backendlist = (StridedNative(), StridedBLAS(), BaseView(), 
BaseCopy()) @testset "in-place methods with backend $b" for b in backendlist @testset "tensorcopy!" begin Abig = randn(Float64, (30, 30, 30, 30)) @@ -194,7 +175,9 @@ backendlist = (StridedNative(), StridedBLAS(), BaseView(), BaseCopy()) α, β, b) end - @testset "tensorcontract!" begin + @testset "tensorcontract! with allocator = $allocator" for allocator in + (DefaultAllocator(), + ManualAllocator()) Abig = rand(Float64, (30, 30, 30, 30)) A = view(Abig, 1 .+ 3 * (0:8), 2 .+ 2 * (0:14), 5 .+ 4 * (0:6), 7 .+ 2 * (0:8)) Bbig = rand(ComplexF64, (50, 50, 50)) @@ -213,7 +196,7 @@ backendlist = (StridedNative(), StridedBLAS(), BaseView(), BaseCopy()) end end tensorcontract!(C, A, ((4, 1), (2, 3)), false, B, ((3, 1), (2,)), true, - ((1, 2, 3), ()), α, β, b) + ((1, 2, 3), ()), α, β, b, allocator) @test C ≈ Ccopy @test_throws IndexError tensorcontract!(C, A, ((4, 1), (2, 4)), false, diff --git a/test/runtests.jl b/test/runtests.jl index fe07f4f9..5b068140 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,9 +2,12 @@ using TensorOperations using LinearAlgebra using Test using Random -using Aqua - Random.seed!(1234567) + +using TensorOperations: IndexError +using TensorOperations: BaseCopy, BaseView, StridedNative, StridedBLAS +using TensorOperations: DefaultAllocator, ManualAllocator + @testset "tensoropt" verbose = true begin include("tensoropt.jl") end @@ -26,10 +29,16 @@ end # note: cuTENSOR should not be loaded before this point # as there is a test which requires it to be loaded after -@testset "cuTENSOR" verbose = true begin +@testset "cuTENSOR extension" verbose = true begin include("cutensor.jl") end +# note: Bumper should not be loaded before this point +# as there is a test which requires it to be loaded after +@testset "Bumper extension" verbose = true begin + include("butensor.jl") +end + @testset "Polynomials" begin include("polynomials.jl") end diff --git a/test/tensor.jl b/test/tensor.jl index 51af976f..1f497574 100644 --- a/test/tensor.jl +++ b/test/tensor.jl @@ -1,6 +1,5 @@ # test index notation using @tensor macro #----------------------------------------- -using TensorOperations: BaseCopy, BaseView, StridedNative, StridedBLAS backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) @testset "with backend $b" for b in backendlist @@ -97,16 +96,16 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) C = view(Cbig, 13 .+ (0:6), 11 .+ 4 .* (0:9), 15 .+ 4 .* (0:8), 4 .+ 3 .* (0:6)) Acopy = copy(A) Ccopy = copy(C) - @tensor C[3, 1, 4, 2] = A[1, 2, 3, 4] - @tensor Ccopy[3, 1, 4, 2] = Acopy[1, 2, 3, 4] + @tensor backend = b C[3, 1, 4, 2] = A[1, 2, 3, 4] + @tensor backend = b Ccopy[3, 1, 4, 2] = Acopy[1, 2, 3, 4] @test C ≈ Ccopy - @test_throws TensorOperations.IndexError begin + @test_throws IndexError begin @tensor C[3, 1, 4, 2] = A[1, 2, 3] end @test_throws DimensionMismatch begin @tensor C[3, 1, 4, 2] = A[3, 1, 4, 2] end - @test_throws TensorOperations.IndexError begin + @test_throws IndexError begin @tensor C[1, 1, 2, 3] = A[1, 2, 3, 4] end end @@ -121,7 +120,7 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) Ccopy = copy(C) α = randn(Float64) β = randn(Float64) - @tensor C[3, 1, 4, 2] = β * C[3, 1, 4, 2] + α * A[1, 2, 3, 4] + @tensor backend = b C[3, 1, 4, 2] = β * C[3, 1, 4, 2] + α * A[1, 2, 3, 4] Ccopy = β * Ccopy + α * Acopy @test C ≈ Ccopy @test_throws IndexError begin @@ -143,7 +142,7 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) Acopy = copy(A) Bcopy = copy(B) α = randn(Float64) - 
@tensor B[b, c] += α * A[a, b, c, a] + @tensor backend = b B[b, c] += α * A[a, b, c, a] for i in 1 .+ (0:8) Bcopy += α * view(A, i, :, :, i) end @@ -178,7 +177,7 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) Ccopy[d, a, e] -= α * A[a, b, c, d] * conj(B[c, e, b]) end end - @tensor C[d, a, e] -= α * A[a, b, c, d] * conj(B[c, e, b]) + @tensor backend = b C[d, a, e] -= α * A[a, b, c, d] * conj(B[c, e, b]) @test C ≈ Ccopy end @@ -196,7 +195,7 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) Ccopy[d, a, e] += α * A[a, b, c, d] * conj(B[c, e, b]) end end - @tensor C[d, a, e] += α * A[a, b, c, d] * conj(B[c, e, b]) + @tensor backend = b C[d, a, e] += α * A[a, b, c, d] * conj(B[c, e, b]) @test C ≈ Ccopy @test_throws IndexError begin @tensor C[d, a, e] += α * A[a, b, c, a] * B[c, e, b] @@ -238,9 +237,12 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) end # Some tensor network examples - @testset "tensor network examples ($T)" for T in - (Float32, Float64, ComplexF32, ComplexF64, - BigFloat) + if !(b isa StridedBLAS) + scalartypelist = (Float32, Float64, ComplexF32, ComplexF64, BigFloat) + else + scalartypelist = (Float32, Float64, ComplexF32, ComplexF64) + end + @testset "tensor network examples ($T)" for T in scalartypelist D1, D2, D3 = 30, 40, 20 d1, d2 = 2, 3 A1 = rand(T, D1, d1, D2) .- 1 // 2 @@ -256,11 +258,13 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) (d1 * d2, D1 * D3)), (d1, d2, D1, D3)), (3, 1, 2, 4)) E = dot(A12, HrA12) - @tensor HrA12′[a, s1, s2, c] := rhoL[a, a'] * A1[a', t1, b] * A2[b, t2, c'] * - rhoR[c', c] * H[s1, s2, t1, t2] - @tensor HrA12′′[:] := rhoL[-1, 1] * H[-2, -3, 4, 5] * A2[2, 5, 3] * rhoR[3, -4] * - A1[1, 4, 2] # should be contracted in exactly same order - @tensor order = (a', b, c', t1, t2) begin + @tensor backend = b HrA12′[a, s1, s2, c] := rhoL[a, a'] * A1[a', t1, b] * + A2[b, t2, c'] * + rhoR[c', c] * H[s1, s2, t1, t2] + @tensor backend = b HrA12′′[:] := rhoL[-1, 1] * H[-2, -3, 4, 5] * A2[2, 5, 3] * + rhoR[3, -4] * + A1[1, 4, 2] # should be contracted in exactly same order + @tensor backend = b order = (a', b, c', t1, t2) begin HrA12′′′[a, s1, s2, c] := rhoL[a, a'] * H[s1, s2, t1, t2] * A2[b, t2, c'] * rhoR[c', c] * A1[a', t1, b] # should be contracted in exactly same order end @@ -270,58 +274,135 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) @test HrA12′ == HrA12′′ == HrA12′′′ # should be exactly equal @test HrA12 ≈ HrA12′ @test HrA12 ≈ HrA12′′′′ - @test HrA12′′ == ncon([rhoL, H, A2, rhoR, A1], - [[-1, 1], [-2, -3, 4, 5], [2, 5, 3], [3, -4], [1, 4, 2]]) + @test HrA12′′ ≈ ncon([rhoL, H, A2, rhoR, A1], + [[-1, 1], [-2, -3, 4, 5], [2, 5, 3], [3, -4], [1, 4, 2]]) @test HrA12′′ == @ncon([rhoL, H, A2, rhoR, A1], [[-1, 1], [-2, -3, 4, 5], [2, 5, 3], [3, -4], [1, 4, 2]]; - order=[1, 2, 3, 4, 5], output=[-1, -2, -3, -4]) + order=[1, 2, 3, 4, 5], output=[-1, -2, -3, -4], backend=b) @test E ≈ @tensor tensorscalar(rhoL[a', a] * A1[a, s, b] * A2[b, s', c] * rhoR[c, c'] * H[t, t', s, s'] * conj(A1[a', t, b']) * conj(A2[b', t', c'])) end +end +@testset "tensoropt" begin + A = randn(5, 5, 5, 5) + B = randn(5, 5, 5) + C = randn(5, 5, 5) + @tensoropt (a => χ, b => χ^2, c => 2 * χ, d => χ, e => 5, f => 2 * χ) begin + D1[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] + end + @tensoropt (a=χ, b=χ^2, c=2 * χ, d=χ, e=5, f=2 * χ) begin + D2[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] + end + @tensoropt ((a, d) => χ, b => χ^2, (c, f) => 
2 * χ, e => 5) begin + D3[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] + end + @tensoropt ((a, d)=χ, b=χ^2, (c, f)=2 * χ, e=5) begin + D4[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] + end + @test D1 == D2 == D3 == D4 + _optdata = optex -> TensorOperations.optdata(optex, + :(D1[a, b, c, d] := A[a, e, c, f] * + B[g, d, e] * + C[g, f, b])) + optex1 = :((a => χ, b => χ^2, c => 2 * χ, d => χ, e => 5, f => 2 * χ)) + optex2 = :((a=χ, b=χ^2, c=2 * χ, d=χ, e=5, f=2 * χ)) + optex3 = :(((a, d) => χ, b => χ^2, (c, f) => 2 * χ, e => 5)) + optex4 = :(((a, d)=χ, b=χ^2, (c, f)=2 * χ, e=5)) + optex5 = :(((a,) => χ, b => χ^2, (c,) => 2χ, d => χ, e => 5, f => χ * 2, + () => 12345)) + @test _optdata(optex1) == _optdata(optex2) == _optdata(optex3) == + _optdata(optex4) == _optdata(optex5) + optex6 = :(((a, b, c)=χ,)) + optex7 = :((a, b, c)) + @test _optdata(optex6) == _optdata(optex7) + optex8 = :(((a, b, c)=1, (d, e, f, g)=χ)) + optex9 = :(!(a, b, c)) + @test _optdata(optex8) == _optdata(optex9) + optex10 = :((a => χ, b => χ^2, c=2 * χ, d => χ, e => 5, f=2 * χ)) + optex11 = :((a=χ, b=χ^2, c=2 * χ, d, e=5, f)) + @test_throws ErrorException _optdata(optex10) + @test_throws ErrorException _optdata(optex11) +end - @testset "tensoropt" begin - A = randn(5, 5, 5, 5) - B = randn(5, 5, 5) - C = randn(5, 5, 5) - @tensoropt (a => χ, b => χ^2, c => 2 * χ, d => χ, e => 5, f => 2 * χ) begin - D1[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] - end - @tensoropt (a=χ, b=χ^2, c=2 * χ, d=χ, e=5, f=2 * χ) begin - D2[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] - end - @tensoropt ((a, d) => χ, b => χ^2, (c, f) => 2 * χ, e => 5) begin - D3[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] - end - @tensoropt ((a, d)=χ, b=χ^2, (c, f)=2 * χ, e=5) begin - D4[a, b, c, d] := A[a, e, c, f] * B[g, d, e] * C[g, f, b] - end - @test D1 == D2 == D3 == D4 - _optdata = optex -> TensorOperations.optdata(optex, - :(D1[a, b, c, d] := A[a, e, c, f] * - B[g, d, e] * - C[g, f, b])) - optex1 = :((a => χ, b => χ^2, c => 2 * χ, d => χ, e => 5, f => 2 * χ)) - optex2 = :((a=χ, b=χ^2, c=2 * χ, d=χ, e=5, f=2 * χ)) - optex3 = :(((a, d) => χ, b => χ^2, (c, f) => 2 * χ, e => 5)) - optex4 = :(((a, d)=χ, b=χ^2, (c, f)=2 * χ, e=5)) - optex5 = :(((a,) => χ, b => χ^2, (c,) => 2χ, d => χ, e => 5, f => χ * 2, - () => 12345)) - @test _optdata(optex1) == _optdata(optex2) == _optdata(optex3) == - _optdata(optex4) == _optdata(optex5) - optex6 = :(((a, b, c)=χ,)) - optex7 = :((a, b, c)) - @test _optdata(optex6) == _optdata(optex7) - optex8 = :(((a, b, c)=1, (d, e, f, g)=χ)) - optex9 = :(!(a, b, c)) - @test _optdata(optex8) == _optdata(optex9) - optex10 = :((a => χ, b => χ^2, c=2 * χ, d => χ, e => 5, f=2 * χ)) - optex11 = :((a=χ, b=χ^2, c=2 * χ, d, e=5, f)) - @test_throws ErrorException _optdata(optex10) - @test_throws ErrorException _optdata(optex11) +# diagonal matrices +@testset "Diagonal($T)" for T in (Float32, Float64, ComplexF32, ComplexF64) + A = randn(T, 10, 10, 10, 10) + @tensor C[3, 1, 4, 2] := A[1, 2, 3, 4] + U, S, V = svd(reshape(C, (100, 100))) + U2 = reshape(view(U, 1:2:100, :), (5, 10, 100)) + S2 = Diagonal(S) + V2 = reshape(view(V, 1:2:100, :), (5, 10, 100)) + @tensor A2[1, 2, 3, 4] := U2[3, 1, a] * S2[a, a'] * conj(V2[4, 2, a']) + @test A2 ≈ view(A, :, :, 1:2:10, 1:2:10) + + S = Diagonal(randn(T, 5)) + @tensor S2[i, k] := S[i, j] * S[k, j] + S3 = similar(S) + @tensor S3[i, k] = S[i, j] * S[j, k] + @test S2 ≈ S3 ≈ S * S + Str = @tensor S[i, j] * S[i, j] + @test Str ≈ sum(S.diag .^ 2) + + @tensor SS[i, j, k, l] 
:= S[i, j] * S[k, l] + @tensor SS2[i, j, k, l] := Array(S)[i, j] * Array(S)[k, l] + @test SS ≈ SS2 + + B = randn(T, 10) + @tensor C1[3, 1, 4, 2] := A[a, 2, 3, 4] * Diagonal(B)[a, 1] + @tensor C2[3, 1, 4, 2] := A[1, a, 3, 4] * conj(Diagonal(B)[a, 2]) + @tensor C3[3, 1, 4, 2] := conj(A[1, 2, a, 4]) * Diagonal(B)[a, 3] + @tensor C4[3, 1, 4, 2] := conj(A[1, 2, 3, a]) * conj(Diagonal(B)[a, 4]) + @tensor C1′[3, 1, 4, 2] := Diagonal(B)[a, 1] * A[a, 2, 3, 4] + @tensor C2′[3, 1, 4, 2] := conj(Diagonal(B)[a, 2]) * A[1, a, 3, 4] + @tensor C3′[3, 1, 4, 2] := Diagonal(B)[a, 3] * conj(A[1, 2, a, 4]) + @tensor C4′[3, 1, 4, 2] := conj(Diagonal(B)[a, 4]) * conj(A[1, 2, 3, a]) + @test C1 == C1′ + @test C2 == C2′ + @test C3 == C3′ + @test C4 == C4′ + for i in 1:10 + @test C1[:, i, :, :] ≈ permutedims(A[i, :, :, :], (2, 3, 1)) * B[i] + @test C2[:, :, :, i] ≈ permutedims(A[:, i, :, :], (2, 1, 3)) * conj(B[i]) + @test C3[i, :, :, :] ≈ conj(permutedims(A[:, :, i, :], (1, 3, 2))) * B[i] + @test C4[:, :, i, :] ≈ conj(permutedims(A[:, :, :, i], (3, 1, 2))) * conj(B[i]) end + @tensor D1[1, 2] := A[1, 2, a, b] * Diagonal(B)[a, b] + @tensor D2[1, 2] := A[1, b, 2, a] * conj(Diagonal(B)[a, b]) + @tensor D3[1, 2] := conj(A[a, 2, 1, b]) * Diagonal(B)[a, b] + @tensor D4[1, 2] := conj(A[a, 1, b, 2]) * conj(Diagonal(B)[a, b]) + @tensor D1′[1, 2] := Diagonal(B)[a, b] * A[1, 2, a, b] + @tensor D2′[1, 2] := conj(Diagonal(B)[a, b]) * A[1, b, 2, a] + @tensor D3′[1, 2] := Diagonal(B)[a, b] * conj(A[a, 2, 1, b]) + @tensor D4′[1, 2] := conj(Diagonal(B)[a, b]) * conj(A[a, 1, b, 2]) + @test D1 == D1′ + @test D2 == D2′ + @test D3 == D3′ + @test D4 == D4′ + + E1 = zero(D1) + E2 = zero(D2) + E3 = zero(D3) + E4 = zero(D4) + for i in 1:10 + E1[:, :] += A[:, :, i, i] * B[i] + E2[:, :] += A[:, i, :, i] * conj(B[i]) + E3[:, :] += A[i, :, :, i]' * B[i] + E4[:, :] += conj(A[i, :, i, :]) * conj(B[i]) + end + @test D1 ≈ E1 + @test D2 ≈ E2 + @test D3 ≈ E3 + @test D4 ≈ E4 + + F = randn(T, (10, 10)) + @tensor G[a, c, b, d] := F[a, b] * Diagonal(B)[c, d] + @test reshape(G, (100, 100)) ≈ kron(Diagonal(B), F) +end + +@testset "Specific issues" begin @testset "Issue 83" begin op1 = randn(2, 2) op2 = randn(2, 2) @@ -336,82 +417,6 @@ backendlist = (BaseCopy(), BaseView(), StridedNative(), StridedBLAS()) @test b != c end - # diagonal matrices - @testset "Diagonal($T)" for T in (Float32, Float64, ComplexF32, ComplexF64) - A = randn(T, 10, 10, 10, 10) - @tensor C[3, 1, 4, 2] := A[1, 2, 3, 4] - U, S, V = svd(reshape(C, (100, 100))) - U2 = reshape(view(U, 1:2:100, :), (5, 10, 100)) - S2 = Diagonal(S) - V2 = reshape(view(V, 1:2:100, :), (5, 10, 100)) - @tensor A2[1, 2, 3, 4] := U2[3, 1, a] * S2[a, a'] * conj(V2[4, 2, a']) - @test A2 ≈ view(A, :, :, 1:2:10, 1:2:10) - - S = Diagonal(randn(T, 5)) - @tensor S2[i, k] := S[i, j] * S[k, j] - S3 = similar(S) - @tensor S3[i, k] = S[i, j] * S[j, k] - @test S2 ≈ S3 ≈ S * S - Str = @tensor S[i, j] * S[i, j] - @test Str ≈ sum(S.diag .^ 2) - - @tensor SS[i, j, k, l] := S[i, j] * S[k, l] - @tensor SS2[i, j, k, l] := Array(S)[i, j] * Array(S)[k, l] - @test SS ≈ SS2 - - B = randn(T, 10) - @tensor C1[3, 1, 4, 2] := A[a, 2, 3, 4] * Diagonal(B)[a, 1] - @tensor C2[3, 1, 4, 2] := A[1, a, 3, 4] * conj(Diagonal(B)[a, 2]) - @tensor C3[3, 1, 4, 2] := conj(A[1, 2, a, 4]) * Diagonal(B)[a, 3] - @tensor C4[3, 1, 4, 2] := conj(A[1, 2, 3, a]) * conj(Diagonal(B)[a, 4]) - @tensor C1′[3, 1, 4, 2] := Diagonal(B)[a, 1] * A[a, 2, 3, 4] - @tensor C2′[3, 1, 4, 2] := conj(Diagonal(B)[a, 2]) * A[1, a, 3, 4] - @tensor C3′[3, 1, 4, 2] := Diagonal(B)[a, 
3] * conj(A[1, 2, a, 4]) - @tensor C4′[3, 1, 4, 2] := conj(Diagonal(B)[a, 4]) * conj(A[1, 2, 3, a]) - @test C1 == C1′ - @test C2 == C2′ - @test C3 == C3′ - @test C4 == C4′ - for i in 1:10 - @test C1[:, i, :, :] ≈ permutedims(A[i, :, :, :], (2, 3, 1)) * B[i] - @test C2[:, :, :, i] ≈ permutedims(A[:, i, :, :], (2, 1, 3)) * conj(B[i]) - @test C3[i, :, :, :] ≈ conj(permutedims(A[:, :, i, :], (1, 3, 2))) * B[i] - @test C4[:, :, i, :] ≈ conj(permutedims(A[:, :, :, i], (3, 1, 2))) * conj(B[i]) - end - - @tensor D1[1, 2] := A[1, 2, a, b] * Diagonal(B)[a, b] - @tensor D2[1, 2] := A[1, b, 2, a] * conj(Diagonal(B)[a, b]) - @tensor D3[1, 2] := conj(A[a, 2, 1, b]) * Diagonal(B)[a, b] - @tensor D4[1, 2] := conj(A[a, 1, b, 2]) * conj(Diagonal(B)[a, b]) - @tensor D1′[1, 2] := Diagonal(B)[a, b] * A[1, 2, a, b] - @tensor D2′[1, 2] := conj(Diagonal(B)[a, b]) * A[1, b, 2, a] - @tensor D3′[1, 2] := Diagonal(B)[a, b] * conj(A[a, 2, 1, b]) - @tensor D4′[1, 2] := conj(Diagonal(B)[a, b]) * conj(A[a, 1, b, 2]) - @test D1 == D1′ - @test D2 == D2′ - @test D3 == D3′ - @test D4 == D4′ - - E1 = zero(D1) - E2 = zero(D2) - E3 = zero(D3) - E4 = zero(D4) - for i in 1:10 - E1[:, :] += A[:, :, i, i] * B[i] - E2[:, :] += A[:, i, :, i] * conj(B[i]) - E3[:, :] += A[i, :, :, i]' * B[i] - E4[:, :] += conj(A[i, :, i, :]) * conj(B[i]) - end - @test D1 ≈ E1 - @test D2 ≈ E2 - @test D3 ≈ E3 - @test D4 ≈ E4 - - F = randn(T, (10, 10)) - @tensor G[a, c, b, d] := F[a, b] * Diagonal(B)[c, d] - @test reshape(G, (100, 100)) ≈ kron(Diagonal(B), F) - end - @testset "Issue 133" begin vec = [1, 2] mat1 = rand(2, 2)