From 2d761c2e08729d54dacacbbc3a653623be5dbbf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20=C5=9Amierzchalski?= <53766192+tomsmierz@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:46:21 +0200 Subject: [PATCH] Master to be -> master (#16) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * clean up types * clean up types * rm AbstracDitcs * rm Abstract from Matrix * rm AbstracDitcs and add zeros * clean up tensors more * clean up sparse * clean up virtual * clean up linear alg ext * simplify lin alg ext * comments * transpose * corewct types * simplify more * transpose * clean up #1 * clean up #2 * clean up #3 * clean up #4 * add canonise.jl * formatt code * clean up #5 * remove trans * add new func * clean up * start cleaning gauges * clean up * add comments * output env variational * MpoTensor * structs * MpoTensor * fix issues with types * fix nothing type * length for MpoTensor to be verified * clean up * fix subtyping in tensors * clean up types * add N explicitte in some places * rm lenght * MpoTensor * transpose * clea up * ideas * mv ideas to attic * make MpoTensor external const * renaming * add tsvd example * zipper * improve bench * clean up exports * add comments * add basic tests for QMps and Canonise * clean up stuff * measure_memory * add measure_memory * add format_bytes * zipper * sparse zipper * add eltype for mps / mps * fix some issues * zipper psvd * zipper psvd * zipper sparse svd * tsvd * change svd_corner_matrix * correct Adjoint for CornerTenso * to cuda * device * device * edvice * diag * fix Diagona * fix device * fix array 2 * fix diag 3 * works * add comments * toward cuda * fix kwargs * cuda contractons * cuda central * cleanup cuda * patch array centraltensor * patching dense central tensor * vector memory * start cleaning * add GPU flags, clean up * clean up * zipper QMps on GPU * start working on qr * qr on GPU works * add basic tests for QMps * more basic tests * aux * change site * clean up * clean up CPU <-> GPU transfer * gauges on gpu * change types in gauges * reordering * clean up * fix bug * clean up * fix * rm unnecassry permusts * dense site * clean up * clean up * zipper central * virtual * test * central * cuproj * aux_saprse * add @view in site * clean up * add bench for multi gpu * add examples * add commenst * add @inbounds in some places * towards cpu contractions * to cpu * benchmarks for memoizations of cusparse * added comment explaining convention * SparseCSC * move stuff to attic * clean up a bit * clean types a bit * added measure_memory for memoization caches * added handling of sparse matrices * hotfix in sparse matix CSR * clean up * clean up 2 * fix types * sitetensor * SiteTensor Sparse * VirtualTensor * poolofprojectors * memory * copied from "leg-ordering" branch * start working on SCR problem * project_ket_on_bra virtual * add SparseArrays to test cuSparse * virtual update_env_left * add CUDA.:* (commented) * virtual update_env_right * cirtual update_reduced_env_right * add bench for CSR * add bench * site * info * reduced_env_right sitetensor * measure_spectrum * input mps is in the correct canonical form * test measure_spectrum * virtual * test measure spectrum * move_to_CPU * schmidt * view virtual * rand qmpo * toward allocation * allocate site * site working * speed up virtual (update_env_right) * virtual alloc * virtual no cusparse * virtual no cusparse * proj * site * test * working site * cleanup * new zipper * new zipper * stable zipper * building blocks of new 
zipper-variational * fix type * fix typo * start new zipper * new env * new zipper * new_zipper clear env * kill attic * fix new zipper * args in zipper * clean up * repeat psvd * sparse * corner matrix for virtual * virtualtest * restart virtual * env_levt_v1 * new virtual * my_batched_mul * virtual with 2 step projectors * WIP * cases without central tensor * fix update_reduced_env_right virtual * central batched_mul * measure_memory(EnvironmentMixed) * change fg to cl_h * PoolOfProjectors in clustered_hamiltonian * clean up tests * split long lines * add docs * docs * clean up toml * clean up * add the docs * add flag for RMF in zipper * add docs * add depth of sweep in zipper * Rename aux.jl to utils.jl, becouse aux.* is restricted filename in windows * Update SpinGlassTensors.jl, changed aux to utils * moved projectors.jl from SpinGlassNetworks * remove networks from Project * moved test * add projectors to tests * update julia * added projectors.jl to runtest * moved rank_reveal from SpinGlassNetworks * bugfix * update to CUDA 4.4.1, TensorOperations 4 and TensorCast 0.4. SEE README * reformat * fix ci * onlcy set self-hosted * fix nothing comparsion (#18) * Fixes creation of sparse matrices (#19) * change CUDA sparse to CSR, rename function createing sparse matrices * up * update ci runner * fix runs-on ci * fix ci * set proper rev * add flags * fix project in TransmuteDims * fix typo --------- Co-authored-by: bartekGardas Co-authored-by: marekrams Co-authored-by: annamariadziubyna Co-authored-by: annamariadziubyna <73058800+annamariadziubyna@users.noreply.github.com> Co-authored-by: Łukasz Pawela Co-authored-by: Łukasz Pawela <3093117+lpawela@users.noreply.github.com> --- .github/workflows/CI.yml | 16 +- Project.toml | 31 ++- README.md | 1 + bench_mm.jl | 29 ++ benchmark/args.jl | 12 + benchmark/cuda_matrix_mul.jl | 29 ++ benchmark/gpu_slicing.jl | 15 ++ benchmark/memoization_cusparse.jl | 46 ++++ benchmark/memoization_test.jl | 38 +++ benchmark/mulit_gpu.jl | 25 ++ benchmark/mulit_gpu2.jl | 10 + benchmark/psvd.jl | 86 ++++++ benchmark/qr.jl | 18 ++ benchmark/sparse_mul.jl | 24 ++ benchmark/sparse_mul_bench.jl | 31 +++ benchmark/svd.jl | 73 +++++ benchmark/svd2.jl | 102 +++++++ docs/make.jl | 16 +- docs/src/api.md | 27 +- docs/src/index.md | 4 +- docs/src/mpo.md | 5 + src/SpinGlassTensors.jl | 42 ++- src/base.jl | 318 ++++++++-------------- src/compressions.jl | 240 ----------------- src/contractions.jl | 151 ----------- src/contractions/central.jl | 111 ++++++++ src/contractions/dense.jl | 204 ++++++++++++++ src/contractions/diagonal.jl | 21 ++ src/contractions/site.jl | 195 ++++++++++++++ src/contractions/sparse.jl | 70 +++++ src/contractions/virtual.jl | 434 ++++++++++++++++++++++++++++++ src/environment.jl | 293 ++++++++++++++++++++ src/gauges.jl | 155 +++++++++++ src/identities.jl | 45 ---- src/linear_algebra_ext.jl | 85 +++--- src/mps/base.jl | 110 ++++++++ src/mps/canonise.jl | 98 +++++++ src/mps/dot.jl | 53 ++++ src/mps/identity.jl | 34 +++ src/mps/rand.jl | 45 ++++ src/mps/transpose.jl | 21 ++ src/mps/utils.jl | 67 +++++ src/projectors.jl | 140 ++++++++++ src/transfer.jl | 84 ++++++ src/utils/memory.jl | 47 ++++ src/utils/utils.jl | 108 ++++++++ src/variational.jl | 97 +++++++ src/zipper.jl | 278 +++++++++++++++++++ test/attic/canonise.jl | 25 ++ test/attic/compressions.jl | 35 +++ test/attic/contractions.jl | 61 +++++ test/attic/environment.jl | 10 + test/attic/linear_algebra_ext.jl | 46 ++++ test/{ => attic}/memoization.jl | 0 test/attic/mps.jl | 45 ++++ 
test/attic/runtests.jl | 20 ++ test/base.jl | 187 ------------- test/canonise.jl | 51 ++++ test/compressions.jl | 100 ------- test/contractions.jl | 111 -------- test/identities.jl | 80 ------ test/projectors.jl | 53 ++++ test/runtests.jl | 7 +- test/variational.jl | 52 ++++ 64 files changed, 3840 insertions(+), 1227 deletions(-) create mode 100755 bench_mm.jl create mode 100644 benchmark/args.jl create mode 100644 benchmark/cuda_matrix_mul.jl create mode 100644 benchmark/gpu_slicing.jl create mode 100644 benchmark/memoization_cusparse.jl create mode 100644 benchmark/memoization_test.jl create mode 100644 benchmark/mulit_gpu.jl create mode 100644 benchmark/mulit_gpu2.jl create mode 100644 benchmark/psvd.jl create mode 100644 benchmark/qr.jl create mode 100644 benchmark/sparse_mul.jl create mode 100644 benchmark/sparse_mul_bench.jl create mode 100644 benchmark/svd.jl create mode 100644 benchmark/svd2.jl create mode 100644 docs/src/mpo.md delete mode 100644 src/compressions.jl delete mode 100644 src/contractions.jl create mode 100644 src/contractions/central.jl create mode 100644 src/contractions/dense.jl create mode 100644 src/contractions/diagonal.jl create mode 100644 src/contractions/site.jl create mode 100644 src/contractions/sparse.jl create mode 100644 src/contractions/virtual.jl create mode 100644 src/environment.jl create mode 100644 src/gauges.jl delete mode 100644 src/identities.jl create mode 100644 src/mps/base.jl create mode 100644 src/mps/canonise.jl create mode 100644 src/mps/dot.jl create mode 100644 src/mps/identity.jl create mode 100644 src/mps/rand.jl create mode 100644 src/mps/transpose.jl create mode 100644 src/mps/utils.jl create mode 100644 src/projectors.jl create mode 100644 src/transfer.jl create mode 100644 src/utils/memory.jl create mode 100644 src/utils/utils.jl create mode 100644 src/variational.jl create mode 100644 src/zipper.jl create mode 100644 test/attic/canonise.jl create mode 100644 test/attic/compressions.jl create mode 100644 test/attic/contractions.jl create mode 100644 test/attic/environment.jl create mode 100644 test/attic/linear_algebra_ext.jl rename test/{ => attic}/memoization.jl (100%) create mode 100644 test/attic/mps.jl create mode 100644 test/attic/runtests.jl delete mode 100644 test/base.jl create mode 100644 test/canonise.jl delete mode 100644 test/compressions.jl delete mode 100644 test/contractions.jl delete mode 100644 test/identities.jl create mode 100644 test/projectors.jl create mode 100644 test/variational.jl diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 56ad45a..4c92a49 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -4,25 +4,21 @@ on: - pull_request jobs: test: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - runs-on: ${{ matrix.os }} + name: Julia ${{ matrix.version }} + runs-on: [self-hosted,titan,gpu] strategy: fail-fast: false matrix: version: - - '1.7' - - '1.8' - os: - - ubuntu-latest - - macOS-latest - arch: - - x64 + - '1.9' + - '1.10' steps: - uses: actions/checkout@v2 - uses: julia-actions/setup-julia@v1 with: version: ${{ matrix.version }} - arch: ${{ matrix.arch }} + - name: Fix TransmuteDims + run: julia --project=@. 
--color=yes -e 'using Pkg; Pkg.add(name="TransmuteDims", rev="strided2")' - uses: julia-actions/julia-buildpkg@latest - uses: julia-actions/julia-runtest@latest env: diff --git a/Project.toml b/Project.toml index 4a57f53..8ade337 100644 --- a/Project.toml +++ b/Project.toml @@ -1,25 +1,34 @@ name = "SpinGlassTensors" uuid = "7584fc6a-5a23-4eeb-8277-827aab0146ea" -authors = [ - "Łukasz Pawela ", - "Konrad Jałowiecki ", - "Bartłomiej Gardas " - ] -version = "0.3.0" +authors = ["Anna Maria Dziubyna ", "Tomasz Śmierzchalski ", "Bartłomiej Gardas ", "Konrad Jałowiecki ", "Łukasz Pawela ", "Marek M. Rams "] +version = "1.0.0" [deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" -Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826" +LowRankApprox = "898213cb-b102-5a47-900c-97e73b919f73" +MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2" +Memoization = "6fafb56a-5788-4b4e-91ca-c0cea6611c73" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +TSVD = "9449cd9e-2762-5aa3-a617-5413e99d722e" TensorCast = "02d47bb6-7ce6-556a-be16-bb1710789e2b" TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2" +TransmuteDims = "24ddb15e-299a-5cc3-8414-dbddc482d9ca" +cuTENSOR = "011b41b2-24ef-40a8-b3eb-fa098493e9e1" [compat] -DocStringExtensions = "0.8" -Memoize = "0.4" +CUDA = "4.4.1" +DocStringExtensions = "0.9.3" +LowRankApprox = "0.5.5" +MKL = "0.4.2" +Memoization = "0.2.1" +SparseArrays = "1.9" TensorCast = "0.4" -TensorOperations = "3.0.1" -julia = "1.7, 1.8" +TensorOperations = "4" +cuTENSOR = "1.1.0" +julia = "1.9, 1.10" [extras] Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" diff --git a/README.md b/README.md index 9f1d9a5..7623f30 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,3 @@ [![Coverage Status](https://coveralls.io/repos/github/iitis/SpinGlassTensors.jl/badge.svg?branch=master)](https://coveralls.io/github/iitis/SpinGlassTensors.jl?branch=master) # SpinGlassTensors.jl +This works with CUDA v4.4.1. You need to manually `]add TransmuteDims#strided2` \ No newline at end of file diff --git a/bench_mm.jl b/bench_mm.jl new file mode 100755 index 0000000..cf21ba5 --- /dev/null +++ b/bench_mm.jl @@ -0,0 +1,29 @@ +using TensorCast, TensorOperations +function time_mm() + M = rand(100, 100, 100) + L = rand(100, 100) + R = rand(100, 100) + @time begin + @matmul M1[x, σ, α] := sum(β) L[x, β] * M[β, σ, α] + @matmul MM[x, σ, y] := sum(α) M1[x, σ, α] * R[α, y] + end +end + +function time_tensor() + M = rand(100, 100, 100) + L = rand(100, 100) + R = rand(100, 100) + + @time begin + @tensor M̃[x, σ, y] := L[x, β] * M[β, σ, α] * R[α, y] order = (α, β) + # @cast B[(x, σ), y] |= M̃[x, σ, y] + end +end + +println("matmul") +time_mm() +time_mm() + +println("\n tensor") +time_tensor() +time_tensor() diff --git a/benchmark/args.jl b/benchmark/args.jl new file mode 100644 index 0000000..516cd3d --- /dev/null +++ b/benchmark/args.jl @@ -0,0 +1,12 @@ +using LinearAlgebra + +function my_svd(A; kwargs...) + svd(A; kwargs...) 
+end + + +T = Float64 +n = 2 +A = rand(T, 2, 2) + +my_svd(A, full = true) diff --git a/benchmark/cuda_matrix_mul.jl b/benchmark/cuda_matrix_mul.jl new file mode 100644 index 0000000..165cf5d --- /dev/null +++ b/benchmark/cuda_matrix_mul.jl @@ -0,0 +1,29 @@ +using CUDA +using LinearAlgebra + +CUDA.allowscalar(false) + +nnz = 100 +Val = CUDA.rand(Float64, nnz) +Ptr = CuArray(1:nnz+1) +Ind = CuArray(rand(1:100, nnz)) + +A = CUDA.CUSPARSE.CuSparseMatrixCSR(Ptr, Ind, Val, (100, 100)) +B = CUDA.rand(Float64, 100, 100) +C = CUDA.CUSPARSE.CuSparseMatrixCSC(Ptr, Ind, Val, (100, 100)) + +A * B # no scalar indexing +CUDA.@allowscalar B * A # scalar indexing + +C * B # no scalar indexing +CUDA.@allowscalar B * C # scalar indexing + +A' * B # no scalar indexing +CUDA.@allowscalar B * A' # scalar indexing + +transpose(A) * B # no scalar indexing +CUDA.@allowscalar B * transpose(A) # scalar indexing +# problem is when we multiply dense x sparse + +D = rand(Float64, (100, 100)) +CUDA.@allowscalar D * A # scalar indexing diff --git a/benchmark/gpu_slicing.jl b/benchmark/gpu_slicing.jl new file mode 100644 index 0000000..2e6f8c3 --- /dev/null +++ b/benchmark/gpu_slicing.jl @@ -0,0 +1,15 @@ +using CUDA + +T = Float64 +n = 10000 +k = 500 + +a = CUDA.rand(T, n, n) +p = reverse(collect(1:k)) +p_d = CuArray(p) + +@time A = a[:, p] +@time @inbounds A = a[:, p] +@time A = a[:, p_d] +@time @inbounds A = a[:, p_d] +nothing diff --git a/benchmark/memoization_cusparse.jl b/benchmark/memoization_cusparse.jl new file mode 100644 index 0000000..01f51b3 --- /dev/null +++ b/benchmark/memoization_cusparse.jl @@ -0,0 +1,46 @@ +using Memoization +using LinearAlgebra +using CUDA +using BenchmarkTools + +# Functions from constactions_cuda/sparse.jl which are not exported + +@memoize Dict function aux_cusparse(::Type{R}, n::Int64) where {R<:Real} + println("entering aux_cusparse function") + CuArray(1:n+1), CUDA.ones(R, n) +end + +@memoize Dict function CUDA.CUSPARSE.CuSparseMatrixCSC( + ::Type{R}, + p::Vector{Int}, +) where {R<:Real} + println("entering cusparse") + n = length(p) + cn, co = aux_cusparse(R, n) + CUDA.CUSPARSE.CuSparseMatrixCSC(cn, CuArray(p), co, (maximum(p), n)) +end + + +function CuSparseMatrixCSC_no_memo(::Type{R}, p::Vector{Int}) where {R<:Real} + println("entering no memo") + n = length(p) + cn, co = aux_cusparse(R, n) + CUDA.CUSPARSE.CuSparseMatrixCSC(cn, CuArray(p), co, (maximum(p), n)) +end + +# test of their memoization + +p = sort(rand(1:5000, 10000000)) +p2 = sort(rand(1:5000, 10000000)) +@time A = CuSparseMatrixCSC_no_memo(Float64, p) +@time B = CuSparseMatrixCSC_no_memo(Float64, p) + +@time C = CUDA.CUSPARSE.CuSparseMatrixCSC(Float64, p) # compilation time? 
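+# The call above pays compilation and construction cost. Below, D reuses the same `p`,
+# so the `@memoize Dict` lookup returns the cached CuSparseMatrixCSC almost instantly;
+# E sees the new vector `p2` and builds (and caches) a fresh matrix, and F hits that cache.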
+ +@time D = CUDA.CUSPARSE.CuSparseMatrixCSC(Float64, p) +@time E = CUDA.CUSPARSE.CuSparseMatrixCSC(Float64, p2) +@time F = CUDA.CUSPARSE.CuSparseMatrixCSC(Float64, p2) +CUDA.memory_status() +Memoization.empty_all_caches!() +CUDA.memory_status() +# clearing memoization caches doeas not free GPU memory diff --git a/benchmark/memoization_test.jl b/benchmark/memoization_test.jl new file mode 100644 index 0000000..750d6fb --- /dev/null +++ b/benchmark/memoization_test.jl @@ -0,0 +1,38 @@ +using SpinGlassTensors +using Memoization +using CUDA + + +@memoize Dict function example_cuda_array(::Type{R}, size::Int64) where {R<:Real} + CUDA.rand(R, (size, size)) +end + + +@memoize Dict function example_array(::Type{R}, size::Int64) where {R<:Real} + rand(R, size, size) +end + + +@memoize Dict function aux_cusparse(::Type{R}, n::Int64) where {R<:Real} + CuArray(1:n+1), CUDA.ones(R, n) +end + + +@memoize Dict function CUDA.CUSPARSE.CuSparseMatrixCSC( + ::Type{R}, + p::Vector{Int}, +) where {R<:Real} + n = length(p) + cn, co = aux_cusparse(R, n) + CUDA.CUSPARSE.CuSparseMatrixCSC(cn, CuArray(p), co, (maximum(p), n)) +end + + +A = example_cuda_array(Float64, 10000) +B = example_cuda_array(Float64, 1100) +C = example_array(Float64, 1000) +p = rand(1:5000, 100000000) +D = CUDA.CUSPARSE.CuSparseMatrixCSC(Float64, p) +CUDA.memory_status() +println("/n") +measure_memory(Memoization.caches) diff --git a/benchmark/mulit_gpu.jl b/benchmark/mulit_gpu.jl new file mode 100644 index 0000000..16748c7 --- /dev/null +++ b/benchmark/mulit_gpu.jl @@ -0,0 +1,25 @@ +using CUDA + +function move_to_CUDA(a::Array{T,N}) where {T,N} + buf_a = Mem.alloc(Mem.Unified, sizeof(a)) + d_a = unsafe_wrap(CuArray{T,N}, convert(CuPtr{T}, buf_a), size(a)) + finalizer(d_a) do _ + Mem.free(buf_a) + end + copyto!(d_a, a) + d_a +end + +T = Float64 +n = 100 +gpus = Int(length(devices())) + +a = rand(T, n, n, gpus) +a_d = move_to_CUDA(a) + +for (gpu, dev) ∈ enumerate(devices()) + device!(dev) + @views a_d[:, :, gpu] .= 2 .* a_d[:, :, gpu] +end + +a_d diff --git a/benchmark/mulit_gpu2.jl b/benchmark/mulit_gpu2.jl new file mode 100644 index 0000000..bbbcf8c --- /dev/null +++ b/benchmark/mulit_gpu2.jl @@ -0,0 +1,10 @@ +using CUDA + +T = Float64 +n = 100 +gpus = Int(length(devices())) + +a = rand(T, n, n, gpus) +a_d = cu(a, unified = true) + +a_d diff --git a/benchmark/psvd.jl b/benchmark/psvd.jl new file mode 100644 index 0000000..2f30108 --- /dev/null +++ b/benchmark/psvd.jl @@ -0,0 +1,86 @@ +using LinearAlgebra, MKL +using TensorOperations +using TensorCast +using TSVD +using LowRankApprox +using RandomizedLinAlg +using FameSVD + +N = 100 +cut = 8 + +mat = rand(100, 100); +U, S, V = svd(mat); +S = exp.(collect(0:N-1) * log(4 / 5)); + +mat = U * Diagonal(S) * V'; +U, S, V = svd(mat); + +U, S, V = U[:, 1:cut], S[1:cut], V[:, 1:cut] +mat1 = U * Diagonal(S) * V' +println(S[1:cut]) +println(norm(mat - mat1)) + +Up, Sp, Vp = psvd(mat, rank = 2 * cut) + +mat2 = Up[:, 1:cut] * Diagonal(Sp[1:cut]) * Vp[:, 1:cut]' + +println(Sp[1:cut]) +println(Sp[1:cut] - S[1:cut]) +println(norm(mat - mat2)) + +# Vp = V + +C = mat * Vp +println(size(C)) +Ut, _ = qr(C) +Ut = Ut[:, 1:cut] +println(size(Ut)) +C = Ut' * mat +Vp, _ = qr(C') +Vp = Vp[:, 1:cut] + + + +C = mat * Vp +Uf, Sf, Vf = svd(C); +Uf, Sf, Vf = Uf[:, 1:cut], Sf[1:cut], Vf[:, 1:cut] +mat3 = Uf * Diagonal(Sf) * Vf' * Vp' +println(Sf - S[1:cut]) +println(norm(mat - mat3)) + +nothing + + +iter = 5 +Up, Sp, Vp = [], [], [] +for i = 1:iter + Utemp, Stemp, Vtemp = psvd(mat, rank = 2 * cut) + push!(Up, Utemp) + 
push!(Sp, Stemp) + push!(Vp, Vtemp) +end + +Ups = hcat(Up...) +Vps = hcat(Vp...) +Sps = vcat(Sp...) / iter +println(size(Ups), " ", size(Vps), " ", size(Sps)) +println(size(Up[1]), " ", size(Vp[1]), " ", size(Sp[1])) + +Uq, Ur = qr(Ups) +Vq, Vr = qr(Vps) + +Ut, St, Vt = svd(Ur * Diagonal(Sps) * Vr') + +U2 = Uq * Ut[:, 1:cut] +V2 = Vq * Vt[:, 1:cut] +S2 = St[1:cut] +println(St) +println(S2) + +mat4 = U2 * Diagonal(S2) * V2' + + +println(norm(mat1 - mat2)) +println(norm(mat1 - mat3)) +println(norm(mat1 - mat4)) diff --git a/benchmark/qr.jl b/benchmark/qr.jl new file mode 100644 index 0000000..62b34c9 --- /dev/null +++ b/benchmark/qr.jl @@ -0,0 +1,18 @@ +using LinearAlgebra +using CUDA + + +T = Float64 +n, m = 10000, 10000 + +A = rand(T, n, m) +Ad = CuArray(A) + +@time q, r = qr(A); +@time qd, rd = qr(Ad); + +println(size(q), " ", size(r)) +println(size(qd), " ", size(rd)) + +@assert A ≈ q * r +@assert Ad ≈ qd * rd diff --git a/benchmark/sparse_mul.jl b/benchmark/sparse_mul.jl new file mode 100644 index 0000000..dec7c10 --- /dev/null +++ b/benchmark/sparse_mul.jl @@ -0,0 +1,24 @@ +using CUDA +using LinearAlgebra + +function CUDA.:*(Md::DenseCuMatrix{T}, Mcsc::CUSPARSE.CuSparseMatrixCSC{T}) where {T} + ret = CUDA.zeros(T, size(Mcsc, 1), size(Md, 2)) + CUSPARSE.mm!('N', 'N', one(T), Mcsc, Md, zero(T), ret, 'O') + ret +end + +T = Float64 +nnz = 100 +Val = CUDA.rand(T, nnz) +Ptr = CuArray(1:nnz+1) +Ind = CuArray(rand(1:nnz, nnz)) + +Mcsr = CUSPARSE.CuSparseMatrixCSR(Ptr, Ind, Val, (nnz, nnz)) +Md = CUDA.rand(T, nnz, nnz) +Mcsc = CUSPARSE.CuSparseMatrixCSC(Ptr, Ind, Val, (nnz, nnz)) + +X = Mcsr * Md +Y = Md * Mcsc + +@assert X ≈ CuArray(Mcsr) * Md +@assert Y ≈ Md * CuArray(Mcsc) diff --git a/benchmark/sparse_mul_bench.jl b/benchmark/sparse_mul_bench.jl new file mode 100644 index 0000000..79deb4b --- /dev/null +++ b/benchmark/sparse_mul_bench.jl @@ -0,0 +1,31 @@ +using CUDA +using LinearAlgebra +using SparseArrays + +function dense_x_CSC(Md::DenseCuMatrix{T}, Mcsc::CUSPARSE.CuSparseMatrixCSC{T}) where {T} + ret = CUDA.zeros(T, size(Md, 1), size(Mcsc, 2)) + CUSPARSE.mm!('N', 'N', one(T), Mcsc, Md, zero(T), ret, 'O') + ret +end + +T = Float64 +nnz = 2^14 +Val = CUDA.rand(T, nnz) +Ptr = CuArray(1:nnz+1) +Ind = CuArray(rand(1:nnz, nnz)) + +Mcsr = CUSPARSE.CuSparseMatrixCSR(Ptr, Ind, Val, (nnz, nnz)) +Md = CUDA.rand(T, nnz, nnz) +Mcsc = CUSPARSE.CuSparseMatrixCSC(Ptr, Ind, Val, (nnz, nnz)) + +@time CUDA.@sync X = Mcsr * Md +#@time CUDA.@sync Y = dense_x_CSC(Md, Mcsc) +@time CUDA.@sync Z = (Mcsc' * Md')' + +println() + +@time CUDA.@sync X = Mcsr * Md +#@time CUDA.@sync Y = dense_x_CSC(Md, Mcsc) +@time CUDA.@sync Z = (Mcsc' * Md')' + +nothing diff --git a/benchmark/svd.jl b/benchmark/svd.jl new file mode 100644 index 0000000..52cc593 --- /dev/null +++ b/benchmark/svd.jl @@ -0,0 +1,73 @@ +using LinearAlgebra, MKL +using TensorOperations +using TensorCast +using TSVD +using LowRankApprox +using RandomizedLinAlg +using FameSVD + + +# C = A * B +struct MyTensor{T<:Number} + A::Array{T,2} + B::Array{T,2} +end + +Base.Array(ten::MyTensor) = ten.A * ten.B + +# this is for tsvd to work +Base.eltype(ten::MyTensor{T}) where {T} = T +Base.size(ten::MyTensor) = (size(ten.A, 1), size(ten.B, 2)) +Base.size(ten::MyTensor, n::Int) = size(ten)[n] +Base.adjoint(ten::MyTensor{T}) where {T} = MyTensor{T}(adjoint(ten.B), adjoint(ten.A)) +Base.:(*)(ten::MyTensor{T}, v::Vector{T}) where {T} = (ten.A * (ten.B * v)) + +# this is for psvd to work +LinearAlgebra.ishermitian(ten::MyTensor) = ishermitian(ten.A) && ishermitian(ten.B) 
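+# Roughly speaking, psvd only needs `ishermitian` and `mul!` applied to (blocks of) vectors,
+# so when the operator is passed as a MyTensor the dense n×n product C = A * B never has to
+# be formed; each application costs two dense matrix-vector products instead.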
+LinearAlgebra.mul!(y, ten::MyTensor, x) = mul!(y, ten.B, ten.A * x) + +n = 2^12 +cut = 2^6 +T = Float64 + +ten = MyTensor(rand(T, n, n), rand(T, n, n)) + +println("MyTensor:") +println("tsvd:") +@time U, Σ, V = tsvd(ten, cut) + +println("psvd:") +@time U, Σ, V = psvd(ten, rank = cut) + +println("Array:") +println("tsvd:") +@time begin + C = Array(ten) + U, Σ, V = tsvd(C, cut) +end + +println("svd:") +@time begin + C = Array(ten) + U, Σ, V = svd(C) +end + +println("psvd:") +@time begin + C = Array(ten) + U, Σ, V = psvd(C, rank = cut) +end + +println("rsvd:") +@time begin + C = Array(ten) + U, Σ, V = rsvd(C, cut, 0) +end + +println("fsvd:") +@time begin + C = Array(ten) + U, Σ, V = fsvd(C) +end + +nothing diff --git a/benchmark/svd2.jl b/benchmark/svd2.jl new file mode 100644 index 0000000..84b0e20 --- /dev/null +++ b/benchmark/svd2.jl @@ -0,0 +1,102 @@ +using LinearAlgebra, MKL +using TensorOperations +using TensorCast +using TSVD +using LowRankApprox +using RandomizedLinAlg +using FameSVD + + +# C = A * B +struct MyTensor{T<:Number} + A::Array{T,2} + B::Array{T,2} +end + +struct AMyTensor{T<:Number} + A::Array{T,2} + B::Array{T,2} +end + + +Base.Array(ten::MyTensor) = kron(ten.A, ten.B) + +# this is for tsvd to work +Base.eltype(ten::MyTensor{T}) where {T} = T +Base.size(ten::MyTensor) = + (size(ten.A, 1) * size(ten.B, 1), size(ten.A, 2) * size(ten.B, 2)) +Base.size(ten::MyTensor, n::Int) = size(ten)[n] +# Base.adjoint(ten::MyTensor{T}) where T = MyTensor{T}(adjoint(ten.A), adjoint(ten.B)) + +# Base.:(*)(ten::MyTensor{T}, v::Vector{T}) where T = (kron(ten.A, ten.B) * v) + +Base.adjoint(ten::MyTensor{T}) where {T} = AMyTensor{T}(ten.A, ten.B) + + +function Base.:(*)(ten::MyTensor{T}, v::Vector{T}) where {T} + println("M") + vv = reshape(v, size(ten.A, 2), size(ten.B, 2)) + println(size(vv)) + @tensor x[x1, y1] := ten.A[x1, x2] * ten.B[y1, y2] * vv[x2, y2] + reshape(x, size(ten.A, 1) * size(ten.B, 1)) +end + +function Base.:(*)(ten::AMyTensor{T}, v::Vector{T}) where {T} + println("A") + vv = reshape(v, size(ten.A, 1), size(ten.B, 1)) + println(size(vv)) + + @tensor x[x1, y1] := ten.A[x2, x1] * ten.B[y2, y1] * vv[x2, y2] + reshape(x, size(ten.A, 2) * size(ten.B, 2)) +end + + +# this is for psvd to work +LinearAlgebra.ishermitian(ten::MyTensor) = false + + +function LinearAlgebra.mul!(y, ten::MyTensor, v) + println("K") + vv = reshape(v, size(ten.A, 2), size(ten.B, 2), :) + println(size(vv)) + @tensor x[x1, y1, z1] := ten.A[x1, x2] * ten.B[y1, y2] * vv[x2, y2, z1] + y[:, :] = reshape(x, size(ten.A, 1) * size(ten.B, 1), :) +end + +function LinearAlgebra.mul!(y, ten::AMyTensor, v) + println("L") + vv = reshape(v, size(ten.A, 1), size(ten.B, 1), :) + println(size(vv)) + @tensor x[x1, y1, z1] := ten.A[x2, x1] * ten.B[y2, y1] * vv[x2, y2, z1] + y[:, :] = reshape(x, size(ten.A, 2) * size(ten.B, 2), :) +end + + +n = 2^2 +cut = 2^1 +T = Float64 + +ten = MyTensor(rand(T, n + 1, n), rand(T, n + 2, n - 1)) + +println("tsvd:") +@time U, Σ1, V = tsvd(ten, cut) + +println("psvd:") +@time U, Σ2, V = psvd(ten, rank = cut) + +println("svd:") +@time begin + C = Array(ten) + U, Σ3, V = svd(C) +end + +# println("psvd:") +# @time begin +# C = Array(ten) +# U, Σ4, V = psvd(C, rank=cut) +# end + +println(Σ1) +println(Σ2) +println(Σ3[1:cut]) +# println(Σ4) diff --git a/docs/make.jl b/docs/make.jl index 3cc0eed..d417f52 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,20 +1,20 @@ using Documenter, SpinGlassTensors _pages = [ - "Introduction" => "index.md", - "API Reference" => "api.md" + "User guide" => "index.md", 
+ "Matrix Product States and Matrix Product Operations" => "mpo.md", + "API Reference" => "api.md", ] # ============================ -format = Documenter.HTML(edit_link = "master", - prettyurls = get(ENV, "CI", nothing) == "true", -) +format = + Documenter.HTML(edit_link = "master", prettyurls = get(ENV, "CI", nothing) == "true") # format = Documenter.LaTeX(platform="none") makedocs( - sitename="SpinGlassTensors.jl", + sitename = "SpinGlassTensors.jl", modules = [SpinGlassTensors], pages = _pages, - format = format - ) \ No newline at end of file + format = format, +) diff --git a/docs/src/api.md b/docs/src/api.md index 120ec4f..48c7ce6 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -6,29 +6,16 @@ CurrentModule = SpinGlassTensors ``` ## Additional methods for `Base` and `LinearAlgebra` ```@docs -dot -norm -randn -rank +left_nbrs_site +right_nbrs_site +project_ket_on_bra ``` - ## MPS -```@docs -MPS -is_left_normalized -is_right_normalized -physical_dim -verify_bonds -verify_physical_dims -``` -## Compresions and Contractions +## Compresions and Contractions ```@docs -canonise! -compress! -left_env -right_env -truncate! - +update_env_left +update_env_right +update_reduced_env_right ``` \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 27ca46e..2452d07 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -3,4 +3,6 @@ Part of [SpinGlassPEPS](https://github.com/euro-hpc-pl/SpinGlassPEPS.jl) package. It constitutes the basis for the preparation of tensors and operations on them. !!! info - We don't expect the user to interact with this package, as it is more of a "back-end" type. Nevertheless, we provide API references should the need arise. \ No newline at end of file + We don't expect the user to interact with this package, as it is more of a "back-end" type. Nevertheless, we provide API references should the need arise. + +This section of the package encompasses supplementary functionalities that serve as support for the main solver. It includes the creation and manipulation of tensors, with a particular emphasis on implementing Matrix Product States (MPS) and Matrix Product Operators (MPO). 
\ No newline at end of file diff --git a/docs/src/mpo.md b/docs/src/mpo.md new file mode 100644 index 0000000..66211e4 --- /dev/null +++ b/docs/src/mpo.md @@ -0,0 +1,5 @@ +# Matrix Product States and Matrix Product Operations + +```@docs +MpoTensor +``` \ No newline at end of file diff --git a/src/SpinGlassTensors.jl b/src/SpinGlassTensors.jl index 08b0ef1..f8d54ad 100644 --- a/src/SpinGlassTensors.jl +++ b/src/SpinGlassTensors.jl @@ -1,13 +1,43 @@ module SpinGlassTensors -using LinearAlgebra +using LinearAlgebra, MKL using TensorOperations, TensorCast -using Memoize - +using LowRankApprox, TSVD +using CUDA, CUDA.CUSPARSE +using cuTENSOR +using NNlib +using Memoization +using SparseArrays using DocStringExtensions +using Base.Cartesian + +import Base.Prehashed +# using SpinGlassNetworks + +CUDA.allowscalar(false) +include("projectors.jl") include("base.jl") include("linear_algebra_ext.jl") -include("compressions.jl") -include("identities.jl") -include("contractions.jl") +include("utils/utils.jl") +include("./mps/base.jl") +include("./mps/transpose.jl") +include("./mps/dot.jl") +include("./mps/identity.jl") +include("./mps/utils.jl") +include("./mps/rand.jl") +include("transfer.jl") +include("environment.jl") +include("utils/memory.jl") +include("./mps/canonise.jl") +include("variational.jl") +include("zipper.jl") +include("gauges.jl") +include("contractions/sparse.jl") +include("contractions/dense.jl") +include("contractions/central.jl") +include("contractions/diagonal.jl") +include("contractions/site.jl") +include("contractions/virtual.jl") + + end # module diff --git a/src/base.jl b/src/base.jl index c8d789e..8a0dc39 100644 --- a/src/base.jl +++ b/src/base.jl @@ -1,237 +1,131 @@ -export bond_dimension, is_left_normalized, is_right_normalized -export verify_bonds, verify_physical_dims, tensor, rank, physical_dim -export State, dropindices - -const State = Union{Vector,NTuple} - -abstract type AbstractTensorNetwork{T} end - -for (T, N) ∈ ((:PEPSRow, 5), (:MPO, 4), (:MPS, 3)) - AT = Symbol(:Abstract, T) - @eval begin - export $AT - export $T - - abstract type $AT{T} <: AbstractTensorNetwork{T} end - - struct $T{T<:Number} <: $AT{T} - tensors::Vector{Array{T,$N}} - end - - # consturctors - $T(::Type{T}, L::Int) where {T} = $T(Vector{Array{T,$N}}(undef, L)) - $T(L::Int) = $T(Float64, L) - - @inline Base.setindex!(a::$AT, A::AbstractArray{<:Number,$N}, i::Int) = - a.tensors[i] = A - @inline bond_dimension(a::$AT) = maximum(size.(a.tensors, $N)) - Base.hash(a::$T, h::UInt) = hash(a.tensors, h) - @inline Base.:(==)(a::$T, b::$T) = a.tensors == b.tensors - @inline Base.:(≈)(a::$T, b::$T) = a.tensors ≈ b.tensors - Base.copy(a::$T) = $T(copy(a.tensors)) - - @inline Base.eltype(::$AT{T}) where {T} = T +# base.jl: This file defines basic tensor structures to be used with SpinGlassEngine + +export Tensor, + SiteTensor, + VirtualTensor, + DiagonalTensor, + CentralTensor, + CentralOrDiagonal, + dense_central + +abstract type AbstractSparseTensor{T,N} end + +mutable struct SiteTensor{T<:Real,N} <: AbstractSparseTensor{T,N} + lp::PoolOfProjectors + loc_exp::AbstractVector{T} + projs::NTuple{4,Int} # pl, pt, pr, pb + dims::Dims{N} + + function SiteTensor(lp::PoolOfProjectors, loc_exp, projs::NTuple{4,Vector{Int}}) + T = eltype(loc_exp) + ks = Tuple(add_projector!(lp, p) for p ∈ projs) + dims = size.(Ref(lp), ks) + new{T,4}(lp, loc_exp, ks, dims) end -end - - -@inline Base.getindex(a::AbstractTensorNetwork, i) = getindex(a.tensors, i) -@inline Base.iterate(a::AbstractTensorNetwork) = iterate(a.tensors) 
-@inline Base.iterate(a::AbstractTensorNetwork, state) = iterate(a.tensors, state) -@inline Base.lastindex(a::AbstractTensorNetwork) = lastindex(a.tensors) -@inline Base.length(a::AbstractTensorNetwork) = length(a.tensors) -@inline Base.size(a::AbstractTensorNetwork) = (length(a.tensors),) -@inline Base.eachindex(a::AbstractTensorNetwork) = eachindex(a.tensors) - -""" - LinearAlgebra.rank(ψ::AbstractMPS) - -Returns rank of MPS tensors. -""" -@inline LinearAlgebra.rank(ψ::AbstractMPS) = Tuple(size(A, 2) for A ∈ ψ) - -""" - physical_dim(ψ::AbstractMPS, i::Int) - -Returns physical dimension of MPS tensors at given site i. -""" -@inline physical_dim(ψ::AbstractMPS, i::Int) = size(ψ[i], 2) - - - -@inline MPS(A::AbstractArray) = MPS(A, :right) - - - -""" - MPS(A::AbstractArray, s::Symbol, Dcut::Int = typemax(Int)) - -Construct a matrix product state (MPS) using the provided tensor array `A`. - -## Arguments - -- `A::AbstractArray`: The tensor array that defines the MPS. -- `s::Symbol`: The direction to canonically transform the MPS. Must be either `:left` or `:right`. -- `Dcut::Int`: The maximum bond dimension allowed during the truncation step. -## Returns - -- `ψ::AbstractMPS`: The constructed MPS. - -## Details - -This function constructs a matrix product state (MPS) using the provided tensor array `A`, -and then canonically transforms it in the direction specified by the `s` argument. If `s` is `:right`, -the MPS is right-canonized, while if `s` is `:left`, the MPS is left-canonized. -The `Dcut` argument determines the maximum bond dimension allowed during the truncation step. -If neither `Dcut` nor `s` is specified, it will construct right-canonized MPS with default Dcut value. - -## Example - -```@repl -A = rand(2, 3, 2) -ψ = MPS(A, :left, 2); -typeof(ψ) -length(ψ) -bond_dimension(ψ) -``` -""" -@inline function MPS(A::AbstractArray, s::Symbol, Dcut::Int = typemax(Int)) - @assert s ∈ (:left, :right) - if s == :right - ψ = _right_sweep(A) - _left_sweep!(ψ, Dcut) - else - ψ = _left_sweep(A) - _right_sweep!(ψ, Dcut) + function SiteTensor( + lp::PoolOfProjectors, + loc_exp, + projs::NTuple{4,Int}, + dims::NTuple{4,Int}, + ) + T = eltype(loc_exp) + new{T,4}(lp, loc_exp, projs, dims) end - ψ -end - -@inline dropindices(ψ::AbstractMPS, i::Int = 2) = (dropdims(A, dims = i) for A ∈ ψ) - - -""" - MPS(states::Vector{Vector{T}}) where {T<:Number} - -Create a matrix product state (MPS) object from a vector of states. -""" -function MPS(states::Vector{Vector{T}}) where {T<:Number} - state_arrays = [reshape(copy(v), (1, length(v), 1)) for v ∈ states] - MPS(state_arrays) -end - -function (::Type{T})(ψ::AbstractMPS) where {T<:AbstractMPO} - _verify_square(ψ) - T([@cast W[x, σ, y, η] |= A[x, (σ, η), y] (σ ∈ 1:isqrt(size(A, 2))) for A in ψ]) end -function (::Type{T})(O::AbstractMPO) where {T<:AbstractMPS} - T([@cast A[x, (σ, η), y] := W[x, σ, y, η] for W in O]) -end -""" - Base.randn(::Type{MPS{T}}, D::Int, rank::Union{Vector,NTuple}) where {T} - -Create random MPS.The argument `D` specifies the physical dimension of the MPS -(i.e. the dimension of the vectors at each site), `rank` specifies rank of each site. 
-""" -function Base.randn(::Type{MPS{T}}, D::Int, rank::Union{Vector,NTuple}) where {T} - MPS([ - randn(T, 1, first(rank), D), - randn.(T, D, rank[begin+1:end-1], D)..., - rand(T, D, last(rank), 1), - ]) +function mpo_transpose(ten::SiteTensor) + perm = [1, 4, 3, 2] + SiteTensor(ten.lp, ten.loc_exp, ten.projs[perm], ten.dims[perm]) end -function Base.randn(::Type{MPS{T}}, L::Int, D::Int, d::Int) where {T} - MPS([randn(T, 1, d, D), (randn(T, D, d, D) for _ = 2:L-1)..., randn(T, D, d, 1)]) +mutable struct CentralTensor{T<:Real,N} <: AbstractSparseTensor{T,N} + e11::AbstractMatrix{T} + e12::AbstractMatrix{T} + e21::AbstractMatrix{T} + e22::AbstractMatrix{T} + dims::Dims{N} + + function CentralTensor(e11, e12, e21, e22) + s11, s12, s21, s22 = size.((e11, e12, e21, e22)) + @assert s11[1] == s12[1] && s21[1] == s22[1] && s11[2] == s21[2] && s12[2] == s22[2] + dims = (s11[1] * s21[1], s11[2] * s12[2]) + T = promote_type(eltype.((e11, e12, e21, e22))...) + new{T,2}(e11, e12, e21, e22, dims) + end end +Base.adjoint(M::CentralTensor{R,2}) where {R<:Real} = + CentralTensor(M.e11', M.e21', M.e12', M.e22') -Base.randn(::Type{MPS}, args...) = randn(MPS{Float64}, args...) - -function Base.randn(::Type{MPO{T}}, L::Int, D::Int, d::Int) where {T} - MPO(randn(MPS{T}, L, D, d^2)) -end +mpo_transpose(ten::CentralTensor) = + CentralTensor(permutedims.((ten.e11, ten.e21, ten.e12, ten.e22), Ref((2, 1)))...) -function Base.randn(::Type{MPO{T}}, D::Int, rank::Union{Vector,NTuple}) where {T} - MPO(randn(MPS{T}, D, rank .^ 2)) -end +const MatOrCentral{T,N} = Union{AbstractMatrix{T},CentralTensor{T,N}} -""" - is_left_normalized(ψ::MPS) - -Check whether MPS is left normalized. -""" -is_left_normalized(ψ::MPS) = all( - I(size(A, 3)) ≈ @tensor Id[x, y] := conj(A[α, σ, x]) * A[α, σ, y] order = (α, σ) for - A ∈ ψ -) - -""" - is_right_normalized(ϕ::MPS) - -Check whether MPS is right normalized. -""" -is_right_normalized(ϕ::MPS) = all( - I(size(B, 1)) ≈ @tensor Id[x, y] := B[x, σ, α] * conj(B[y, σ, α]) order = (α, σ) for - B in ϕ -) - -function _verify_square(ψ::AbstractMPS) - dims = physical_dim.(Ref(ψ), eachindex(ψ)) - @assert isqrt.(dims) .^ 2 == dims "Incorrect MPS dimensions" +# TODO: to be removed eventually +function dense_central(ten::CentralTensor) + @cast V[(u1, u2), (d1, d2)] := + ten.e11[u1, d1] * ten.e21[u2, d1] * ten.e12[u1, d2] * ten.e22[u2, d2] + V ./ maximum(V) end +dense_central(ten::AbstractArray) = ten -""" - verify_physical_dims(ψ::AbstractMPS, dims::NTuple) +mutable struct DiagonalTensor{T<:Real,N} <: AbstractSparseTensor{T,N} + e1::MatOrCentral{T,N} + e2::MatOrCentral{T,N} + dims::Dims{N} -Check whether MPS has correct physical dimension at given site. -""" -function verify_physical_dims(ψ::AbstractMPS, dims::NTuple) - for i ∈ eachindex(ψ) - @assert physical_dim(ψ, i) == dims[i] "Incorrect physical dim at site $(i)." + function DiagonalTensor(e1, e2) + dims = (size(e1, 1) * size(e2, 1), size(e1, 2) * size(e2, 2)) + T = promote_type(eltype.((e1, e2))...) + new{T,2}(e1, e2, dims) end end -""" - verify_bonds(ψ::AbstractMPS) - -Check whether MPS has correct sizes. -""" -function verify_bonds(ψ::AbstractMPS) - L = length(ψ) - - @assert size(ψ[1], 1) == 1 "Incorrect size on the left boundary." - @assert size(ψ[end], 3) == 1 "Incorrect size on the right boundary." +mpo_transpose(ten::DiagonalTensor) = DiagonalTensor(mpo_transpose.((ten.e2, ten.e1))...) 
+ +mutable struct VirtualTensor{T<:Real,N} <: AbstractSparseTensor{T,N} + lp::PoolOfProjectors + con::MatOrCentral{T,2} + projs::NTuple{6,Int} # == (p_lb, p_l, p_lt, p_rb, p_r, p_rt) + dims::Dims{N} + + function VirtualTensor(lp::PoolOfProjectors, con, projs::NTuple{6,Vector{Int}}) + T = eltype(con) + ks = Tuple(add_projector!(lp, p) for p ∈ projs) + dims = ( + length(lp, ks[2]), + size(lp, ks[3]) * size(lp, ks[6]), + length(lp, ks[5]), + size(lp, ks[1]) * size(lp, ks[4]), + ) + new{T,4}(lp, con, ks, dims) + end - for i ∈ 1:L-1 - @assert size(ψ[i], 3) == size(ψ[i+1], 1) "Incorrect link between $i and $(i+1)." + function VirtualTensor( + lp::PoolOfProjectors, + con, + projs::NTuple{6,Int}, + dims::NTuple{4,Int}, + ) + T = eltype(con) + new{T,4}(lp, con, projs, dims) end end -function Base.show(io::IO, ψ::AbstractTensorNetwork) - L = length(ψ) - dims = [size(A) for A ∈ ψ] - - println(io, "Matrix product state on $L sites:") - _show_sizes(io, dims) - println(io, " ") -end +mpo_transpose(ten::VirtualTensor) = + VirtualTensor(ten.lp, ten.con, ten.projs[[3, 2, 1, 6, 5, 4]], ten.dims[[1, 4, 3, 2]]) +mpo_transpose(ten::AbstractArray{T,4}) where {T} = permutedims(ten, (1, 4, 3, 2)) +mpo_transpose(ten::AbstractArray{T,2}) where {T} = permutedims(ten, (2, 1)) +const SparseTensor{T,N} = + Union{SiteTensor{T,N},VirtualTensor{T,N},CentralTensor{T,N},DiagonalTensor{T,N}} +const Tensor{T,N} = Union{AbstractArray{T,N},SparseTensor{T,N}} +const CentralOrDiagonal{T,N} = Union{CentralTensor{T,N},DiagonalTensor{T,N}} -function _show_sizes(io::IO, dims::Vector, sep::String = " x ", Lcut::Int = 8) - L = length(dims) - if L > Lcut - for i ∈ 1:Lcut - print(io, " ", dims[i], sep) - end - print(io, " ... × ", dims[end]) - else - for i ∈ 1:(L-1) - print(io, dims[i], sep) - end - println(io, dims[end]) - end -end +Base.eltype(ten::Tensor{T,N}) where {T,N} = T +Base.ndims(ten::Tensor{T,N}) where {T,N} = N +Base.size(ten::SparseTensor, n::Int) = ten.dims[n] +Base.size(ten::SparseTensor) = ten.dims diff --git a/src/compressions.jl b/src/compressions.jl deleted file mode 100644 index 8a69d42..0000000 --- a/src/compressions.jl +++ /dev/null @@ -1,240 +0,0 @@ -export canonise!, truncate!, compress!, compress - - -# This is for backwards compatibility -function compress( - ϕ::AbstractMPS, - Dcut::Int, - tol::Number = 1E-8, - max_sweeps::Int = 4, - args..., -) - ψ = copy(ϕ) - compress!(ψ, Dcut, tol, max_sweeps, args...) - ψ -end - -""" - compress!( - ϕ::AbstractMPS, - Dcut::Int, - tol::Number = 1E-8, - max_sweeps::Int = 4, - args..., - ) - -# Arguments - -- `ϕ::AbstractMPS`: the input MPS to be compressed -- `Dcut::Int`: the maximum bond dimension of the compressed MPS -- `tol::Number = 1E-8`: the tolerance threshold for convergence of the iterative compression process (default value: 1E-8) -- `max_sweeps::Int = 4`: the maximum number of iterations allowed for the compression process (default value: 4) - -# Output -- `overlap`: The overlap of the compressed MPS with the original input MPS. -""" -function compress!( - ϕ::AbstractMPS, - Dcut::Int, - tol::Number = 1E-8, - max_sweeps::Int = 4, - args..., -) - # right canonise ϕ - _left_sweep!(ϕ, args...) - - # Initial guess - truncated ϕ - ψ = copy(ϕ) - _right_sweep!(ϕ, Dcut, args...) - - # Create environment - env = left_env(ϕ, ψ) - - # Variational compression - overlap = Inf - overlap_before = -Inf - - @info "Compressing state down to" Dcut - - for sweep ∈ 1:max_sweeps - _left_sweep_var!!(ϕ, env, ψ, args...) - overlap = _right_sweep_var!!(ϕ, env, ψ, args...) 
- - diff = abs(overlap_before - abs(overlap)) - @info "Convergence" diff - - if diff < tol - @info "Finished in $sweep sweeps of $(max_sweeps)." - return overlap - else - overlap_before = overlap - end - end - overlap -end - -""" - truncate!(ψ::AbstractMPS, s::Symbol, Dcut::Int = typemax(Int), args...) - -Truncate the bond dimension of a matrix product state (MPS) in either -the left or right canonical form, depending on the value of the `s` input argument. - -# Arguments - -- `ψ::AbstractMPS`: the input MPS to be truncated -- `s::Symbol`: determines whether to truncate the MPS in the left or right canonical form. Must be one of the following values: - - `:left`: truncate in left canonical form - - `:right`: truncate in right canonical form -- `Dcut::Int`: the maximum bond dimension to which the MPS should be truncated. - -""" -function truncate!(ψ::AbstractMPS, s::Symbol, Dcut::Int = typemax(Int), args...) - @assert s ∈ (:left, :right) - if s == :right - _right_sweep!(ψ, args...) - _left_sweep!(ψ, Dcut, args...) - else - _left_sweep!(ψ, args...) - _right_sweep!(ψ, Dcut, args...) - end -end - -""" - canonise!(ψ::AbstractMPS, s::Symbol) - -canonizes a matrix product state (MPS) in either the left or right canonical form, -depending on the value of the `s` input argument. Must be one of the following values: -- `:left`: canonize in left canonical form -- `:right`: canonize in right canonical form - -""" -canonise!(ψ::AbstractMPS, s::Symbol) = canonise!(ψ, Val(s)) - -canonise!(ψ::AbstractMPS, ::Val{:right}) = _left_sweep!(ψ, typemax(Int)) -canonise!(ψ::AbstractMPS, ::Val{:left}) = _right_sweep!(ψ, typemax(Int)) - - -function _right_sweep!(ψ::AbstractMPS, Dcut::Int = typemax(Int), args...) - R = ones(eltype(ψ), 1, 1) - for (i, A) ∈ enumerate(ψ) - @matmul M̃[(x, σ), y] := sum(α) R[x, α] * A[α, σ, y] - Q, R = qr_fact(M̃, Dcut, args...) - R = R ./ maximum(abs.(R)) - @cast A[x, σ, y] := Q[(x, σ), y] (σ ∈ 1:size(A, 2)) - ψ[i] = A - end -end - - -function _left_sweep!(ψ::AbstractMPS, Dcut::Int = typemax(Int), args...) - R = ones(eltype(ψ), 1, 1) - for i ∈ length(ψ):-1:1 - B = ψ[i] - @matmul M̃[x, (σ, y)] := sum(α) B[x, σ, α] * R[α, y] - R, Q = rq_fact(M̃, Dcut, args...) - R = R ./ maximum(abs.(R)) - @cast B[x, σ, y] := Q[x, (σ, y)] (σ ∈ 1:size(B, 2)) - ψ[i] = B - end -end - - -function _left_sweep_var!!( - ϕ::AbstractMPS, - env::Vector{<:AbstractMatrix}, - ψ::AbstractMPS, - args..., -) - env[end] = ones(eltype(ϕ), 1, 1) - - for i ∈ length(ψ):-1:1 - L, R = env[i], env[i+1] - - # optimize site - M = ψ[i] - @tensor MM[x, σ, α] := L[x, β] * M[β, σ, α] - @matmul MM[x, (σ, y)] := sum(α) MM[x, σ, α] * R[α, y] - - _, Q = rq_fact(MM, args...) - @cast B[x, σ, y] := Q[x, (σ, y)] (σ ∈ 1:size(M, 2)) - - # update ϕ and right environment - ϕ[i] = B - A = ψ[i] - - @tensor RR[x, y] := A[x, σ, α] * R[α, β] * conj(B)[y, σ, β] order = (β, α, σ) - env[i] = RR - end - env[1][1] -end - - -function _right_sweep_var!!( - ϕ::AbstractMPS, - env::Vector{<:AbstractMatrix}, - ψ::AbstractMPS, - args..., -) - env[1] = ones(eltype(ϕ), 1, 1) - - for (i, M) ∈ enumerate(ψ) - L, R = env[i], env[i+1] - - # optimize site - @tensor M̃[x, σ, α] := L[x, β] * M[β, σ, α] - @matmul B[(x, σ), y] := sum(α) M̃[x, σ, α] * R[α, y] - - Q, _ = qr_fact(B, args...) 
- @cast A[x, σ, y] := Q[(x, σ), y] (σ ∈ 1:size(M, 2)) - - # update ϕ and left environment - ϕ[i] = A - B = ψ[i] - - @tensor LL[x, y] := conj(A[β, σ, x]) * L[β, α] * B[α, σ, y] order = (α, β, σ) - env[i+1] = LL - end - env[end][1] -end - - -function _right_sweep( - A::AbstractArray, - Dcut::Int = typemax(Int), - args..., -) - rank = ndims(A) - ψ = MPS(eltype(A), rank) - R = reshape(copy(A), (1, length(A))) - - for i ∈ 1:rank - d = size(A, i) - @cast M[(x, σ), y] := R[x, (σ, y)] (σ ∈ 1:d) - Q, R = qr_fact(M, Dcut, args...) - R = R ./ maximum(abs.(R)) - @cast B[x, σ, y] := Q[(x, σ), y] (σ ∈ 1:d) - ψ[i] = B - end - ψ -end - - -function _left_sweep( - A::AbstractArray, - Dcut::Int = typemax(Int), - args..., -) - rank = ndims(A) - ψ = MPS(eltype(A), rank) - R = reshape(copy(A), (length(A), 1)) - - for i ∈ rank:-1:1 - d = size(A, i) - @cast M[x, (σ, y)] := R[(x, σ), y] (σ ∈ 1:d) - R, Q = rq_fact(M, Dcut, args...) - R = R ./ maximum(abs.(R)) - @cast B[x, σ, y] := Q[x, (σ, y)] (σ ∈ 1:d) - ψ[i] = B - end - ψ -end diff --git a/src/contractions.jl b/src/contractions.jl deleted file mode 100644 index 69813df..0000000 --- a/src/contractions.jl +++ /dev/null @@ -1,151 +0,0 @@ -export left_env, right_env, dot! - -# --------------------------- Conventions ----------------------- -# -# MPS MPS* MPO left env right env -# 2 2 2 - 1 2 - -# 1 - A - 3 1 - B - 3 1 - W - 3 L R -# 4 - 2 1 - -# --------------------------------------------------------------- -# - -function LinearAlgebra.dot(ϕ::AbstractMPS, ψ::AbstractMPS) - T = promote_type(eltype(ψ), eltype(ϕ)) - C = ones(T, 1, 1) - for (A, B) ∈ zip(ψ, ϕ) - @tensor C[x, y] := conj(B)[β, σ, x] * C[β, α] * A[α, σ, y] order = (α, β, σ) - end - tr(C) -end - -""" - left_env(ϕ::AbstractMPS, ψ::AbstractMPS) - -Creates left environment (ϕ - bra, ψ - ket) -""" -function left_env(ϕ::AbstractMPS, ψ::AbstractMPS) - T = promote_type(eltype(ψ), eltype(ϕ)) - S = typeof(similar(ψ[1], T, (1, 1))) - L = Vector{S}(undef, length(ψ) + 1) - L[1] = similar(ψ[1], T, (1, 1)) - L[1][1, 1] = one(T) - for (i, (A, B)) ∈ enumerate(zip(ψ, ϕ)) - C = L[i] - @tensor C[x, y] := conj(B)[β, σ, x] * C[β, α] * A[α, σ, y] order = (α, β, σ) - L[i+1] = C - end - L -end - - -# TODO: remove it (after SpinGlassEngine is updated) -@memoize Dict function left_env(ϕ::AbstractMPS, σ::Vector{Int}) - l = length(σ) - if l == 0 - return ones(eltype(ϕ), 1) - end - m = σ[l] - L̃ = left_env(ϕ, σ[1:l-1]) - M = ϕ[l] - @matmul L[x] := sum(α) L̃[α] * M[α, $m, x] - L -end - -""" - right_env(ϕ::AbstractMPS, ψ::AbstractMPS) - -Creates right environment (ϕ - bra, ψ - ket) -""" -function right_env(ϕ::AbstractMPS, ψ::AbstractMPS) - L = length(ψ) - T = promote_type(eltype(ψ), eltype(ϕ)) - S = typeof(similar(ψ[1], T, (1, 1))) - R = Vector{S}(undef, L + 1) - R[end] = similar(ψ[1], T, (1, 1)) - R[end][1, 1] = one(T) - for i ∈ L:-1:1 - M = ψ[i] - M̃ = ϕ[i] - D = R[i+1] - @tensor D[x, y] := M[x, σ, α] * D[α, β] * conj(M̃)[y, σ, β] order = (β, α, σ) - R[i] = D - end - R -end - -# TODO: remove it (after SpinGlassEngine is updated) -@memoize Dict function right_env( - ϕ::AbstractMPS{T}, - W::AbstractMPO{T}, - σ::Union{Vector,NTuple}, -) where {T} - l = length(σ) - if l == 0 - R = similar(ϕ[1], T, (1, 1)) - R[1, 1] = one(T) - return R - end - k = length(W) - R̃ = right_env(ϕ, W, σ[2:l]) - M = ϕ[k-l+1] - M̃ = W[k-l+1] - K = @view M̃[:, σ[1], :, :] - @tensor R[x, y] := K[y, β, γ] * M[x, γ, α] * R̃[α, β] order = (β, γ, α) - R -end - -""" -$(TYPEDSIGNATURES) - -Calculates the norm of an MPS \$\\ket{\\phi}\$ -""" 
-LinearAlgebra.norm(ψ::AbstractMPS) = sqrt(abs(dot(ψ, ψ))) - -""" -$(TYPEDSIGNATURES) - -Calculates \$\\bra{\\phi} O \\ket{\\psi}\$ - -# Details - -Calculates the matrix element of \$O\$ -```math -\\bra{\\phi} O \\ket{\\psi} -``` -in one pass, utlizing `TensorOperations`. -""" -function LinearAlgebra.dot(ϕ::AbstractMPS, O::Union{Vector,NTuple}, ψ::AbstractMPS) #where T <: AbstractMatrix - S = promote_type(eltype(ψ), eltype(ϕ), eltype(O[1])) - C = similar(ψ[1], S, (1, 1)) - C[1, 1] = one(S) - for (A, W, B) ∈ zip(ϕ, O, ψ) - @tensor C[x, y] := conj(A)[β, σ, x] * W[σ, η] * C[β, α] * B[α, η, y] order = - (α, η, β, σ) - end - tr(C) -end - -function LinearAlgebra.dot(O::AbstractMPO, ψ::AbstractMPS) - S = promote_type(eltype(ψ), eltype(O)) - T = typeof(ψ) - ϕ = T.name.wrapper(S, length(ψ)) - for (i, (A, B)) ∈ enumerate(zip(O, ψ)) - @matmul N[(x, a), σ, (y, b)] := sum(η) A[x, σ, y, η] * B[a, η, b] - ϕ[i] = N - end - ϕ -end - - -function LinearAlgebra.dot(O1::AbstractMPO, O2::AbstractMPO) - S = promote_type(eltype(O1), eltype(O2)) - T = typeof(O1) - O = T.name.wrapper(S, length(O1)) - for (i, (A, B)) ∈ enumerate(zip(O1, O2)) - @matmul V[(x, a), σ, (y, b), η] := sum(γ) A[x, σ, y, γ] * B[a, γ, b, η] - O[i] = V - end - O -end - -Base.:(*)(A::AbstractTensorNetwork, B::AbstractTensorNetwork) = dot(A, B) diff --git a/src/contractions/central.jl b/src/contractions/central.jl new file mode 100644 index 0000000..9397449 --- /dev/null +++ b/src/contractions/central.jl @@ -0,0 +1,111 @@ +# contractions with CentralTensor on CPU and CUDA + +export contract_tensor3_matrix, contract_matrix_tensor3, update_reduced_env_right +# my_batched_mul! + +function contract_tensor3_matrix(LE::Tensor{R,3}, M::CentralTensor{R,2}) where {R<:Real} + contract_tensor3_central(LE, M.e11, M.e12, M.e21, M.e22) +end + +function contract_matrix_tensor3(M::CentralTensor{R,2}, RE::Tensor{R,3}) where {R<:Real} + contract_tensor3_central(RE, M.e11', M.e21', M.e12', M.e22') +end + +function update_reduced_env_right(RR::Tensor{R,2}, M::CentralTensor{R,2}) where {R<:Real} + RR = reshape(RR, size(RR, 1), 1, size(RR, 2)) + dropdims(contract_matrix_tensor3(M, RR), dims = 2) +end + + +function contract_tensor3_central(LE, e11, e12, e21, e22) + sb, st = size(LE) + sbt = sb * st + sl1, sl2, sr1, sr2 = size(e11, 1), size(e22, 1), size(e11, 2), size(e22, 2) + sinter = sbt * max(sl1 * sl2 * min(sr1, sr2), sr1 * sr2 * min(sl1, sl2)) + if sl1 * sl2 * sr1 * sr2 < sinter + @cast E[(l1, l2), (r1, r2)] := e11[l1, r1] * e21[l2, r1] * e12[l1, r2] * e22[l2, r2] + return reshape(reshape(LE, (sbt, sl1 * sl2)) * E, (sb, st, sr1 * sr2)) + elseif sr1 <= sr2 && sl1 <= sl2 + LE = reshape(LE, sbt, sl1, 1, sl2) .* reshape(e21', 1, 1, sr1, sl2) # [tb, l1, r1, l2] + LE = reshape(reshape(LE, sbt * sl1 * sr1, sl2) * e22, (sbt, sl1, sr1, sr2)) # [tb, l1, r1, r2] + LE .*= reshape(e11, 1, sl1, sr1, 1) # [tb, l1, r1, r2] .* [:, l1, r1, :] + LE .*= reshape(e12, 1, sl1, 1, sr2) # [tb, l1, r1, r2] .* [:, l1, :, r2] + LE = sum(LE, dims = 2) + elseif sr1 <= sr2 && sl2 <= sl1 + LE = permutedims(reshape(LE, (sbt, sl1, sl2)), (1, 3, 2)) # [tb, l2, l1] + LE = reshape(LE, sbt, sl2, 1, sl1) .* reshape(e11', 1, 1, sr1, sl1) # [tb, l2, r1, l1] + LE = reshape(reshape(LE, sbt * sl2 * sr1, sl1) * e12, (sbt, sl2, sr1, sr2)) # [tb, l2, r1, r2] + LE .*= reshape(e21, 1, sl2, sr1, 1) # [tb, l2, r1, r2] .* [:, l2, r1, :] + LE .*= reshape(e22, 1, sl2, 1, sr2) # [tb, l2, r1, r2] .* [:, l2, :, r2] + LE = sum(LE, dims = 2) + elseif sr2 <= sr1 && sl1 <= sl2 + LE = reshape(LE, sbt, sl1, 1, sl2) .* 
reshape(e22', 1, 1, sr2, sl2) # [tb, l1, r2, l2] + LE = reshape(reshape(LE, sbt * sl1 * sr2, sl2) * e21, (sbt, sl1, sr2, sr1)) # [tb, l1, r2, r1] + LE .*= reshape(e11, 1, sl1, 1, sr1) # [tb, l1, r2, r1] .* [:, l1, :, r1] + LE .*= reshape(e12, 1, sl1, sr2, 1) # [tb, l1, r2, r1] .* [:, l1, r2, :] + LE = permutedims(dropdims(sum(LE, dims = 2), dims = 2), (1, 3, 2)) + else # sr2 <= sr1 && sl2 <= sl1 + LE = permutedims(reshape(LE, (sbt, sl1, sl2)), (1, 3, 2)) # [tb, l2, l1] + LE = reshape(LE, sbt, sl2, 1, sl1) .* reshape(e12', 1, 1, sr2, sl1) # [tb, l2, r2, l1] + LE = reshape(reshape(LE, sbt * sl2 * sr2, sl1) * e11, (sbt, sl2, sr2, sr1)) # [tb, l2, r2, r1] + LE .*= reshape(e21, 1, sl2, 1, sr1) # [tb, l2, r2, r1] .* [:, l2, :, r1] + LE .*= reshape(e22, 1, sl2, sr2, 1) # [tb, l2, r2, r1] .* [:, l2, r2, :] + LE = permutedims(dropdims(sum(LE, dims = 2), dims = 2), (1, 3, 2)) + end + reshape(LE, sb, st, sr1 * sr2) +end + +function batched_mul!( + newLE::Tensor{R,3}, + LE::Tensor{R,3}, + M::AbstractArray{R,2}, +) where {R<:Real} + N1, N2 = size(M) + new_M = CUDA.CuArray(M) # TODO: this is a hack to solve problem with types; + new_M = reshape(new_M, (N1, N2, 1)) + NNlib.batched_mul!(newLE, LE, new_M) +end + +function batched_mul!( + newLE::Tensor{R,3}, + LE::Tensor{R,3}, + M::CentralTensor{R,2}, +) where {R<:Real} + sb, _, st = size(LE) + sl1, sl2, sr1, sr2 = size(M.e11, 1), size(M.e22, 1), size(M.e11, 2), size(M.e22, 2) + sinter = sb * st * max(sl1 * sl2 * min(sr1, sr2), sr1 * sr2 * min(sl1, sl2)) + if sl1 * sl2 * sr1 * sr2 < sinter + @cast E[(l1, l2), (r1, r2)] := + M.e11[l1, r1] * M.e21[l2, r1] * M.e12[l1, r2] * M.e22[l2, r2] + E = reshape(E, (sl1 * sl2, sr1 * sr2, 1)) + NNlib.batched_mul!(newLE, LE, E) + elseif sr1 <= sr2 && sl1 <= sl2 + LE = reshape(LE, sb * sl1, 1, sl2, st) .* reshape(M.e21', 1, sr1, sl2, 1) # [b * l1, r1, l2, t] + LE = batched_mul(reshape(LE, sb * sl1 * sr1, sl2, st), M.e22) # [(b, l1, r1), r2, t] + LE = reshape(LE, (sb, sl1, sr1, sr2, st)) # [b, l1, r1, r2, t] + LE .*= reshape(M.e11, 1, sl1, sr1, 1, 1) # [b, l1, r1, r2, t] .* [:, l1, r1, :, :] + LE .*= reshape(M.e12, 1, sl1, 1, sr2, 1) # [b, l1, r1, r2, t] .* [:, l1, :, r2, :] + sum!(reshape(newLE, (sb, 1, sr1, sr2, st)), LE) + elseif sr1 <= sr2 && sl2 <= sl1 + LE = reshape(LE, sb, 1, sl1, sl2 * st) .* reshape(M.e11', 1, sr1, sl1, 1) # [b, r1, l1, l2, t] + LE = batched_mul(reshape(LE, sb * sr1, sl1, sl2 * st), M.e12) # [(b, r1), r2, (l2, t)] + LE = reshape(LE, (sb, sr1, sr2, sl2, st)) # [b, r1, r2, l2, t] + LE .*= reshape(M.e21', 1, sr1, 1, sl2, 1) # [b, r1, r2, l2, t] .* [:, r1, :, l2, :] + LE .*= reshape(M.e22', 1, 1, sr2, sl2, 1) # [b, r1, r2, l2, t] .* [:, :, r2, l2, :] + sum!(reshape(newLE, (sb, sr1, sr2, 1, st)), LE) + elseif sr2 <= sr1 && sl1 <= sl2 + LE = reshape(LE, sb * sl1, sl2, 1, st) .* reshape(M.e22, 1, sl2, sr2, 1) # [b, l1, l2, r2, t] + LE = batched_mul(reshape(LE, sb * sl1, sl2, sr2 * st), M.e21) # [(b, l1), r1, (r2, t)] + LE = reshape(LE, (sb, sl1, sr1, sr2, st)) # [b, l1, r1, r2, t] + LE .*= reshape(M.e11, 1, sl1, sr1, 1, 1) # [b, l1, r1, r2, t] .* [:, l1, r1, :, :] + LE .*= reshape(M.e12, 1, sl1, 1, sr2, 1) # [b, l1, r1, r2, t] .* [:, l1, :, r2, :] + sum!(reshape(newLE, (sb, 1, sr1, sr2, st)), LE) + else # sr2 <= sr1 && sl2 <= sl1 + LE = reshape(LE, sb, sl1, sl2, 1, st) .* reshape(M.e12, 1, sl1, 1, sr2, 1) # [b, l1, l2, r2, t] + LE = batched_mul(reshape(LE, sb, sl1, sl2 * sr2 * st), M.e11) # [b, r1, (l2, r2, t)] + LE = reshape(LE, (sb, sr1, sl2, sr2, st)) # [b, r1, l2, r2, t] + LE .*= reshape(M.e21', 1, 
sr1, sl2, 1, 1) # [b, r1, l2, r2, t] .* [:, l2, :, r1] + LE .*= reshape(M.e22, 1, 1, sl2, sr2, 1) # [b, r1, l2, r2, t] .* [:, :, l2, r2, :] + sum!(reshape(newLE, (sb, sr1, 1, sr2, st)), LE) + end +end diff --git a/src/contractions/dense.jl b/src/contractions/dense.jl new file mode 100644 index 0000000..faa6f97 --- /dev/null +++ b/src/contractions/dense.jl @@ -0,0 +1,204 @@ +# contractions of dense objects on CPU and CUDA +# export +# update_reduced_env_right2 + +const MatrixOrCuMatrix{R} = Union{ + CuMatrix{R}, + Matrix{R}, + Diagonal{R,CuArray{R,1,Mem.DeviceBuffer}}, + Diagonal{R,Vector{R}}, +} + +function contract_tensor3_matrix(A::Tensor{R,3}, M::MatrixOrCuMatrix{R}) where {R<:Real} + sl1, sl2, sl3 = size(A) + A = reshape(A, sl1 * sl2, sl3) + reshape(A * M, sl1, sl2, :) +end + +function contract_matrix_tensor3(M::MatrixOrCuMatrix{R}, A::Tensor{R,3}) where {R<:Real} + sl1, sl2, sl3 = size(A) + A = reshape(A, sl1 * sl2, sl3) + reshape(A * M', sl1, sl2, :) +end + +""" + -- A -- + | | + L = LE -- M -- + | | + -- B -- +""" +function update_env_left( + LE::S, + A::S, + M::T, + B::S, +) where {S<:Tensor{R,3},T<:Tensor{R,4}} where {R<:Real} + @tensor order = (ot, α, oc, β, ob) LE[nb, nt, nc] := + LE[ob, ot, oc] * A[ot, nt, α] * M[oc, α, nc, β] * B[ob, nb, β] # TODO: split the line +end + +""" + -- A -- + | | + L = LE | + | | + -- B -- +""" +function update_env_left( + LE::T, + A::S, + B::S, +) where {S<:Tensor{R,3},T<:Tensor{R,2}} where {R<:Real} + @tensor order = (ot, α, ob) LE[nb, nt] := LE[ob, ot] * A[ot, nt, α] * B[ob, nb, α] +end + +""" + -- A -- + | | + L = LE + | + +""" +function update_env_left(LE::T, A::S) where {S<:Tensor{R,3},T<:Tensor{R,2}} where {R<:Real} + @tensor A[nb, nt, nc] := LE[nb, ot] * A[ot, nt, nc] +end + +""" + -- A -- + | | + R = -- M -- RE + | | + -- B -- +""" +function update_env_right( + RE::S, + A::S, + M::T, + B::S, +) where {T<:Tensor{R,4},S<:Tensor{R,3}} where {R<:Real} + @tensor order = (ot, α, oc, β, ob) RE[nb, nt, nc] := + RE[ob, ot, oc] * A[nt, ot, α] * M[nc, α, oc, β] * B[nb, ob, β] +end + +""" + -- A -- + | | + R = | RE + | | + -- B -- +""" +function update_env_right( + RE::T, + A::S, + B::S, +) where {T<:Tensor{R,2},S<:Tensor{R,3}} where {R<:Real} + @tensor order = (ot, α, ob) RE[nb, nt] := RE[ob, ot] * A[nt, ot, α] * B[nb, ob, α] +end + +""" + -- A -- + | | + R = --- RE + | + +""" +function update_env_right(RE::S, C::S) where {S<:Tensor{R,3}} where {R<:Real} + @tensor order = (ot, oc) RR[nb, nt] := RE[nb, ot, oc] * C[nt, ot, oc] +end + +""" + | | | + LE -- M -- RE + | | | + -- B -- +""" +function project_ket_on_bra( + LE::S, + B::S, + M::T, + RE::S, +) where {T<:Tensor{R,4},S<:Tensor{R,3}} where {R<:Real} + @tensor order = (ol, lc, oc, or, rc) A[nl, nr, nc] := + LE[ol, nl, lc] * B[ol, or, oc] * M[lc, nc, rc, oc] * RE[or, nr, rc] +end + +""" + LE - - RE + | | | + -- B -- +""" +function project_ket_on_bra( + LE::T, + B::S, + RE::T, +) where {T<:Tensor{R,2},S<:Tensor{R,3}} where {R<:Real} + @tensor order = (ol, or) A[nl, nr, nc] := LE[ol, nl] * B[ol, or, nc] * RE[or, nr] +end + +""" + | | + LE ---- RE -- +""" +function project_ket_on_bra( + LE::T, + RE::S, +) where {T<:Tensor{R,2},S<:Tensor{R,3}} where {R<:Real} + @tensor A[nl, nr, nc] := LE[ol, nl] * RE[ol, nr, nc] +end + +""" + K + | + -- M -- RE + | | + -- B --- +""" +function update_reduced_env_right( + RE::Tensor{R,2}, + m::Int, + M::MpoTensor{R,4}, + B::Tensor{R,3}, +) where {R<:Real} + K = zeros(R, size(M, 2)) + K[m] = one(R) + if typeof(RE) <: CuArray + K = CuArray(K) + end + K = reshape(K, 1, 1, 
size(K, 1)) + for v ∈ M.top + K = contract_tensor3_matrix(K, v) + end + K = dropdims(K, dims = (1, 2)) + + for v ∈ reverse(M.bot) + B = contract_matrix_tensor3(v, B) # TODO do we ever enter here? in mpo layers that we have now, we don't + end + update_reduced_env_right(K, RE, M.ctr, B) +end + +function update_reduced_env_right( + K::Tensor{R,1}, + RE::Tensor{R,2}, + M::Tensor{R,4}, + B::Tensor{R,3}, +) where {R<:Real} + @tensor order = (d, β, γ, α) RE[x, y] := K[d] * M[y, d, β, γ] * B[x, α, γ] * RE[α, β] +end + +function update_reduced_env_right(RR::S, M0::S) where {S<:Tensor{<:Real,2}} + @tensor RR[x, y] := M0[y, z] * RR[x, z] +end + +function contract_tensors43(B::Tensor{R,4}, A::Tensor{R,3}) where {R<:Real} + @matmul C[(x, y), (b, a), z] := sum(σ) B[y, z, a, σ] * A[x, b, σ] +end + +function corner_matrix( + C::S, + M::T, + B::S, +) where {S<:Tensor{R,3},T<:Tensor{R,4}} where {R<:Real} + @tensor order = (rr, mb, mr) Cnew[ll, ml, tt, mt] := + M[ml, mt, mr, mb] * B[ll, rr, mb] * C[rr, tt, mr] +end diff --git a/src/contractions/diagonal.jl b/src/contractions/diagonal.jl new file mode 100644 index 0000000..ce41966 --- /dev/null +++ b/src/contractions/diagonal.jl @@ -0,0 +1,21 @@ +# diagonal.jl: contractions with DiagonalTensor on CPU and CUDA + +function contract_tensor3_matrix(B::Tensor{R,3}, C::DiagonalTensor{R}) where {R<:Real} + @cast B[l, (r, s1), s2] := B[l, r, (s1, s2)] (s2 ∈ 1:size(C.e2, 1)) + B = contract_tensor3_matrix(B, C.e2) + @cast B[l, r, s1, q2] := B[l, (r, s1), q2] (s1 ∈ 1:size(C.e1, 1)) + B = permutedims(B, (1, 2, 4, 3)) + @cast B[l, (r, q2), s1] := B[l, r, q2, s1] + B = contract_tensor3_matrix(B, C.e1) + @cast B[l, r, (q2, q1)] := B[l, (r, q2), q1] (q2 ∈ 1:size(C.e2, 2)) +end + +function contract_matrix_tensor3(C::DiagonalTensor{R}, B::Tensor{R,3}) where {R<:Real} + @cast B[l, (r, s2), s1] := B[l, r, (s2, s1)] (s1 ∈ 1:size(C.e1, 2)) + B = contract_matrix_tensor3(C.e1, B) + @cast B[l, r, s2, q1] := B[l, (r, s2), q1] (s2 ∈ 1:size(C.e2, 2)) + B = permutedims(B, (1, 2, 4, 3)) + @cast B[l, (r, q1), s2] := B[l, r, q1, s2] + B = contract_matrix_tensor3(C.e2, B) + @cast B[l, r, (q1, q2)] := B[l, (r, q1), q2] (q1 ∈ 1:size(C.e1, 1)) +end diff --git a/src/contractions/site.jl b/src/contractions/site.jl new file mode 100644 index 0000000..bae7c6c --- /dev/null +++ b/src/contractions/site.jl @@ -0,0 +1,195 @@ +# site.jl: contractions with SiteTensor on CPU and CUDA + +# TODO make sure slicing is done right, +# cf. https://discourse.julialang.org/t/correct-implementation-of-cuarrays-slicing-operations/90600 + +function contract_sparse_with_three( + lp, + X1::S, + X2::S, + X3::S, + loc_exp::T, + k1::Q, + k2::Q, + k3::Q, + kout::Q, +) where {S<:Tensor{R,3},T<:Tensor{R,1},Q<:Integer} where {R<:Real} + s1, s2, _ = size(X1) + s3, s4, _ = size(X3) + + device = typeof(loc_exp) <: CuArray ? :GPU : :CPU + p1 = get_projector!(lp, k1, device) + p2 = get_projector!(lp, k2, device) + p3 = get_projector!(lp, k3, device) + + total_memory = 2^32 # TODO add better handling for this; also depending on device + batch_size = max( + Int( + floor( + total_memory / + (8 * (s1 * s2 + s2 * s3 + s3 * s4 + s4 * s1 + min(s1 * s3, s2 * s4))), + ), + ), + 1, + ) + batch_size = Int(2^floor(log2(batch_size) + 1e-6)) + + total_size = length(p1) + batch_size = min(batch_size, total_size) + + onGPU = typeof(loc_exp) <: CuArray + out = onGPU ? 
CUDA.zeros(R, size(lp, kout), s1, s4) : zeros(R, size(lp, kout), s1, s4) + + from = 1 + while from <= total_size + to = min(total_size, from + batch_size - 1) + + vp1 = @view p1[from:to] + vp2 = @view p2[from:to] + vp3 = @view p3[from:to] + + X1p = X1[:, :, vp1] + X2p = X2[:, :, vp2] + X3p = X3[:, :, vp3] + + if s1 * s3 < s2 * s4 + Xtmp = batched_mul(X1p, X2p) + outp = batched_mul(Xtmp, X3p) + else + Xtmp = batched_mul(X2p, X3p) + outp = batched_mul(X1p, Xtmp) + end + + le = @view loc_exp[from:to] + outp .*= reshape(le, 1, 1, :) + outpp = reshape(outp, s1 * s4, :) + ipr, rf, rt = sparse(R, lp, kout, device; from, to) + @inbounds out[rf:rt, :, :] .+= reshape(ipr * outpp', :, s1, s4) + from = to + 1 + end + permutedims(out, (2, 3, 1)) +end + +function update_env_left( + LE::S, + A::S, + M::T, + B::S, +) where {S<:Tensor{R,3},T<:SiteTensor{R,4}} where {R<:Real} + contract_sparse_with_three( + M.lp, + permutedims(B, (2, 1, 3)), + LE, + A, + M.loc_exp, + M.projs[[4, 1, 2, 3]]..., + ) +end + +function update_env_right( + RE::S, + A::S, + M::SiteTensor{R,4}, + B::S, +) where {S<:Tensor{R,3}} where {R<:Real} + contract_sparse_with_three( + M.lp, + B, + RE, + permutedims(A, (2, 1, 3)), + M.loc_exp, + M.projs[[4, 3, 2, 1]]..., + ) +end + +function project_ket_on_bra( + LE::S, + B::S, + M::SiteTensor{R,4}, + RE::S, +) where {S<:Tensor{R,3}} where {R<:Real} + contract_sparse_with_three( + M.lp, + permutedims(LE, (2, 1, 3)), + B, + RE, + M.loc_exp, + M.projs[[1, 4, 3, 2]]..., + ) +end + +function update_reduced_env_right( + K::Tensor{R,1}, + RE::Tensor{R,2}, + M::SiteTensor{R,4}, + B::Tensor{R,3}, +) where {R<:Real} + device = typeof(M.loc_exp) <: CuArray ? :GPU : :CPU + s1, s2, _ = size(B) + + p2, p3, p4 = (get_projector!(M.lp, x, device) for x in M.projs[2:4]) + k1 = M.projs[1] + total_memory = 2^32 # TODO add better handling for this; also depending on device + + batch_size = max(Int(floor(total_memory / (8 * (s1 * s2 + s1 + s2 + 1)))), 1) + batch_size = Int(2^floor(log2(batch_size) + 1e-6)) + + out = + typeof(M.loc_exp) <: CuArray ? CUDA.zeros(R, size(M.lp, k1), s1) : + zeros(R, size(M.lp, k1), s1) + RE = reshape(RE, size(RE, 1), 1, size(RE, 2)) + + from = 1 + total_size = length(p4) + while from <= total_size + to = min(total_size, from + batch_size - 1) + vp2 = @view p2[from:to] + vp3 = @view p3[from:to] + vp4 = @view p4[from:to] + + @inbounds Kp = K[vp2] + @inbounds REp = RE[:, :, vp3] + @inbounds Bp = B[:, :, vp4] + le = @view M.loc_exp[from:to] + + outp = dropdims(Bp ⊠ REp, dims = 2) + outp .*= reshape(le .* Kp, 1, :) + + ipr, rf, rt = sparse(R, M.lp, k1, device; from, to) + @inbounds out[rf:rt, :] .+= ipr * outp' + from = to + 1 + end + permutedims(out, (2, 1)) +end + +function contract_tensors43(M::SiteTensor{R,4}, B::Tensor{R,3}) where {R<:Real} + device = typeof(M.loc_exp) <: CuArray ? :GPU : :CPU + p4 = get_projector!(M.lp, M.projs[4], device) + sb1, sb2, _ = size(B) + sm1, sm2, sm3 = size.(Ref(M.lp), M.projs[1:3]) + @inbounds Bp = B[:, :, p4] .* reshape(M.loc_exp, 1, 1, :) + @cast Bp[(x, y), z] := Bp[x, y, z] + ip123 = sparse(R, M.lp, M.projs[1], M.projs[2], M.projs[3], device) + out = reshape(ip123 * Bp', sm1, sm2, sm3, sb1, sb2) + out = permutedims(out, (4, 1, 5, 3, 2)) + reshape(out, sb1 * sm1, sb2 * sm3, sm2) +end + +function corner_matrix( + C::S, + M::T, + B::S, +) where {S<:Tensor{R,3},T<:SiteTensor{R,4}} where {R<:Real} + device = typeof(M.loc_exp) <: CuArray ? 
:GPU : :CPU + projs = [get_projector!(M.lp, x, device) for x in M.projs] + @inbounds Bp = B[:, :, projs[4]] + @inbounds Cp = C[:, :, projs[3]] + outp = Bp ⊠ Cp + outp .*= reshape(M.loc_exp, 1, 1, :) + @cast outp[(x, y), z] := outp[x, y, z] + sm1, sm2 = maximum(projs[1]), maximum(projs[2]) + @inbounds p12 = projs[1] .+ (projs[2] .- 1) .* sm1 + ip12 = sparse(R, p12; mp = sm1 * sm2) + out = reshape(ip12 * outp', sm1, maximum(projs[2]), size(B, 1), size(C, 2)) + permutedims(out, (3, 1, 4, 2)) +end diff --git a/src/contractions/sparse.jl b/src/contractions/sparse.jl new file mode 100644 index 0000000..786585e --- /dev/null +++ b/src/contractions/sparse.jl @@ -0,0 +1,70 @@ +#TODO add support for CuSparseMatrixCSR (cf. https://github.com/JuliaGPU/CUDA.jl/issues/1113) + +# TODO This function is a patch and may not provide any advantage - to be tested +#= +function CUDA.:*(Md::DenseCuMatrix{T}, Mcsr::CUSPARSE.CuSparseMatrixCSR{T}) where T + ret = CUDA.zeros(T, size(Md, 1), size(Mcsr, 2)) + CUSPARSE.mm!('T', 'T', one(T), Mcsr, Md, zero(T), ret, 'O') + ret' +end +=# +# +# TODO shouldn't we have CSR format instead? +function SparseArrays.sparse(::Type{R}, p::CuArray{Int64,1}; mp = nothing) where {R<:Real} + n = length(p) + if isnothing(mp) + mp = maximum(p) + end + cn = CuArray(1:n+1) # aux_cusparse(R, n) + co = CUDA.ones(R, n) + CuSparseMatrixCSR(CuSparseMatrixCSC(cn, p, co, (mp, n))) # TODO: Change when CUDA.jl is fixed +end + +function SparseArrays.sparse(::Type{R}, p::Vector{Int64}; mp = nothing) where {R<:Real} + n = length(p) + if isnothing(mp) + mp = maximum(p) + end + cn = collect(1:n) + co = ones(R, n) + sparse(p, cn, co, mp, n) +end + +@memoize Dict function SparseArrays.sparse( + ::Type{T}, + lp::PoolOfProjectors, + k1::R, + k2::R, + k3::R, + device::Symbol, +) where {T<:Real,R<:Int} + p1 = get_projector!(lp, k1) #, device) + p2 = get_projector!(lp, k2) #, device) + p3 = get_projector!(lp, k3) #, device) + @assert length(p1) == length(p2) == length(p3) + s1, s2, s3 = size(lp, k1), size(lp, k2), size(lp, k3) + p = p1 .+ s1 * (p2 .- 1) .+ s1 * s2 * (p3 .- 1) + if device == :GPU + p = CuArray(p) + end + sparse(T, p; mp = s1 * s2 * s3) +end + +@memoize Dict function SparseArrays.sparse( + ::Type{R}, + lp::PoolOfProjectors, + k::Int, + device::Symbol; + from::Int = 1, + to::Int = length(lp, k), +) where {R<:Real} + p = get_projector!(lp, k) + pp = @view p[from:to] + rf = minimum(pp) + rt = maximum(pp) + if device == :GPU + pp = CuArray(pp) + end + ipr = sparse(R, pp .- (rf - 1)) + (ipr, rf, rt) +end diff --git a/src/contractions/virtual.jl b/src/contractions/virtual.jl new file mode 100644 index 0000000..78d6f57 --- /dev/null +++ b/src/contractions/virtual.jl @@ -0,0 +1,434 @@ +# virtual.jl: contractions with VirtualTensor on CPU and CUDA +# export update_env_left2, update_env_right2, project_ket_on_bra2 + +# @memoize Dict +alloc_undef(R, onGPU, shape) = onGPU ? CuArray{R}(undef, shape) : Array{R}(undef, shape) +alloc_zeros(R, onGPU, shape) = onGPU ? 
CUDA.zeros(R, shape) : zeros(R, shape) + +function proj_out(lp, k1, k2, k3, device) + p1 = get_projector!(lp, k1, device) + p2 = get_projector!(lp, k2, device) + p3 = get_projector!(lp, k3, device) + @assert length(p1) == length(p2) == length(p3) + s1, s2 = size(lp, k1), size(lp, k2) + p1 .+ s1 * (p2 .- 1) .+ s1 * s2 * (p3 .- 1) +end + +function proj_2step_12(lp, (k1, k2), k3, device) + p1 = get_projector!(lp, k1, :CPU) + p2 = get_projector!(lp, k2, :CPU) + p3 = get_projector!(lp, k3, device) + @assert length(p1) == length(p2) == length(p3) + s1 = size(lp, k1) + + p12, transitions_matrix = rank_reveal(hcat(p1, p2), :PE) + (p1, p2) = Tuple(Array(t) for t ∈ eachcol(transitions_matrix)) + + s12 = maximum(p12) + + if device == :CPU + p12 = CuArray(p12) + p1 = CuArray(p1) + p2 = CuArray(p2) + end + + pf1 = p12 .+ s12 .* (p3 .- 1) + pf2 = p1 .+ s1 .* (p2 .- 1) + + pf1, pf2, s12 +end + +function proj_2step_23(lp, k1, (k2, k3), device) + p1 = get_projector!(lp, k1, device) + p2 = get_projector!(lp, k2, :CPU) + p3 = get_projector!(lp, k3, :CPU) + @assert length(p1) == length(p2) == length(p3) + + s1, s2 = size(lp, k1), size(lp, k2) + + p23, transitions_matrix = rank_reveal(hcat(p2, p3), :PE) + (p2, p3) = Tuple(Array(t) for t ∈ eachcol(transitions_matrix)) + + s23 = maximum(p23) + + if device == :CPU + p23 = CuArray(p23) + p2 = CuArray(p2) + p3 = CuArray(p3) + end + + pf1 = p1 .+ s1 .* (p23 .- 1) + pf2 = p2 .+ s2 .* (p3 .- 1) + + pf1, pf2, s23 +end + +function merge_projectors_inter(lp, p1, p2, p3, onGPU; order = "1_23") + s1 = size(lp, p1) + s2 = size(lp, p2) + device = onGPU ? :GPU : :CPU + p1 = get_projector!(lp, p1, device) + p2 = get_projector!(lp, p2, :CPU) + p3 = get_projector!(lp, p3, :CPU) + + p23, transitions_matrix = rank_reveal(hcat(p2, p3), :PE) + s23 = maximum(p23) + (p2, p3) = Tuple(Array(t) for t ∈ eachcol(transitions_matrix)) + if onGPU + p23 = CuArray(p23) + p2 = CuArray(p2) + p3 = CuArray(p3) + end + p2_3 = p2 .+ s2 .* (p3 .- 1) + p123 = order == "1_23" ? 
p1 .+ s1 .* (p23 .- 1) : p23 .+ s23 .* (p1 .- 1) # else "23_1" + p123, p2_3, s23 +end + +function update_env_left( + LE::S, + A::S, + M::VirtualTensor{R,4}, + B::S, +) where {S<:Tensor{R,3}} where {R<:Real} + p_lb, p_lc, p_lt, p_rb, p_rc, p_rt = M.projs + slb, srb = size(B, 1), size(B, 2) + slt, srt = size(A, 1), size(A, 2) + src = length(M.lp, p_rc) + + slpb, slpc, slpt = size(M.lp, p_lb), size(M.lp, p_lc), size(M.lp, p_lt) + srpb, srpc, srpt = size(M.lp, p_rb), size(M.lp, p_rc), size(M.lp, p_rt) + + onGPU = typeof(LE) <: CuArray + + A = reshape(A, (slt, srt, slpt, srpt)) + B = reshape(B, (slb, srb, slpb, srpb)) + Lout = alloc_zeros(R, onGPU, (srb, srt, src)) + + if slpb * srpt >= slpt * srpb + pl_b_ct, pl_c_t, slpct = + merge_projectors_inter(M.lp, p_lb, p_lc, p_lt, onGPU; order = "1_23") + pr_bc_t, pr_b_c, srpbc = + merge_projectors_inter(M.lp, p_rt, p_rb, p_rc, onGPU; order = "23_1") + + B2 = permutedims(B, (1, 3, 2, 4)) # [lb, lpb, rb, rpb] + B2 = reshape(B2, (slb * slpb, srb * srpb)) # [(lb, lpb), (rb, rpb)] + + tmp1 = alloc_zeros(R, onGPU, (slb, slpb * slpct)) + tmp2 = alloc_undef(R, onGPU, (srb * srpb, slpct)) + tmp3 = alloc_zeros(R, onGPU, (srb * srpb, slpt * slpc)) + tmp5 = alloc_undef(R, onGPU, (srb * srpb, srpc, slpt)) + tmp8 = alloc_undef(R, onGPU, (srb * srpbc, srpt)) + + for ilt ∈ 1:slt + tmp1[:, pl_b_ct] = (@view LE[:, ilt, :]) # [lb, (lpb, lpct)] + mul!(tmp2, B2', reshape(tmp1, (slb * slpb, slpct))) # [(rb, rpb), lpct] + tmp3[:, pl_c_t] = tmp2 # [(rb, rpb), (lpc, lpt)] + tmp4 = reshape(tmp3, (srb * srpb, slpc, slpt)) # [(rb, rpb), lpc, lpt] + batched_mul!(tmp5, tmp4, M.con) + tmp6 = reshape(tmp5, (srb, srpb * srpc, slpt)) # [rb, (rpb, rpc), lpt] + tmp7 = reshape(tmp6[:, pr_b_c, :], (srb * srpbc, slpt)) # [(rb, rpbc), lpt] + for irt ∈ 1:srt + mul!(tmp8, tmp7, (@view A[ilt, irt, :, :])) + tmp9 = reshape(tmp8, (srb, srpbc * srpt)) + Lout[:, irt, :] .+= tmp9[:, pr_bc_t] # [rb, rc] + end + end + else + pl_t_cb, pl_c_b, slpcb = + merge_projectors_inter(M.lp, p_lt, p_lc, p_lb, onGPU; order = "1_23") + pr_tc_b, pr_t_c, srptc = + merge_projectors_inter(M.lp, p_rb, p_rt, p_rc, onGPU; order = "23_1") + + A2 = permutedims(A, (1, 3, 2, 4)) # [lt, lpt, rt, rpt] + A2 = reshape(A2, (slt * slpt, srt * srpt)) # [(lt, lpt), (rt, rpt)] + + tmp1 = alloc_zeros(R, onGPU, (slt, slpt * slpcb)) + tmp2 = alloc_undef(R, onGPU, (srt * srpt, slpcb)) + tmp3 = alloc_zeros(R, onGPU, (srt * srpt, slpc * slpb)) + tmp5 = alloc_undef(R, onGPU, (srt * srpt, srpc, slpb)) + tmp8 = alloc_zeros(R, onGPU, (srt * srptc, srpb)) + + for ilb ∈ 1:slb + tmp1[:, pl_t_cb] = (@view LE[ilb, :, :]) # [lt, (lpt, lpcb)] + mul!(tmp2, A2', reshape(tmp1, (slt * slpt, slpcb))) # [(rt, rpt), lpcb] + tmp3[:, pl_c_b] = tmp2 + tmp4 = reshape(tmp3, (srt * srpt, slpc, slpb)) # [(rt, rpt), lpc, lpb] + batched_mul!(tmp5, tmp4, M.con) # [(rt, rpt), lpb, rpc] + tmp6 = reshape(tmp5, (srt, srpt * srpc, slpb)) # [(rt, rpt * rpc), lcb] + tmp7 = reshape(tmp6[:, pr_t_c, :], (srt * srptc, slpb)) # [(rt, rptc), lpb] + for irb ∈ 1:srb + mul!(tmp8, tmp7, (@view B[ilb, irb, :, :])) + tmp9 = reshape(tmp8, (srt, srptc * srpb)) + Lout[irb, :, :] .+= tmp9[:, pr_tc_b] # [rt, rc] + end + end + end + Lout +end + + +function project_ket_on_bra( + LE::S, + B::S, + M::VirtualTensor{R,4}, + RE::S, +) where {S<:Tensor{R,3}} where {R<:Real} + p_lb, p_lc, p_lt, p_rb, p_rc, p_rt = M.projs + slb, slt = size(LE, 1), size(LE, 2) + srb, srt = size(RE, 1), size(RE, 2) + slpb, slpc, slpt = size(M.lp, p_lb), size(M.lp, p_lc), size(M.lp, p_lt) + srpb, srpc, srpt = size(M.lp, 
p_rb), size(M.lp, p_rc), size(M.lp, p_rt) + + onGPU = typeof(LE) <: CuArray + + B = reshape(B, (slb, srb, slpb, srpb)) + B2 = permutedims(B, (1, 3, 2, 4)) # [lb, lpb, rb, rpb] + B2 = reshape(B2, (slb * slpb, srb * srpb)) # [(lb, lpb), (rb, rpb)] + LR = alloc_zeros(R, onGPU, (slt, srt, slpt, srpt)) + + if slpb >= srpb + pl_b_ct, pl_c_t, slpct = + merge_projectors_inter(M.lp, p_lb, p_lc, p_lt, onGPU; order = "1_23") + pr_bc_t, pr_b_c, srpbc = + merge_projectors_inter(M.lp, p_rt, p_rb, p_rc, onGPU; order = "23_1") + + tmp1 = alloc_zeros(R, onGPU, (slb, slpb * slpct)) + tmp2 = alloc_undef(R, onGPU, (srb * srpb, slpct)) + tmp3 = alloc_zeros(R, onGPU, (srb * srpb, slpc * slpt)) + tmp5 = alloc_undef(R, onGPU, (srb * srpb, srpc, slpt)) + tmp8 = alloc_zeros(R, onGPU, (srb, srpbc * srpt)) + for ilt ∈ 1:slt + tmp1[:, pl_b_ct] = (@view LE[:, ilt, :]) # [lb, (lpb, lpct)] + mul!(tmp2, B2', reshape(tmp1, (slb * slpb, slpct))) # [(rb, rpb), lpct] + tmp3[:, pl_c_t] = tmp2 # [(rb, rpb), (lpc, lpt)] + tmp4 = reshape(tmp3, (srb * srpb, slpc, slpt)) # [(rb, rpb), lpc, lpt] + batched_mul!(tmp5, tmp4, M.con) # [(rb, rpb), rpc, lpt] + tmp6 = reshape(tmp5, (srb, srpb * srpc, slpt)) # [rb, (rpb, rpc), lpt] + tmp7 = reshape(tmp6[:, pr_b_c, :], (srb * srpbc, slpt)) # [(rb, rpbc), lpt] + for irt ∈ 1:srt + tmp8[:, pr_bc_t] = (@view RE[:, irt, :]) # [rb, (rpbc, rpt)] + LR[ilt, irt, :, :] = tmp7' * reshape(tmp8, (srb * srpbc, srpt)) # [lpt, rpt] + end + end + else + pr_b_ct, pr_c_t, srpct = + merge_projectors_inter(M.lp, p_rb, p_rc, p_rt, onGPU; order = "1_23") + pl_bc_t, pl_b_c, slpbc = + merge_projectors_inter(M.lp, p_lt, p_lb, p_lc, onGPU; order = "23_1") + + tmp1 = alloc_zeros(R, onGPU, (srb, srpb * srpct)) + tmp2 = alloc_undef(R, onGPU, (slb * slpb, srpct)) + tmp3 = alloc_zeros(R, onGPU, (slb * slpb, srpc * srpt)) + tmp5 = alloc_undef(R, onGPU, (slb * slpb, slpc, srpt)) + tmp8 = alloc_zeros(R, onGPU, (slb, slpbc * slpt)) + for irt ∈ 1:srt + tmp1[:, pr_b_ct] = (@view RE[:, irt, :]) # [rb, (rpb, rpct)] + mul!(tmp2, B2, reshape(tmp1, (srb * srpb, srpct))) # [(lb, lpb), rpct] + tmp3[:, pr_c_t] = tmp2 # [(lb, lpb), (rpc, rpt)] + tmp4 = reshape(tmp3, (slb * slpb, srpc, srpt)) # [(lb, lpb), rpc, rpt] + batched_mul!(tmp5, tmp4, M.con') # [(lb, lpb), lpc, rpt] + tmp6 = reshape(tmp5, (slb, slpb * slpc, srpt)) # [lb, (lpb, lpc), rpt] + tmp7 = reshape(tmp6[:, pl_b_c, :], (slb * slpbc, srpt)) # [(lb, lpbc), rpt] + for ilt ∈ 1:slt + tmp8[:, pl_bc_t] = (@view LE[:, ilt, :]) # [lb, (lpbc, lpt)] + LR[ilt, irt, :, :] = reshape(tmp8, (slb * slpbc, slpt))' * tmp7 # [lct, rct] + end + end + end + reshape(LR, (slt, srt, slpt * srpt)) +end + + +function update_env_right( + RE::S, + A::S, + M::VirtualTensor{R,4}, + B::S, +) where {S<:Tensor{R,3}} where {R<:Real} + p_lb, p_lc, p_lt, p_rb, p_rc, p_rt = M.projs + slb, srb = size(B, 1), size(B, 2) + slt, srt = size(A, 1), size(A, 2) + slc = length(M.lp, p_lc) + + slpb, slpc, slpt = size(M.lp, p_lb), size(M.lp, p_lc), size(M.lp, p_lt) + srpb, srpc, srpt = size(M.lp, p_rb), size(M.lp, p_rc), size(M.lp, p_rt) + + onGPU = typeof(RE) <: CuArray + + A = reshape(A, (slt, srt, slpt, srpt)) + B = reshape(B, (slb, srb, slpb, srpb)) + Rout = alloc_zeros(R, onGPU, (slb, slt, slc)) + + if srpb * slpt >= srpt * slpb + B2 = permutedims(B, (1, 3, 2, 4)) # [lb, lpb, rb, rpb] + B2 = reshape(B2, (slb * slpb, srb * srpb)) # [(lb, lpb), (rb, rpb)] + + pr_b_ct, pr_c_t, srpct = + merge_projectors_inter(M.lp, p_rb, p_rc, p_rt, onGPU; order = "1_23") + pl_bc_t, pl_b_c, slpbc = + merge_projectors_inter(M.lp, p_lt, 
p_lb, p_lc, onGPU; order = "23_1") + + tmp1 = alloc_zeros(R, onGPU, (srb, srpb * srpct)) + tmp2 = alloc_undef(R, onGPU, (slb * slpb, srpct)) + tmp3 = alloc_zeros(R, onGPU, (slb * slpb, srpc, srpt)) + tmp5 = alloc_undef(R, onGPU, (slb * slpb, slpc, srpt)) + tmp8 = alloc_undef(R, onGPU, (slb * slpbc, slpt)) + + for irt ∈ 1:srt + tmp1[:, pr_b_ct] = (@view RE[:, irt, :]) # [rb, (rpb, rpct)] + mul!(tmp2, B2, reshape(tmp1, (srb * srpb, srpct))) # [(lb, lpb), rpct] + tmp3[:, pr_c_t] = tmp2 # [(lb, lpb), (rpc, rpt)] + tmp4 = reshape(tmp3, (slb * slpb, srpc, srpt)) # [(lb, lpb), rpc, rpt] + batched_mul!(tmp5, tmp4, M.con') + tmp6 = reshape(tmp5, (slb, slpb * slpc, srpt)) # [lb, (lpb, lpc), rpt] + tmp7 = reshape(tmp6[:, pl_b_c, :], (slb * slpbc, srpt)) # [(lb, lpbc), rpt] + for ilt ∈ 1:slt + mul!(tmp8, tmp7, (@view A[ilt, irt, :, :])') + tmp9 = reshape(tmp8, (slb, slpbc * slpt)) + Rout[:, ilt, :] .+= tmp9[:, pl_bc_t] + end + end + else + A2 = permutedims(A, (1, 3, 2, 4)) # [lt, lpt, rt, rpt] + A2 = reshape(A2, (slt * slpt, srt * srpt)) # [(lt, lpt), (rt, rpt)] + + pr_t_cb, pr_c_b, srpcb = + merge_projectors_inter(M.lp, p_rt, p_rc, p_rb, onGPU; order = "1_23") + pl_tc_b, pl_t_c, slptc = + merge_projectors_inter(M.lp, p_lb, p_lt, p_lc, onGPU; order = "23_1") + + tmp1 = alloc_zeros(R, onGPU, (srt, srpt * srpcb)) + tmp2 = alloc_undef(R, onGPU, (slt * slpt, srpcb)) + tmp3 = alloc_zeros(R, onGPU, (slt * slpt, srpc * srpb)) + tmp5 = alloc_undef(R, onGPU, (slt * slpt, slpc, srpb)) + tmp8 = alloc_undef(R, onGPU, (slt * slptc, slpb)) + for irb ∈ 1:srb + tmp1[:, pr_t_cb] = (@view RE[irb, :, :]) # [rt, (rpt, rpcb)] + mul!(tmp2, A2, reshape(tmp1, (srt * srpt, srpcb))) # [(lt, lpt), rpcb] + tmp3[:, pr_c_b] = tmp2 # [(lt, lpt), (rpc, rpb)] + tmp4 = reshape(tmp3, (slt * slpt, srpc, srpb)) # [(lt, lpt), rpc, rpb] + batched_mul!(tmp5, tmp4, M.con') # [(lt, lpt), lpc, rpb] + tmp6 = reshape(tmp5, (slt, slpt * slpc, srpb)) # [lt, (lpt, lpc), rpb] + tmp7 = reshape(tmp6[:, pl_t_c, :], (slt * slptc, srpb)) # [(lb, lptc), rpb] + for ilb ∈ 1:slb + mul!(tmp8, tmp7, (@view B[ilb, irb, :, :])') + tmp9 = reshape(tmp8, (slt, slptc * slpb)) + Rout[ilb, :, :] .+= tmp9[:, pl_tc_b] + end + end + end + Rout +end + + +function update_reduced_env_right( + K::Tensor{R,1}, + RE::Tensor{R,2}, + M::VirtualTensor{R,4}, + B::Tensor{R,3}, +) where {R<:Real} + p_lb, p_lc, p_lt, p_rb, p_rc, p_rt = M.projs + + slb, srb = size(B, 1), size(B, 2) + slpb, slpc, slpt = size(M.lp, p_lb), size(M.lp, p_lc), size(M.lp, p_lt) + srpb, srpc, srpt = size(M.lp, p_rb), size(M.lp, p_rc), size(M.lp, p_rt) + + onGPU = typeof(RE) <: CuArray + + K = reshape(K, (slpt, srpt)) # [lct, rct] + B = reshape(B, (slb, srb, slpb, srpb)) # [lb, rb, lpb, rpb] + B2 = permutedims(B, (1, 3, 2, 4)) # [lb, lpb, rb, rpb] + B2 = reshape(B2, (slb * slpb, srb * srpb)) # [(lb, lpb), (rb, rpb)] + + if srpb * slpt >= srpt * slpb + pr_b_ct, pr_c_t, srpct = + merge_projectors_inter(M.lp, p_rb, p_rc, p_rt, onGPU; order = "1_23") + pl_bc_t, pl_b_c, slpbc = + merge_projectors_inter(M.lp, p_lt, p_lb, p_lc, onGPU; order = "23_1") + + tmp1 = alloc_zeros(R, onGPU, (srb, srpb * srpct)) + tmp4 = alloc_zeros(R, onGPU, (slb * slpb, srpc * srpt)) + tmp6 = alloc_undef(R, onGPU, (slb * slpb, slpc, srpt)) + + tmp1[:, pr_b_ct] = RE # [rb, (rpb, rpct)] + tmp2 = reshape(tmp1, (srb * srpb, srpct)) # [(rb, rpb), rpct] + tmp3 = B2 * tmp2 # [(lb, lpb), rpct] + tmp4[:, pr_c_t] = tmp3 + tmp5 = reshape(tmp4, (slb * slpb, srpc, srpt)) + batched_mul!(tmp6, tmp5, M.con') # [(lb, lpb), lpc, rpt] + tmp7 = reshape(tmp6, 
(slb, slpb * slpc, srpt)) + tmp8 = tmp7[:, pl_b_c, :] # [lb, lpbc, rpt] + tmp9 = reshape(tmp8, (slb * slpbc, srpt)) * K' # [(lb, lpbc), lpt] + tmp10 = reshape(tmp9, (slb, slpbc * slpt)) + Rtemp = tmp10[:, pl_bc_t] + else + pr_bc_t, pr_b_c, srpbc = + merge_projectors_inter(M.lp, p_rt, p_rb, p_rc, onGPU; order = "23_1") + pl_b_ct, pl_c_t, slpct = + merge_projectors_inter(M.lp, p_lb, p_lc, p_lt, onGPU; order = "1_23") + + tmp1 = alloc_zeros(R, onGPU, (srb, srpbc * srpt)) + tmp4 = alloc_zeros(R, onGPU, (srb, srpb * srpc, slpt)) + tmp6 = alloc_undef(R, onGPU, (srb * srpb, slpc, slpt)) + + tmp1[:, pr_bc_t] = RE # [rb, (rpbc, rpt)] + tmp2 = reshape(tmp1, (srb * srpbc, srpt)) # [(rb, rpbc), rpt] + tmp3 = reshape(tmp2 * K', (srb, srpbc, slpt)) # [rb, rpbc, lpt] + tmp4[:, pr_b_c, :] = tmp3 + tmp5 = reshape(tmp4, (srb * srpb, srpc, slpt)) + batched_mul!(tmp6, tmp5, M.con') # [(rb, rpb), lpc, lpt] + tmp7 = reshape(tmp6, (srb * srpb, slpc * slpt)) # [(rb, rpb), (lpc, lpt)] + tmp8 = tmp7[:, pl_c_t] + tmp9 = B2 * tmp8 # [(lb, lpb), lpct] + tmp10 = reshape(tmp9, (slb, slpb * slpct)) + Rtemp = tmp10[:, pl_b_ct] + end + Rtemp +end + + +function contract_tensors43(M::VirtualTensor{R,4}, B::Tensor{R,3}) where {R<:Real} + p_lb, p_l, p_lt, p_rb, p_r, p_rt = M.projs + + slb, srb = size(B, 1), size(B, 2) + slcb, slc, slct = size(M.lp, p_lb), size(M.lp, p_l), size(M.lp, p_lt) + srcb, src, srct = size(M.lp, p_rb), size(M.lp, p_r), size(M.lp, p_rt) + slcp, srcp = length(M.lp, p_l), length(M.lp, p_r) + + B = reshape(B, (slb, srb, slcb, srcb)) + + pls = sparse(R, M.lp, p_lb, p_l, p_lt, :CPU) + pls = typeof(B) <: CuArray ? CuArray(pls) : Array(pls) + pls = reshape(pls, (slcb, slc, slct * slcp)) + pls = permutedims(pls, (3, 1, 2)) # [(slct, slcp), lcb, lc] + + prs = sparse(R, M.lp, p_rb, p_r, p_rt, :CPU) + prs = typeof(B) <: CuArray ? CuArray(prs) : Array(prs) + prs = reshape(prs, (srcb, src, srct * srcp)) + prs = permutedims(prs, (3, 1, 2)) # [(rct, rcp), rcb, rc] + + if size(M.con, 1) <= size(M.con, 2) + prs = contract_matrix_tensor3(M.con, prs) + else + pls = contract_tensor3_matrix(pls, M.con) + end + @tensor order = (lb, c, rb) MB[l, lt, r, rt] := + pls[lt, lb, c] * prs[rt, rb, c] * B[l, r, lb, rb] + MB = reshape(MB, slb, slct, slcp, srb, srct, srcp) + MB = permutedims(MB, (1, 3, 4, 6, 2, 5)) + reshape(MB, (slb * slcp, srb * srcp, slct * srct)) +end + +function corner_matrix( + C::S, + M::T, + B::S, +) where {S<:Tensor{R,3},T<:VirtualTensor{R,4}} where {R<:Real} + slb, srb = size(B, 1), size(B, 2) + srcc, stc = size(C, 2), size(C, 3) + V = contract_tensors43(M, B) + vl, vr, vt = size(V, 1), size(V, 2), size(V, 3) + V = reshape(V, (vl, srb, stc, vt)) + @tensor Cnew[vl, vt, vrr] := V[vl, srb, stc, vt] * C[srb, vrr, stc] + reshape(Cnew, (slb, :, srcc, vt)) +end diff --git a/src/environment.jl b/src/environment.jl new file mode 100644 index 0000000..b18caf5 --- /dev/null +++ b/src/environment.jl @@ -0,0 +1,293 @@ +export Environment, EnvironmentMixed, left_nbrs_site, right_nbrs_site + +abstract type AbstractEnvironment end + +mutable struct EnvironmentMixed{T<:Real} <: AbstractEnvironment + bra::QMps{T} # mps that is to be optimized + mpo::QMpo{T} + ket::QMps{T} + C::Tensor{T,3} + site::Any # position of C is at: site - epsilon ::Union(Sites, :central) + env::Dict + onGPU::Bool + + function EnvironmentMixed( + bra::QMps{T}, + C::Tensor{T,3}, + mpo::QMpo{T}, + ket::QMps{T}; + ) where {T<:Real} + onGPU = bra.onGPU && mpo.onGPU && ket.onGPU + @assert bra.sites == ket.sites && issubset(bra.sites, mpo.sites) + id3 = onGPU ? 
CUDA.ones(T, 1, 1, 1) : ones(T, 1, 1, 1) + id2 = onGPU ? CUDA.ones(T, 1, 1) : ones(T, 1, 1) + env0 = Dict{Any,Any}((bra.sites[1], :left) => id2, (bra.sites[end], :right) => id3) + env = new{T}(bra, mpo, ket, C, last(bra.sites) + 1, env0, onGPU) # + update_env_left!.(Ref(env), env.bra.sites) + env + end +end + +function clear_env_containing_site!(env::EnvironmentMixed, site) + if site == :central + delete!(env.env, (env.site, :left)) + delete!(env.env, (left_nbrs_site(env.site, env.ket.sites), :right)) + else + if site == env.site + delete!(env.env, (:central, :right)) + else + delete!(env.env, (left_nbrs_site(site, env.ket.sites), :right)) + end + rs = right_nbrs_site(site, env.ket.sites) + if rs == env.site + delete!(env.env, (:central, :left)) + else + delete!(env.env, (rs, :left)) + end + end +end + +mutable struct Environment{T<:Real} <: AbstractEnvironment + bra::QMps{T} # mps that is to be optimized + mpo::QMpo{T} + ket::QMps{T} + env::Dict + log_norms::Dict + + function Environment(bra::QMps{T}, mpo::QMpo{T}, ket::QMps{T}) where {T<:Real} + onGPU = bra.onGPU && mpo.onGPU && ket.onGPU + @assert bra.sites == ket.sites && issubset(bra.sites, mpo.sites) + id = onGPU ? CUDA.ones(T, 1, 1, 1) : ones(T, 1, 1, 1) + env0 = Dict((bra.sites[1], :left) => id, (bra.sites[end], :right) => id) + ln0 = Dict((bra.sites[1], :left) => zero(T), (bra.sites[end], :right) => zero(T)) + env = new{T}(bra, mpo, ket, env0, ln0) + update_env_left!.(Ref(env), env.bra.sites) + env + end +end + +function clear_env_containing_site!(env::Environment, site::Site) + delete!(env.env, (left_nbrs_site(site, env.ket.sites), :right)) + delete!(env.env, (right_nbrs_site(site, env.ket.sites), :left)) +end + +""" +Largest x in sites: x < site +""" +function left_nbrs_site(site::Site, sites) + ls = filter(i -> i < site, sites) + isempty(ls) && return -Inf + maximum(ls) +end + +""" +Smallest x in sites: x > site +""" +function right_nbrs_site(site::Site, sites) + ms = filter(i -> i > site, sites) + isempty(ms) && return Inf + minimum(ms) +end + +""" + -- A -- + | | + L = LE -- M -- + | | + -- B -- +""" +function update_env_left( + LE::S, + A::S, + M::T, + B::S, +) where {S<:AbstractArray{R,3},T<:MpoTensor{R,4}} where {R<:Real} + for v ∈ M.top + A = contract_tensor3_matrix(A, v) + end + for v ∈ reverse(M.bot) + B = contract_matrix_tensor3(v, B) + end + update_env_left(LE, A, M.ctr, B) +end + +function update_env_left!(env::Environment, site::Site) + site <= first(env.bra.sites) && return + ls = left_nbrs_site(site, env.bra.sites) + LL = update_env_left(env.env[(ls, :left)], env.bra[ls], env.mpo[ls], env.ket[ls]) + rs = right_nbrs_site(ls, env.mpo.sites) + while rs < site + LL = contract_tensor3_matrix(LL, env.mpo[rs]) + rs = right_nbrs_site(rs, env.mpo.sites) + end + nLL = maximum(abs.(LL)) + LL ./= nLL + push!(env.env, (site, :left) => LL) + nLL = env.log_norms[(ls, :left)] + log(nLL) + push!(env.log_norms, (site, :left) => nLL) +end + +function update_env_left!(env::EnvironmentMixed{T}, site) where {T} # site::Union(Sites, :central) + if site == first(env.bra.sites) + if env.site == first(env.bra.sites) + LL = env.onGPU ? CUDA.ones(T, 1, 1, 1) : ones(T, 1, 1, 1) + else + LL = env.onGPU ? CUDA.ones(T, 1, 1) : ones(T, 1, 1) + end + elseif site == :central + if env.site == first(env.bra.sites) + LL = env.onGPU ? 
CUDA.ones(T, 1, 1) : ones(T, 1, 1) + else + ls = left_nbrs_site(env.site, env.bra.sites) + LL = update_env_left(env.env[(ls, :left)], env.bra[ls], env.ket[ls]) + LL ./= maximum(abs.(LL)) + end + elseif site < env.site + ls = left_nbrs_site(site, env.bra.sites) + LL = update_env_left(env.env[(ls, :left)], env.bra[ls], env.ket[ls]) + LL ./= maximum(abs.(LL)) + elseif site == env.site + ls = left_nbrs_site(site, env.bra.sites) + LL = update_env_left(env.env[(:central, :left)], env.C) + LL ./= maximum(abs.(LL)) + else + ls = left_nbrs_site(site, env.bra.sites) + LL = update_env_left(env.env[(ls, :left)], env.bra[ls], env.mpo[ls], env.ket[ls]) + rs = right_nbrs_site(ls, env.mpo.sites) + while rs < site + LL = contract_tensor3_matrix(LL, env.mpo[rs]) + rs = right_nbrs_site(rs, env.mpo.sites) + end + LL ./= maximum(abs.(LL)) + end + push!(env.env, (site, :left) => LL) +end + + +""" + -- A -- + | | + R = -- M -- RE + | | + -- B -- +""" +function update_env_right( + RE::S, + A::S1, + M::T, + B::S, +) where {T<:MpoTensor{R,4},S<:AbstractArray{R,3},S1<:AbstractArray{R,3}} where {R<:Real} + for v ∈ M.top + A = contract_tensor3_matrix(A, v) + end + for v ∈ reverse(M.bot) + B = contract_matrix_tensor3(v, B) + end + update_env_right(RE, A, M.ctr, B) +end + +function update_env_right!(env::Environment, site::Site) + site >= last(env.bra.sites) && return + rs = right_nbrs_site(site, env.bra.sites) + RR = update_env_right(env.env[(rs, :right)], env.bra[rs], env.mpo[rs], env.ket[rs]) + ls = left_nbrs_site(rs, env.mpo.sites) + while ls > site + RR = contract_matrix_tensor3(env.mpo[ls], RR) + ls = left_nbrs_site(ls, env.mpo.sites) + end + nRR = maximum(abs.(RR)) + RR ./= nRR + push!(env.env, (site, :right) => RR) + nRR = env.log_norms[(rs, :right)] + log(nRR) + push!(env.log_norms, (site, :right) => nRR) +end + +function update_env_right!(env::EnvironmentMixed{T}, site) where {T} # site::Union(Sites, :central) + if site == last(env.bra.sites) + if env.site > last(env.bra.sites) + RR = env.onGPU ? CUDA.ones(T, 1, 1) : ones(T, 1, 1) + else + RR = env.onGPU ? 
CUDA.ones(T, 1, 1, 1) : ones(T, 1, 1, 1) + end + elseif site == :central + rs = env.site + RR = update_env_right(env.env[(rs, :right)], env.bra[rs], env.mpo[rs], env.ket[rs]) + elseif site >= env.site + rs = right_nbrs_site(site, env.bra.sites) + RR = update_env_right(env.env[(rs, :right)], env.bra[rs], env.mpo[rs], env.ket[rs]) + ls = left_nbrs_site(rs, env.mpo.sites) + while ls > site + RR = contract_matrix_tensor3(env.mpo[ls], RR) + ls = left_nbrs_site(ls, env.mpo.sites) + end + else + rs = right_nbrs_site(site, env.bra.sites) + if rs == env.site + RR = update_env_right(env.env[(:central, :right)], env.C) + else + RR = update_env_right(env.env[(rs, :right)], env.bra[rs], env.ket[rs]) + end + end + RR ./= maximum(abs.(RR)) + push!(env.env, (site, :right) => RR) +end + + +""" + | | | + LE -- M -- RE + | | | + -- B -- +""" +function project_ket_on_bra( + LE::S, + B::S, + M::T, + RE::S, +) where {S<:AbstractArray{R,3},T<:MpoTensor{R,4}} where {R<:Real} + for v ∈ reverse(M.bot) + B = contract_matrix_tensor3(v, B) + end + B = project_ket_on_bra(LE, B, M.ctr, RE) + for v ∈ reverse(M.top) + B = contract_matrix_tensor3(v, B) + end + B +end + + +project_ket_on_bra(env::Environment, site::Site) = project_ket_on_bra( + env.env[(site, :left)], + env.ket[site], + env.mpo[site], + env.env[(site, :right)], +) + +function project_ket_on_bra(env::EnvironmentMixed, site) + if site == :central + B = project_ket_on_bra(env.env[(site, :left)], env.env[(site, :right)]) + elseif site >= env.site + B = project_ket_on_bra( + env.env[(site, :left)], + env.ket[site], + env.mpo[site], + env.env[(site, :right)], + ) + else + B = project_ket_on_bra( + env.env[(site, :left)], + env.ket[site], + env.env[(site, :right)], + ) + end + B +end + +function measure_env(env::Environment, site::Site) + L = update_env_left(env.env[(site, :left)], env.bra[site], env.mpo[site], env.ket[site]) + R = env.env[(site, :right)] + overlap = @tensor L[b, t, c] * R[b, t, c] + negative = overlap < 0 + overlap *= sign(overlap) + (log(overlap) + env.log_norms[(site, :left)] + env.log_norms[(site, :right)], negative) +end diff --git a/src/gauges.jl b/src/gauges.jl new file mode 100644 index 0000000..c55cc78 --- /dev/null +++ b/src/gauges.jl @@ -0,0 +1,155 @@ + +# gauges.jl: This file provides basic functions to optimize gauges for the PEPS network. CUDA is supported. 
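+# Usage sketch (assumes `ψ_top` and `ψ_bot` are two QMps defined on the same sites;
+# the names are illustrative):
+#
+#   gauges = optimize_gauges_for_overlaps!!(ψ_top, ψ_bot, 1E-8, 4)
+#   gauges[site]   # accumulated diagonal gauge factors at `site`
+#
+# Each sweep compares the local density-matrix diagonals of the two states, rescales the
+# corresponding legs by X = (diag(ρ_bot) ./ diag(ρ_top)) .^ (1/4) on ψ_top and by 1 ./ X
+# on ψ_bot, and repeats until the relative change of the overlap drops below `tol`.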
+ +export optimize_gauges_for_overlaps!!, overlap_density_matrix + +function update_rq!(ψ::QMps{T}, AT::Array{T,3}, i::Site) where {T<:Real} + @cast ATR[x, (σ, y)] := AT[x, σ, y] + RT, QT = rq_fact(ATR) + RT ./= maximum(abs.(RT)) + @cast AT[x, σ, y] := QT[x, (σ, y)] (σ ∈ 1:size(AT, 2)) + ψ[i] = AT + RT +end + +function update_rq!(ψ::QMps{T}, AT::CuArray{T,3}, i::Site) where {T<:Real} + @cast ATR[x, (σ, y)] := AT[x, σ, y] + RT, QT = rq_fact(ATR) + RT ./= maximum(abs.(RT)) + @cast AT[x, σ, y] := QT[x, (σ, y)] (σ ∈ 1:size(AT, 2)) + ψ[i] = AT + RT +end + +function update_qr!(ψ::QMps{T}, AT::Array{T,3}, i::Site) where {T<:Real} + @cast ATR[(x, σ), y] := AT[x, σ, y] + QT, RT = qr_fact(ATR) + RT ./= maximum(abs.(RT)) + @cast AT[x, σ, y] := QT[(x, σ), y] (σ ∈ 1:size(AT, 2)) + ψ[i] = AT + RT +end + +function update_qr!(ψ::QMps{T}, AT::CuArray{T,3}, i::Site) where {T<:Real} + @cast ATR[(x, σ), y] := AT[x, σ, y] + QT, RT = qr_fact(ATR) + RT ./= maximum(abs.(RT)) + @cast AT[x, σ, y] := QT[(x, σ), y] (σ ∈ 1:size(AT, 2)) + ψ[i] = AT + RT +end + +function _gauges_right_sweep!!!( + ψ_top::QMps{R}, + ψ_bot::QMps{R}, + gauges::Dict; + tol = 1E-12, +) where {R<:Real} + RT = ψ_top.onGPU && ψ_bot.onGPU ? CUDA.ones(R, 1, 1) : ones(R, 1, 1) + RB = copy(RT) + for i ∈ ψ_top.sites + T, B = ψ_top[i], ψ_bot[i] + + @tensor T[a, b, c] := RT[a, s] * T[s, b, c] + @tensor B[a, b, c] := RB[a, s] * B[s, b, c] + @tensor ρ_t[r, s] := T[i, r, j] * conj(T)[i, s, j] + @tensor ρ_b[r, s] := B[i, r, j] * conj(B)[i, s, j] + + dρ_b, dρ_t = diag.((ρ_b, ρ_t)) + K = (dρ_b .< tol) .|| (dρ_t .< tol) + dρ_b[K] .= 1 + dρ_t[K] .= 1 + + X = (dρ_b ./ dρ_t) .^ (1 / 4) # optimize + X_inv = 1 ./ X + gauges[i] .*= X # update + + RT = update_qr!(ψ_top, T .* reshape(X, 1, :, 1), i) + RB = update_qr!(ψ_bot, B .* reshape(X_inv, 1, :, 1), i) + end +end + +function _gauges_left_sweep!!!( + ψ_top::QMps{R}, + ψ_bot::QMps{R}, + gauges::Dict; + tol = 1E-12, +) where {R<:Real} + RT = ψ_top.onGPU && ψ_bot.onGPU ? CUDA.ones(R, 1, 1) : ones(R, 1, 1) + RB = copy(RT) + for i ∈ reverse(ψ_top.sites) + T, B = ψ_top[i], ψ_bot[i] + + @tensor T[a, b, c] := T[a, b, s] * RT[s, c] + @tensor B[a, b, c] := B[a, b, s] * RB[s, c] + @tensor ρ_t[r, s] := T[i, r, j] * conj(T)[i, s, j] + @tensor ρ_b[r, s] := B[i, r, j] * conj(B)[i, s, j] + + dρ_b, dρ_t = diag.((ρ_b, ρ_t)) + K = (dρ_b .< tol) .|| (dρ_t .< tol) + dρ_b[K] .= 1 + dρ_t[K] .= 1 + + X = (dρ_b ./ dρ_t) .^ (1 / 4) # optimize + X_inv = 1 ./ X + gauges[i] .*= X # update + + RT = update_rq!(ψ_top, T .* reshape(X, 1, :, 1), i) + RB = update_rq!(ψ_bot, B .* reshape(X_inv, 1, :, 1), i) + end +end + +function optimize_gauges_for_overlaps!!( + ψ_top::QMps{T}, + ψ_bot::QMps{T}, + tol = 1E-8, + max_sweeps::Int = 4, +) where {T<:Real} + onGPU = ψ_top.onGPU && ψ_bot.onGPU + canonise!(ψ_top, :right) + canonise!(ψ_bot, :right) + overlap_old = dot(ψ_top, ψ_bot) + gauges = Dict(i => (onGPU ? 
CUDA.ones : ones)(T, size(ψ_top[i], 2)) for i ∈ ψ_top.sites) + #gauges = Dict(i => ones(T, size(ψ_top[i], 2)) for i ∈ ψ_top.sites) + for _ ∈ 1:max_sweeps + _gauges_right_sweep!!!(ψ_top, ψ_bot, gauges) + _gauges_left_sweep!!!(ψ_top, ψ_bot, gauges) + overlap_new = dot(ψ_top, ψ_bot) + Δ = overlap_new / overlap_old + overlap_old = overlap_new + if abs(Δ - one(T)) < tol + break + end + end + gauges +end + +function overlap_density_matrix(ϕ::QMps{T}, ψ::QMps{T}, k::Site) where {T<:Real} + @assert ψ.sites == ϕ.sites + C = _overlap_forward(ϕ, ψ, k) + D = _overlap_backwards(ϕ, ψ, k) + A, B = ψ[k], ϕ[k] + @tensor E[x, y] := C[b, a] * conj(B)[b, x, β] * A[a, y, α] * D[β, α] +end + +function _overlap_forward(ϕ::QMps{T}, ψ::QMps{T}, k::Site) where {T<:Real} + C = ϕ.onGPU && ψ.onGPU ? CUDA.ones(T, 1, 1) : ones(T, 1, 1) + i = ψ.sites[1] + while i < k + A, B = ψ[i], ϕ[i] + @tensor order = (α, β, σ) C[x, y] := conj(B)[β, σ, x] * C[β, α] * A[α, σ, y] + i += 1 + end + C +end + +function _overlap_backwards(ϕ::QMps{T}, ψ::QMps{T}, k::Site) where {T<:Real} + D = ϕ.onGPU && ψ.onGPU ? CUDA.ones(T, 1, 1) : ones(T, 1, 1) + i = ψ.sites[end] + while i > k + A, B = ψ[i], ϕ[i] + @tensor order = (α, β, σ) D[x, y] := conj(B)[x, σ, β] * D[β, α] * A[y, σ, α] + i -= 1 + end + D +end diff --git a/src/identities.jl b/src/identities.jl deleted file mode 100644 index a3077c0..0000000 --- a/src/identities.jl +++ /dev/null @@ -1,45 +0,0 @@ -export IdentityMPO, IdentityMPS -struct IdentityMPS{T<:Number,S<:AbstractArray} <: AbstractMPS{T} end -struct IdentityMPO{T<:Number,S<:AbstractArray} <: AbstractMPO{T} end -IdentityMPS() = IdentityMPS{Float64,Array}() -IdentityMPO() = IdentityMPO{Float64,Array}() - -IdentityMPS(::Type{T}) where {T<:AbstractArray} = IdentityMPS{Float64,T} -IdentityMPO(::Type{T}) where {T<:AbstractArray} = IdentityMPO{Float64,T} - -IdentityMPS(::Type{S}, ::Type{T}) where {S<:Number,T<:AbstractArray} = IdentityMPS{S,T} -IdentityMPO(::Type{S}, ::Type{T}) where {S<:Number,T<:AbstractArray} = IdentityMPO{S,T} - -const IdentityMPSorMPO = Union{IdentityMPO,IdentityMPS} - - -@inline function Base.getindex(::IdentityMPS{S,T}, ::Int) where {S,T} - ret = similar(T{S}, (1, 1, 1)) - ret[1] = one(S) - ret -end - - -@inline function Base.getindex(::IdentityMPO{S,T}, ::Int) where {S,T} - ret = similar(T{S}, (1, 1, 1, 1)) - ret[1] = one(S) - ret -end - - -LinearAlgebra.dot(O::AbstractMPO, ::IdentityMPO) = O -LinearAlgebra.dot(::IdentityMPO, O::AbstractMPO) = O -Base.length(::IdentityMPSorMPO) = Inf - - -LinearAlgebra.dot(O::AbstractMPO, ::IdentityMPS) = - MPS([dropdims(sum(A, dims = 4), dims = 4) for A ∈ O]) - - -LinearAlgebra.dot(::IdentityMPO, ψ::AbstractMPS) = ψ -LinearAlgebra.dot(ψ::AbstractMPS, ::IdentityMPO) = ψ - -function Base.show(io::IO, ::IdentityMPSorMPO) - println(io, "Trivial matrix product state") - println(io, " ") -end diff --git a/src/linear_algebra_ext.jl b/src/linear_algebra_ext.jl index ec7f65c..9055125 100644 --- a/src/linear_algebra_ext.jl +++ b/src/linear_algebra_ext.jl @@ -1,54 +1,59 @@ -export rq_fact, qr_fact +# linear_algebra_ext.jl: This file provides basic functions to perform custom SVD, and QR. +# Both are calculated on CPU, but can be transferd to GPU if need be. -function qr_fact(M::AbstractMatrix, Dcut::Int = typemax(Int), tol::Float64 = 1E-12, args...) - F = qr(M, args...) 
- q, r = _qr_fix(Array(F.Q), Array(F.R)) - if Dcut > size(q, 2) - return q, r - end - U, Σ, V = svd(r, Dcut, tol) - q * U, Diagonal(Σ) * V' -end +export rq_fact, qr_fact, svd_fact +@inline phase(d::T; atol = eps()) where {T<:Real} = + isapprox(d, zero(T), atol = atol) ? one(T) : d / abs(d) +@inline phase(d::AbstractArray; atol = eps()) = map(x -> phase(x; atol = atol), d) -function rq_fact(M::AbstractMatrix, Dcut::Int = typemax(Int), tol::Float64 = 1E-12, args...) - q, r = qr_fact(M', Dcut, tol, args...) - r', q' +function svd_fact( + A::AbstractMatrix{T}, + Dcut::Int = typemax(Int), + tol = eps(T); + kwargs..., +) where {T<:Real} + U, Σ, V = svd(A; kwargs...) + δ = min(Dcut, sum(Σ .> Σ[1] * max(eps(), tol))) + U, Σ, V = U[:, 1:δ], Σ[1:δ], V[:, 1:δ] + Σ ./= sqrt(sum(Σ .^ 2)) + ϕ = reshape(phase(diag(U); atol = tol), 1, :) + U .* ϕ, Σ, V .* ϕ end -function _qr_fix(Q::T, R::AbstractMatrix) where {T<:AbstractMatrix} - d = diag(R) - for i ∈ eachindex(d) - @inbounds d[i] = ifelse(isapprox(d[i], 0, atol = 1e-14), 1, d[i]) +function qr_fact( + M::AbstractMatrix{T}, + Dcut::Int = typemax(Int), + tol::T = eps(); + toGPU::Bool = true, + kwargs..., +) where {T<:Real} + q, r = qr_fix(qr(Array(M); kwargs...)) + if Dcut >= size(q, 2) + toGPU && return CuArray.((q, r)) + return q, r end - ph = d ./ abs.(d) - Q * Diagonal(ph), Diagonal(ph) * R + U, Σ, V = svd_fact(r, Dcut, tol, kwargs...) + toGPU && return CuArray.((q * U, Σ .* V')) + q * U, Σ .* V' end -function LinearAlgebra.svd( - A::AbstractMatrix, +function rq_fact( + M::AbstractMatrix{T}, Dcut::Int = typemax(Int), - tol::Float64 = 1E-12, - args..., -) - - U, Σ, V = svd(A, args...) - - tol = Σ[1] * max(eps(), tol) - δ = min(Dcut, sum(Σ .> tol)) - - U = U[:, 1:δ] - Σ = Σ[1:δ] - Σ ./ sum(Σ .^ 2) - V = V[:, 1:δ] + tol::T = eps(); + toGPU::Bool = true, + kwargs..., +) where {T<:Real} + q, r = qr_fact(M', Dcut, tol; toGPU = toGPU, kwargs...) + toGPU && return CuArray.((r', q')) + r', q' +end - d = diag(U) - for i ∈ eachindex(d) - @inbounds d[i] = ifelse(isapprox(d[i], 0, atol = 1e-14), 1, d[i]) - end - ph = d ./ abs.(d) - U * Diagonal(ph), Σ, V * Diagonal(ph) +function qr_fix(QR_fact; tol::T = eps()) where {T<:Real} + ϕ = phase(diag(QR_fact.R); atol = tol) + QR_fact.Q * Diagonal(ϕ), ϕ .* QR_fact.R end diff --git a/src/mps/base.jl b/src/mps/base.jl new file mode 100644 index 0000000..fe72822 --- /dev/null +++ b/src/mps/base.jl @@ -0,0 +1,110 @@ +# ./mps/base.jl: This file provides basic definitions of custom Matrix Product States / Operators. + +export Site, Sites, AbstractTensorNetwork, MpoTensor, QMpsOrMpo + +abstract type AbstractTensorNetwork{T} end + +const Site = Union{Int,Rational{Int}} +const Sites = NTuple{N,Site} where {N} +const TensorMap{T} = Dict{Site,Union{Tensor{T,2},Tensor{T,3},Tensor{T,4}}} # 2 and 4 - mpo; 3 - mps + +""" +A mutable struct representing a Matrix Product Operator (MPO) tensor in a tensor network. + +## Fields +- `top::Vector{Tensor{T, 2}}`: Vector of tensors representing the top tensor of the MPO. Empty if `N == 2`. +- `ctr::Union{Tensor{T, N}, Nothing}`: Central tensor of the MPO. `Nothing` if not present. +- `bot::Vector{Tensor{T, 2}}`: Vector of tensors representing the bottom tensor of the MPO. Empty if `N == 2`. +- `dims::Dims{N}`: Dimensions of the MPO tensor. + +## Description +`MpoTensor{T, N}` is a mutable struct that represents a Matrix Product Operator tensor in a tensor network. +The MPO tensor is characterized by its top and bottom tensors, a central tensor (`ctr`), and dimensions (`dims`). 
+The top and bottom legs are vectors of two-dimensional tensors (`Tensor{T, 2}`). +The central tensor is of type `Tensor{T, N}` or `Nothing` if not present. +The dimensions of the MPO tensor are specified by `dims`. +""" +mutable struct MpoTensor{T<:Real,N} + top::Vector{Tensor{T,2}} # N == 2 top = [] + ctr::Union{Tensor{T,N},Nothing} + bot::Vector{Tensor{T,2}} # N == 2 bot = [] + dims::Dims{N} +end + +# """ +# Constructor function for creating a Matrix Product Operator (MPO) tensor from a `TensorMap`. + +# ## Arguments +# - `ten::TensorMap{T}`: A dictionary mapping `Site` indices to tensors of type `Tensor{T, 2}`, `Tensor{T, 3}`, or `Tensor{T, 4}`. The key `0` represents the central tensor (`ctr`) of the MPO. + +# ## Returns +# - An instance of `MpoTensor{T, nn}` representing the Matrix Product Operator. + +# ## Description +# The `MpoTensor` function constructs a Matrix Product Operator tensor from a `TensorMap`, +# where the keys are `Site` indices and the values are tensors of appropriate dimensions. +# The construction process involves sorting the tensor dictionary based on the site indices +# and separating tensors into the top, central, and bottom parts. +# The central tensor is identified by the key `0`. +# The resulting `MpoTensor` encapsulates the tensors along with their dimensions. + +# ## Exceptions +# - Throws a `DomainError` if the central tensor (`ctr`) has dimensions other than 2 or 4. +# """ +function MpoTensor(ten::TensorMap{T}) where {T} + sk = sort(collect(keys(ten))) + top = [ten[k] for k ∈ sk if k < 0] + bot = [ten[k] for k ∈ sk if k > 0] + ctr = get(ten, 0, nothing) + + if isnothing(ctr) + top_bot = vcat(top, bot) + dims = (0, size(top_bot[1], 1), 0, size(top_bot[end], 2)) + nn = 4 + else + nn = ndims(ctr) + if nn == 2 + @assert isempty(top) && isempty(bot) "Both top and bot should be empty" + dims = size(ctr) + elseif nn == 4 + dims = ( + size(ctr, 1), + isempty(top) ? size(ctr, 2) : size(top[1], 1), + size(ctr, 3), + isempty(bot) ? size(ctr, 4) : size(bot[end], 2), + ) + else + throw(DomainError(ndims(ctr), "MpoTensor should have ndims 2 or 4")) + end + end + MpoTensor{T,nn}(top, ctr, bot, dims) +end + +Base.eltype(ten::MpoTensor{T,N}) where {T,N} = T +Base.ndims(ten::MpoTensor{T,N}) where {T,N} = N +Base.size(ten::MpoTensor, n::Int) = ten.dims[n] +Base.size(ten::MpoTensor) = ten.dims + +const MpoTensorMap{T} = Dict{Site,MpoTensor{T}} + +for (S, M) ∈ ((:QMpo, :MpoTensorMap), (:QMps, :TensorMap)) + @eval begin + export $S, $M + mutable struct $S{F<:Real} <: AbstractTensorNetwork{F} + tensors::$M{F} + sites::Vector{Site} + onGPU::Bool + + function $S(ten::$M{F}; onGPU::Bool = false) where {F} + new{F}(ten, sort(collect(keys(ten))), onGPU) + end + end + end +end + +const QMpsOrMpo{T} = Union{QMpo{T},QMps{T}} + +@inline Base.getindex(ψ::QMpsOrMpo, i) = getindex(ψ.tensors, i) +@inline Base.setindex!(ψ::QMpsOrMpo, A, i::Site) = ψ.tensors[i] = A +@inline Base.eltype(ψ::QMpsOrMpo{T}) where {T} = T +@inline Base.copy(ψ::QMps) = QMps(copy(ψ.tensors), onGPU = ψ.onGPU) diff --git a/src/mps/canonise.jl b/src/mps/canonise.jl new file mode 100644 index 0000000..ab0a381 --- /dev/null +++ b/src/mps/canonise.jl @@ -0,0 +1,98 @@ + +# canonise.jl: This file provides basic function to left / right truncate / canonise MPS. CUDA is supported. 
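+# Usage sketch (assumes `ψ` is a QMps; `Dcut` and `tolS` are illustrative names for the
+# bond-dimension cutoff and the singular-value tolerance):
+#
+#   canonise!(ψ, :left)                        # left-canonical form via QR sweeps
+#   canonise_truncate!(ψ, :right, Dcut, tolS)  # right-canonical form with bonds truncated to Dcut
+#   measure_spectrum(ψ)                        # Schmidt spectra per site; expects a left-canonical ψ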
+ +export canonise!, truncate!, canonise_truncate!, measure_spectrum + + +function measure_spectrum(ψ::QMps{T}) where {T<:Real} + # Assume that ψ is left_canonical + @assert is_left_normalized(ψ) + R = ones(T, 1, 1) + schmidt = Dict() # {Site =>AbstractArray} + for i ∈ reverse(ψ.sites) + B = permutedims(Array(ψ[i]), (1, 3, 2)) # [x, σ, α] + @matmul M[x, σ, y] := sum(α) B[x, σ, α] * R[α, y] + @cast M[x, (σ, y)] := M[x, σ, y] + Dcut, tolS = 100000, 0.0 + U, S, _ = svd_fact(Array(M), Dcut, tolS) + push!(schmidt, i => S) + R = U * Diagonal(S) + end + schmidt +end + + + +function truncate!( + ψ::QMps{T}, + s::Symbol, + Dcut::Int = typemax(Int), + tolS::T = eps(); + kwargs..., +) where {T<:Real} + @assert s ∈ (:left, :right) + if s == :right + _right_sweep!(ψ; kwargs...) + _left_sweep!(ψ, Dcut, tolS; kwargs...) + else + _left_sweep!(ψ, args...) + _right_sweep!(ψ, Dcut, tolS; kwargs...) + end +end + +canonise!(ψ::QMps, s::Symbol) = canonise!(ψ, Val(s)) +canonise!(ψ::QMps, ::Val{:right}) = _left_sweep!(ψ, typemax(Int)) +canonise!(ψ::QMps, ::Val{:left}) = _right_sweep!(ψ, typemax(Int)) + +function canonise_truncate!( + ψ::QMps, + type::Symbol, + Dcut::Int = typemax(Int), + tolS = eps(); + kwargs..., +) + if type == :right + _left_sweep!(ψ, Dcut, tolS; kwargs...) + elseif type == :left + _right_sweep!(ψ, Dcut, tolS; kwargs...) + else + throw(ArgumentError("Wrong canonization type $type")) + end +end + +function _right_sweep!( + ψ::QMps{T}, + Dcut::Int = typemax(Int), + tolS::T = eps(T); + kwargs..., +) where {T<:Real} + R = ψ.onGPU ? CUDA.ones(T, 1, 1) : ones(T, 1, 1) + for i ∈ ψ.sites + A = ψ[i] + @matmul M[x, y, σ] := sum(α) R[x, α] * A[α, y, σ] + M = permutedims(M, (3, 1, 2)) # [σ, x, y] + @cast M[(σ, x), y] := M[σ, x, y] + Q, R = qr_fact(M, Dcut, tolS; toGPU = ψ.onGPU, kwargs...) + R ./= maximum(abs.(R)) + @cast A[σ, x, y] := Q[(σ, x), y] (σ ∈ 1:size(A, 3)) + ψ[i] = permutedims(A, (2, 3, 1)) # [x, y, σ] + end +end + +function _left_sweep!( + ψ::QMps{T}, + Dcut::Int = typemax(Int), + tolS::T = eps(T); + kwargs..., +) where {T<:Real} + R = ψ.onGPU ? CUDA.ones(T, 1, 1) : ones(T, 1, 1) + for i ∈ reverse(ψ.sites) + B = permutedims(ψ[i], (1, 3, 2)) # [x, σ, α] + @matmul M[x, σ, y] := sum(α) B[x, σ, α] * R[α, y] + @cast M[x, (σ, y)] := M[x, σ, y] + R, Q = rq_fact(M, Dcut, tolS; toGPU = ψ.onGPU, kwargs...) + R ./= maximum(abs.(R)) + @cast B[x, σ, y] := Q[x, (σ, y)] (σ ∈ 1:size(B, 2)) + ψ[i] = permutedims(B, (1, 3, 2)) + end +end diff --git a/src/mps/dot.jl b/src/mps/dot.jl new file mode 100644 index 0000000..8cefc22 --- /dev/null +++ b/src/mps/dot.jl @@ -0,0 +1,53 @@ +# ./mps/dot.jl: This file provides basic functionality to compute the dot product between MPS +# Other functions to contract MPS with other tensors are also provided. + +LinearAlgebra.norm(ψ::QMps) = sqrt(abs(dot(ψ, ψ))) + +Base.:(*)(ϕ::QMps, ψ::QMps) = dot(ϕ, ψ) +Base.:(*)(W::QMpo, ψ::QMps) = dot(W, ψ) + +function LinearAlgebra.dot(ψ::QMps{T}, ϕ::QMps{T}) where {T<:Real} + @assert ψ.sites == ϕ.sites + C = ψ.onGPU && ϕ.onGPU ? 
CUDA.ones(T, 1, 1) : ones(T, 1, 1) + for i ∈ ϕ.sites + A, B = ϕ[i], ψ[i] + @tensor order = (α, β, σ) C[x, y] := conj(B)[β, x, σ] * C[β, α] * A[α, y, σ] + end + tr(C) +end + +function LinearAlgebra.dot(ψ::QMpo{R}, ϕ::QMps{R}) where {R<:Real} + D = TensorMap{R}() + for i ∈ reverse(ϕ.sites) + M, B = ψ[i], ϕ[i] + for v ∈ reverse(M.bot) + B = contract_matrix_tensor3(v, B) + end + B = contract_tensors43(M.ctr, B) + for v ∈ reverse(M.top) + B = contract_matrix_tensor3(v, B) + end + + mps_li = left_nbrs_site(i, ϕ.sites) + mpo_li = left_nbrs_site(i, ψ.sites) + + while mpo_li > mps_li + st = size(B, 3) + sl2 = size(ψ[mpo_li], 2) + @cast B[l1, l2, (r, t)] := B[(l1, l2), r, t] (l2 ∈ 1:sl2) + B = permutedims(B, (1, 3, 2)) + B = contract_matrix_tensor3(ψ[mpo_li], B) + B = permutedims(B, (1, 3, 2)) + @cast B[(l1, l2), r, t] := B[l1, l2, (r, t)] (t ∈ 1:st) + mpo_li = left_nbrs_site(mpo_li, ψ.sites) + end + push!(D, i => B) + end + QMps(D; onGPU = ψ.onGPU && ϕ.onGPU) +end + +contract_tensor3_matrix(B::AbstractArray{T,3}, M::MpoTensor{T,2}) where {T<:Real} = + contract_tensor3_matrix(B, M.ctr) +contract_matrix_tensor3(M::MpoTensor{T,2}, B::AbstractArray{T,3}) where {T<:Real} = + contract_matrix_tensor3(M.ctr, B) +contract_tensors43(B::Nothing, A::AbstractArray{T,3}) where {T<:Real} = A diff --git a/src/mps/identity.jl b/src/mps/identity.jl new file mode 100644 index 0000000..c07c7bc --- /dev/null +++ b/src/mps/identity.jl @@ -0,0 +1,34 @@ +# ./mps/identity.jl: This file provides custom MPS Identity. Note, this approach is easier than +# trying to overload the universal identity operator, I, from LinearAlgebra. + +export local_dims, IdentityQMps + +function IdentityQMps( + ::Type{T}, + loc_dims::Dict, + Dmax::Int = 1; + onGPU = true, +) where {T<:Real} + _zeros = onGPU ? CUDA.zeros : zeros + id = TensorMap{T}(keys(loc_dims) .=> _zeros.(T, Dmax, Dmax, values(loc_dims))) + + site_min, ld_min = minimum(loc_dims) + site_max, ld_max = maximum(loc_dims) + if site_min == site_max + id[site_min] = _zeros(T, 1, 1, ld_min) + else + id[site_min] = _zeros(T, 1, Dmax, ld_min) + id[site_max] = _zeros(T, Dmax, 1, ld_max) + end + + for (site, ld) ∈ loc_dims + id[site][1, 1, :] .= 1 / sqrt(ld) + end + QMps(id; onGPU = onGPU) +end + +function local_dims(mpo::QMpo, dir::Symbol) + @assert dir ∈ (:down, :up) + dim = dir == :down ? 4 : 2 + Dict{Site,Int}(k => size(mpo[k], dim) for k ∈ mpo.sites if ndims(mpo[k]) == 4) +end diff --git a/src/mps/rand.jl b/src/mps/rand.jl new file mode 100644 index 0000000..b96ab9d --- /dev/null +++ b/src/mps/rand.jl @@ -0,0 +1,45 @@ + +# ./mps/rand.jl: This file provides methods to generate random MPS / MPO + +function Base.rand( + ::Type{QMps{T}}, + loc_dims::Dict, + Dmax::Int = 1; + onGPU = false, +) where {T<:Real} + id = TensorMap{T}(keys(loc_dims) .=> rand.(T, Dmax, Dmax, values(loc_dims))) + site_min, ld_min = minimum(loc_dims) + site_max, ld_max = maximum(loc_dims) + if site_min == site_max + id[site_min] = rand(T, 1, 1, ld_min) + else + id[site_min] = rand(T, 1, Dmax, ld_min) + id[site_max] = rand(T, Dmax, 1, ld_max) + end + onGPU ? 
move_to_CUDA!(QMps(id)) : QMps(id) +end + +function Base.rand(::Type{CentralTensor{T}}, s::Vector{Int}) where {T<:Real} + CentralTensor( + Real.(rand(s[1], s[5])), + Real.(rand(s[2], s[6])), + Real.(rand(s[3], s[7])), + Real.(rand(s[4], s[8])), + ) +end + +function Base.rand( + ::Type{SiteTensor{T}}, + lp::PoolOfProjectors, + l::Int, + D::NTuple, +) where {T<:Real} + loc_exp = rand(l) + projs = D + + SiteTensor(lp, loc_exp, projs) +end + +function Base.rand(::Type{QMpo{T}}, loc_dims::Dict; onGPU::Bool = false) where {T<:Real} + QMpo(MpoTensorMap{T}(loc_dims)) +end diff --git a/src/mps/transpose.jl b/src/mps/transpose.jl new file mode 100644 index 0000000..dbb1ab1 --- /dev/null +++ b/src/mps/transpose.jl @@ -0,0 +1,21 @@ + +# ./mps/transpose.jl: This file defines what it means to transpse MPO. Note, this should not be +# done by overloading Base.transpose for QMpo to avoid overloading (Array)'. + +function Base.transpose(ψ::QMpo{T}) where {T<:Real} + QMpo( + MpoTensorMap{T}(keys(ψ.tensors) .=> mpo_transpose.(values(ψ.tensors))); + onGPU = ψ.onGPU, + ) +end + +mpo_transpose(M::MpoTensor{T,2}) where {T<:Real} = M + +function mpo_transpose(M::MpoTensor{T,4}) where {T<:Real} + MpoTensor{T,4}( + mpo_transpose.(reverse(M.bot)), + mpo_transpose(M.ctr), + mpo_transpose.(reverse(M.top)), + M.dims[[1, 4, 3, 2]], + ) +end diff --git a/src/mps/utils.jl b/src/mps/utils.jl new file mode 100644 index 0000000..5ab360d --- /dev/null +++ b/src/mps/utils.jl @@ -0,0 +1,67 @@ +# ./mps/aux.jl: This file provides auxiliary functions to verify various MPS properties. + +export bond_dimension, + bond_dimensions, is_consistent, is_left_normalized, is_right_normalized, length, size + +@inline bond_dimension(ψ::QMpsOrMpo) = maximum(size.(values(ψ.tensors), 1)) +@inline bond_dimensions(ψ::QMpsOrMpo) = [size(ψ.tensors[n]) for n ∈ ψ.sites] +@inline Base.length(ψ::QMpsOrMpo) = maximum(ψ.sites) +@inline Base.size(ψ::QMpsOrMpo) = (maximum(ψ.sites),) + +function is_consistent(ψ::QMps) + site_min = minimum(ψ.sites) + site_max = maximum(ψ.sites) + @assert size(ψ.tensors[site_min], 1) == 1 "Incorrect size on the left boundary." + @assert size(ψ.tensors[site_max], 2) == 1 "Incorrect size on the right boundary." + for (s1, s2) ∈ zip(ψ.sites[begin:end-1], ψ.sites[begin+1:end]) + @assert size(ψ.tensors[s1], 2) == size(ψ.tensors[s2], 1) "Incorrect link between $i and $(i+1)." 
+ end + dev = which_device(ψ) + if ψ.onGPU + @assert :GPU ∈ dev && :CPU ∉ dev + end + if !ψ.onGPU + @assert :GPU ∉ dev && :CPU ∈ dev + end + true +end + +function eye(::Type{T}, dim; toGPU::Bool = false) where {T} + v = ones(T, dim) + toGPU && return cu(spdiagm(v)) + Diagonal(v) +end + +function is_left_normalized(ψ::QMps, ::Val{false}) + all( + eye(eltype(ψ), size(A, 2); toGPU = false) ≈ + @tensor(Id[x, y] := A[α, x, σ] * A[α, y, σ]; order = (α, σ)) for + A ∈ values(ψ.tensors) # TODO: split the line + ) +end + +function is_left_normalized(ψ::QMps, ::Val{true}) + all( + eye(eltype(ψ), size(A, 2); toGPU = true) ≈ + @cutensor(Id[x, y] := A[α, x, σ] * A[α, y, σ]) for A ∈ values(ψ.tensors) # TODO: split the line + ) +end + +is_left_normalized(ψ::QMps) = is_left_normalized(ψ, Val(ψ.onGPU)) + +function is_right_normalized(ψ::QMps, ::Val{false}) + all( + eye(eltype(ψ), size(B, 1); toGPU = false) ≈ + @tensor(Id[x, y] := B[x, α, σ] * B[y, α, σ]; order = (α, σ)) for + B ∈ values(ψ.tensors) # TODO: split the line + ) +end + +function is_right_normalized(ψ::QMps, ::Val{true}) + all( + eye(eltype(ψ), size(B, 1); toGPU = true) ≈ + @cutensor(Id[x, y] := B[x, α, σ] * B[y, α, σ]) for B ∈ values(ψ.tensors) # TODO: split the line + ) +end + +is_right_normalized(ψ::QMps) = is_right_normalized(ψ, Val(ψ.onGPU)) diff --git a/src/projectors.jl b/src/projectors.jl new file mode 100644 index 0000000..fa21437 --- /dev/null +++ b/src/projectors.jl @@ -0,0 +1,140 @@ +export PoolOfProjectors, get_projector!, add_projector!, empty! + +const Proj{T} = Union{Vector{T},CuArray{T,1}} + +""" +$(TYPEDSIGNATURES) + +`PoolOfProjectors` is a data structure for managing projectors associated with Ising model sites. +It allows efficient storage and retrieval of projectors based on their indices and provides support for different computational devices. + +# Fields: +- `data::Dict{Symbol, Dict{Int, Proj{T}}}`: A dictionary that stores projectors associated with different +computational devices (`:CPU`, `:GPU`, etc.). The inner dictionary maps site indices to projectors. +- `default_device::Symbol`: A symbol representing the default computational device for projectors in the pool. +- `sizes::Dict{Int, Int}`: A dictionary that maps site indices to the maximum projector size for each site. + +# Constructors: +- `PoolOfProjectors(data::Dict{Int, Dict{Int, Vector{T}}}) where T`: Create a `PoolOfProjectors` with initial data for projectors. +The data is provided as a dictionary that maps site indices to projectors stored in different computational devices. +The `sizes` dictionary is automatically populated based on the maximum projector size for each site. +- `PoolOfProjectors{T}() where T`: Create an empty `PoolOfProjectors` with no projectors initially stored. +""" +struct PoolOfProjectors{T<:Integer} + data::Dict{Symbol,Dict{Int,Proj{T}}} + default_device::Symbol + sizes::Dict{Int,Int} + + PoolOfProjectors(data::Dict{Int,Dict{Int,Vector{T}}}) where {T} = + new{T}(Dict(:CPU => data), :CPU, Dict{Int,Int}(k => maximum(v) for (k, v) ∈ data)) + PoolOfProjectors{T}() where {T} = + new{T}(Dict(:CPU => Dict{Int,Proj{T}}()), :CPU, Dict{Int,Int}()) +end + + +Base.eltype(lp::PoolOfProjectors{T}) where {T} = T +Base.length(lp::PoolOfProjectors) = length(lp.data[lp.default_device]) +Base.length(lp::PoolOfProjectors, device::Symbol) = length(lp.data[device]) + +""" +$(TYPEDSIGNATURES) + +Empty the pool of projectors associated with a specific computational device. 
+ +This function removes all projectors stored on the specified computational device, freeing up memory resources. + +# Arguments: +- `lp::PoolOfProjectors`: The `PoolOfProjectors` object containing projectors. +- `device::Symbol`: The computational device for which projectors should be emptied (e.g., `:CPU`, `:GPU`). +""" +function Base.empty!(lp::PoolOfProjectors, device::Symbol) + if device ∈ keys(lp.data) + empty!(lp.data[device]) + end +end + +Base.length(lp::PoolOfProjectors, index::Int) = length(lp.data[lp.default_device][index]) +Base.size(lp::PoolOfProjectors, index::Int) = lp.sizes[index] + +get_projector!(lp::PoolOfProjectors, index::Int) = + get_projector!(lp, index, lp.default_device) + +""" +$(TYPEDSIGNATURES) + +TODO This is version for only one GPU + +Retrieve or create a projector from the `PoolOfProjectors` associated with a specific device. + +This function retrieves a projector from the `PoolOfProjectors` if it already exists. +If the projector does not exist in the pool, it creates a new one and stores it for future use on the specified computational device. + +# Arguments: +- `lp::PoolOfProjectors{T}`: The `PoolOfProjectors` object containing projectors. +- `index::Int`: The index of the projector to retrieve or create. +- `device::Symbol`: The computational device on which the projector should be stored or retrieved (e.g., `:CPU`, `:GPU`). + +# Returns: +- `Proj{T}`: The projector of type `T` associated with the specified index and device. +""" +function get_projector!( + lp::PoolOfProjectors{T}, + index::Int, + device::Symbol, +) where {T<:Integer} + if device ∉ keys(lp.data) + push!(lp.data, device => Dict{Int,Proj{T}}()) + end + + if index ∉ keys(lp.data[device]) + if device == :GPU + p = CuArray{T}(lp.data[lp.default_device][index]) + elseif device == :CPU + p = Array{T}(lp.data[lp.default_device][index]) + else + throw(ArgumentError("device should be :CPU or :GPU")) + end + push!(lp.data[device], index => p) + end + lp.data[device][index] +end + +""" +$(TYPEDSIGNATURES) + +Add a projector to the `PoolOfProjectors` and associate it with an index. + +This function adds a projector `p` to the `PoolOfProjectors`. +The `PoolOfProjectors` stores projectors based on their computational device (e.g., CPU or GPU) and assigns a unique index to each projector. +The index can be used to retrieve the projector later using `get_projector!`. + +# Arguments: +- `lp::PoolOfProjectors{T}`: The `PoolOfProjectors` object to which the projector should be added. +- `p::Proj`: The projector to be added to the pool. The type of the projector `Proj` should match the type `T` specified in the `PoolOfProjectors`. + +# Returns: +- `Int`: The unique index associated with the added projector in the pool. 
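+
+# Example:
+An illustrative sketch, mirroring the behaviour exercised in `test/projectors.jl`:
+```julia
+lp = PoolOfProjectors{Int64}()
+add_projector!(lp, [1, 1, 2, 2, 3, 3])   # returns 1: first projector stored in the pool
+add_projector!(lp, [1, 1, 2, 2, 3, 3])   # returns 1 again: duplicates reuse the existing index
+k = add_projector!(lp, [1, 2, 1, 3])     # returns 2: a distinct projector gets the next index
+get_projector!(lp, k) == [1, 2, 1, 3]    # true (retrieved from the default device)
+```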
+""" +function add_projector!(lp::PoolOfProjectors{T}, p::Proj) where {T<:Integer} + if lp.default_device == :CPU + p = Array{T}(p) + elseif lp.default_device == :GPU + p = CuArray{T}(p) + else + throw(ArgumentError("default_device should be :CPU or :GPU")) + end + if p in values(lp.data[lp.default_device]) + key = -1 + for guess in keys(lp.data[lp.default_device]) + if lp.data[lp.default_device][guess] == p + key = guess + break + end + end + else + key = length(lp.data[lp.default_device]) + 1 + push!(lp.data[lp.default_device], key => p) + push!(lp.sizes, key => maximum(p)) + end + key +end diff --git a/src/transfer.jl b/src/transfer.jl new file mode 100644 index 0000000..994266e --- /dev/null +++ b/src/transfer.jl @@ -0,0 +1,84 @@ + +# transfer.jl: This file provides rules of how to transfer tensors to GPU. Note, NOT all of +# tensor's coponents are moved from CPU to GPU and most tensors are generated +# on CPU due to the size of clustered Hamiltonian. +export which_device, move_to_CUDA!, move_to_CPU! + +move_to_CUDA!(ten::Array{T,N}) where {T,N} = CuArray(ten) #cu(ten, unified=true) + + +move_to_CUDA!(ten::Union{CuArray{T,N},Nothing}) where {T,N} = ten +move_to_CUDA!(ten::Diagonal) = Diagonal(move_to_CUDA!(diag(ten))) + +function move_to_CUDA!(ten::CentralTensor) + ten.e11 = move_to_CUDA!(ten.e11) + ten.e12 = move_to_CUDA!(ten.e12) + ten.e21 = move_to_CUDA!(ten.e21) + ten.e22 = move_to_CUDA!(ten.e22) + ten +end + +function move_to_CUDA!(ten::DiagonalTensor) + ten.e1 = move_to_CUDA!(ten.e1) + ten.e2 = move_to_CUDA!(ten.e2) + ten +end + +function move_to_CUDA!(ten::VirtualTensor) + ten.con = move_to_CUDA!(ten.con) + # ten.projs = move_to_CUDA!.(ten.projs) # TODO 1) is this necessary ? + ten +end + +function move_to_CUDA!(ten::SiteTensor) + ten.loc_exp = move_to_CUDA!(ten.loc_exp) + # ten.projs = move_to_CUDA!.(ten.projs) # TODO 2) is this necessary ? + ten +end + +function move_to_CUDA!(ten::MpoTensor) + for i ∈ 1:length(ten.top) + ten.top[i] = move_to_CUDA!(ten.top[i]) + end + for i ∈ 1:length(ten.bot) + ten.bot[i] = move_to_CUDA!(ten.bot[i]) + end + ten.ctr = move_to_CUDA!(ten.ctr) + ten +end + +function move_to_CUDA!(ψ::Union{QMpo{T},QMps{T}}) where {T} + for k ∈ keys(ψ.tensors) + ψ[k] = move_to_CUDA!(ψ[k]) + end + ψ.onGPU = true + ψ +end + +move_to_CPU!(ten::CuArray{T,N}) where {T,N} = Array(ten) +move_to_CPU!(ten::Union{Array{T,N},Nothing}) where {T,N} = ten +move_to_CPU!(ten::Diagonal) = Diagonal(move_to_CPU!(diag(ten))) + +function move_to_CPU!(ψ::QMps{T}) where {T} + for k ∈ keys(ψ.tensors) + ψ[k] = move_to_CPU!(ψ[k]) + end + ψ.onGPU = false + ψ +end + + + +which_device(::Nothing) = Set() +which_device(ψ::Union{QMpo{T},QMps{T}}) where {T} = + union(which_device.(values(ψ.tensors))...) +which_device(ten::MpoTensor) = + union(which_device(ten.ctr), which_device.(ten.top)..., which_device.(ten.bot)...) +which_device(ten::DiagonalTensor) = union(which_device.((ten.e1, ten.e2))...) +which_device(ten::VirtualTensor) = union(which_device.((ten.con,))...) # TODO cf. 1) ten.projs +which_device(ten::CentralTensor) = + union(which_device.((ten.e11, ten.e12, ten.e21, ten.e22))...) +which_device(ten::SiteTensor) = union(which_device.((ten.loc_exp,))...) # TODO cf. 
2) ten.projs +which_device(ten::Array{T,N}) where {T,N} = Set((:CPU,)) +which_device(ten::CuArray{T,N}) where {T,N} = Set((:GPU,)) +which_device(ten::Diagonal) = which_device(diag(ten)) diff --git a/src/utils/memory.jl b/src/utils/memory.jl new file mode 100644 index 0000000..06f33c5 --- /dev/null +++ b/src/utils/memory.jl @@ -0,0 +1,47 @@ +export measure_memory, format_bytes + +measure_memory(ten::AbstractArray) = [Base.summarysize(ten), 0] # [CPU_memory, GPU_memory] +measure_memory(ten::CuArray) = [0, prod(size(ten)) * sizeof(eltype(ten))] +measure_memory(ten::SparseMatrixCSC) = + sum(measure_memory.([ten.colptr, ten.rowval, ten.nzval])) +measure_memory(ten::CuSparseMatrixCSC) = + sum(measure_memory.([ten.colPtr, ten.rowVal, ten.nzVal])) +measure_memory(ten::CuSparseMatrixCSR) = + sum(measure_memory.([ten.rowPtr, ten.colVal, ten.nzVal])) +measure_memory(ten::Diagonal) = measure_memory(diag(ten)) +measure_memory(ten::SiteTensor) = sum(measure_memory.([ten.loc_exp])) # ten.projs...])) +measure_memory(ten::CentralTensor) = + sum(measure_memory.([ten.e11, ten.e12, ten.e21, ten.e22])) +measure_memory(ten::DiagonalTensor) = sum(measure_memory.([ten.e1, ten.e2])) +measure_memory(ten::VirtualTensor) = sum(measure_memory.([ten.con])) # ten.projs...])) +measure_memory(ten::MpoTensor) = sum(measure_memory.([ten.top..., ten.ctr, ten.bot...])) +measure_memory(ten::Union{QMps,QMpo}) = sum(measure_memory.(values(ten.tensors))) +measure_memory(env::Environment) = sum(measure_memory.(values(env.env))) +measure_memory(env::EnvironmentMixed) = sum(measure_memory.(values(env.env))) +measure_memory(lp::PoolOfProjectors) = sum([measure_memory(da) for da ∈ values(lp.data)]) +measure_memory(dict::Dict) = isempty(dict) ? [0, 0] : sum(measure_memory.(values(dict))) +measure_memory(tuple::Tuple) = sum(measure_memory.(tuple)) +measure_memory(ten::Int) = [sizeof(ten), 0] +measure_memory(::Nothing) = [0, 0] + + +function format_bytes(bytes, decimals::Int = 2, k::Int = 1024) + bytes == 0 && return "0 Bytes" + dm = decimals < 0 ? 0 : decimals + sizes = ["Bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"] + i = convert(Int, floor(log(bytes) / log(k))) + string(round((bytes / ^(k, i)), digits = dm)) * " " * sizes[i+1] +end + +function measure_memory(caches::IdDict{Any,Any}, bytes::Bool = true) + memoization_memory = bytes ? Dict{Any,Vector{String}}() : Dict{Any,Vector{Int64}}() + for key in keys(caches) + push!( + memoization_memory, + key => + bytes ? format_bytes.(measure_memory(caches[key])) : + measure_memory(caches[key]), + ) + end + memoization_memory +end diff --git a/src/utils/utils.jl b/src/utils/utils.jl new file mode 100644 index 0000000..f498204 --- /dev/null +++ b/src/utils/utils.jl @@ -0,0 +1,108 @@ +export rank_reveal, unique_dims + +import Base.Prehashed +""" +$(TYPEDSIGNATURES) + +Reveal ranks and energies in a specified order. + +This function calculates and reveals the ranks and energies of a set of states in either the +'PE' (Projector Energy) or 'EP' (Energy Projector) order. + +# Arguments: +- `energy`: The energy values of states. +- `order::Symbol`: The order in which to reveal the ranks and energies. +It can be either `:PE` for 'Projector Energy)' order (default) or `:EP` for 'Energy Projector' order. + +# Returns: +- If `order` is `:PE`, the function returns a tuple `(P, E)` where: + - `P`: A permutation matrix representing projectors. + - `E`: An array of energy values. +- If `order` is `:EP`, the function returns a tuple `(E, P)` where: + - `E`: An array of energy values. 
+ - `P`: A permutation matrix representing projectors. +""" +function rank_reveal(energy, order = :PE) #TODO: add type + @assert order ∈ (:PE, :EP) + dim = order == :PE ? 1 : 2 + E, idx = unique_dims(energy, dim) + P = identity.(idx) + order == :PE ? (P, E) : (E, P) +end + +@generated function unique_dims(A::AbstractArray{T,N}, dim::Integer) where {T,N} + quote + 1 <= dim <= $N || return copy(A) + hashes = zeros(UInt, axes(A, dim)) + + # Compute hash for each row + k = 0 + @nloops $N i A d -> ( + if d == dim + k = i_d + end + ) begin + @inbounds hashes[k] = hash(hashes[k], hash((@nref $N A i))) + end + + # Collect index of first row for each hash + uniquerow = similar(Array{Int}, axes(A, dim)) + firstrow = Dict{Prehashed,Int}() + for k in axes(A, dim) + uniquerow[k] = get!(firstrow, Prehashed(hashes[k]), k) + end + uniquerows = collect(values(firstrow)) + + # Check for collisions + collided = falses(axes(A, dim)) + @inbounds begin + @nloops $N i A d -> ( + if d == dim + k = i_d + j_d = uniquerow[k] + else + j_d = i_d + end + ) begin + if (@nref $N A j) != (@nref $N A i) + collided[k] = true + end + end + end + + if any(collided) + nowcollided = similar(BitArray, axes(A, dim)) + while any(collided) + # Collect index of first row for each collided hash + empty!(firstrow) + for j in axes(A, dim) + collided[j] || continue + uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j) + end + for v ∈ values(firstrow) + push!(uniquerows, v) + end + + # Check for collisions + fill!(nowcollided, false) + @nloops $N i A d -> begin + if d == dim + k = i_d + j_d = uniquerow[k] + (!collided[k] || j_d == k) && continue + else + j_d = i_d + end + end begin + if (@nref $N A j) != (@nref $N A i) + nowcollided[k] = true + end + end + (collided, nowcollided) = (nowcollided, collided) + end + end + + (@nref $N A d -> d == dim ? sort!(uniquerows) : (axes(A, d))), + indexin(uniquerow, uniquerows) + end +end diff --git a/src/variational.jl b/src/variational.jl new file mode 100644 index 0000000..9af479f --- /dev/null +++ b/src/variational.jl @@ -0,0 +1,97 @@ + + +# variational.jl: This file provides basic functions to perform variational compression for MPS. +# If the MPS is moved to the GPU, its compression will be performed on the device. + +export variational_compress!, variational_sweep! + +function variational_compress!( + bra::QMps{T}, + mpo::QMpo{T}, + ket::QMps{T}, + tol = 1E-10, + max_sweeps::Int = 4, + kwargs..., +) where {T<:Real} + @assert is_left_normalized(bra) + env = Environment(bra, mpo, ket) + overlap = Inf + overlap_0, negative = measure_env(env, last(env.bra.sites)) + if negative + env.bra[last(env.bra.sites)] .*= -1 + end + + for sweep ∈ 1:max_sweeps + _left_sweep_var!(env; kwargs...) + _right_sweep_var!(env; kwargs...) + overlap, negative = measure_env(env, last(env.bra.sites)) + if negative + env.bra[last(env.bra.sites)] .*= -1 + end + Δ = abs(overlap_0 - overlap) + @info "Convergence" Δ + if Δ < tol + return overlap, env + else + overlap_0 = overlap + end + end + overlap, env +end + +function _left_sweep_var!(env::Environment; kwargs...) + for site ∈ reverse(env.bra.sites) + _left_sweep_var_site!(env, site; kwargs...) + end +end + +function _left_sweep_var_site!(env::Environment, site::Site; kwargs...) + toGPU = env.ket.onGPU && env.mpo.onGPU && env.bra.onGPU + update_env_right!(env, site) + A = project_ket_on_bra(env, site) + @cast B[l, (r, t)] := A[l, r, t] + _, Q = rq_fact(B; toGPU = toGPU, kwargs...) 
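+    # Keep only the orthogonal factor Q: reshaped below, it leaves the updated bra tensor
+    # right-normalized at this site; the R factor is dropped, since the neighbouring site is
+    # recomputed from the environments in the next step of the sweep.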
+ @cast C[l, r, t] := Q[l, (r, t)] (t ∈ 1:size(A, 3)) + env.bra[site] = C + clear_env_containing_site!(env, site) +end + +function _right_sweep_var!(env::Environment; kwargs...) + for site ∈ env.bra.sites + _right_sweep_var_site!(env, site; kwargs...) + end +end + +function _right_sweep_var_site!(env::Environment, site::Site; kwargs...) + toGPU = env.ket.onGPU && env.mpo.onGPU && env.bra.onGPU + update_env_left!(env, site) + A = project_ket_on_bra(env, site) + B = permutedims(A, (1, 3, 2)) # [l, t, r] + @cast B[(l, t), r] := B[l, t, r] + Q, _ = qr_fact(B; toGPU = toGPU, kwargs...) + @cast C[l, t, r] := Q[(l, t), r] (t ∈ 1:size(A, 3)) + C = permutedims(C, (1, 3, 2)) # [l, r, t] + env.bra[site] = C + clear_env_containing_site!(env, site) +end + +# TODO those 2 functions are to be removed eventually +function variational_sweep!( + bra::QMps{T}, + mpo::QMpo{T}, + ket::QMps{T}, + ::Val{:left}; + kwargs..., +) where {T<:Real} + _right_sweep_var!(Environment(bra, mpo, ket); kwargs...) +end + +function variational_sweep!( + bra::QMps{T}, + mpo::QMpo{T}, + ket::QMps{T}, + ::Val{:right}; + kwargs..., +) where {T<:Real} + _left_sweep_var!(Environment(bra, mpo, ket); kwargs...) +end diff --git a/src/zipper.jl b/src/zipper.jl new file mode 100644 index 0000000..658092d --- /dev/null +++ b/src/zipper.jl @@ -0,0 +1,278 @@ +export zipper, corner_matrix, CornerTensor + +struct CornerTensor{T<:Real} + C::Tensor{T,3} + M::MpoTensor{T,4} + B::Tensor{T,3} + + function CornerTensor(C, M, B) + T = promote_type(eltype.((C, M, B))...) + new{T}(C, M, B) + end +end + +struct Adjoint{T,S<:CornerTensor} + parent::S + + function Adjoint{T}(ten::CornerTensor{S}) where {T,S} + F = promote_type(T, S) + new{F,CornerTensor{F}}(ten) + end +end + +function zipper( + ψ::QMpo{R}, + ϕ::QMps{R}; + method::Symbol = :svd, + Dcut::Int = typemax(Int), + tol = eps(), + iters_rand = 3, + iters_svd = 1, + iters_var = 1, + Dtemp_multiplier = 2, + depth::Int = 0, + kwargs..., +) where {R<:Real} + onGPU = ψ.onGPU && ϕ.onGPU + @assert is_left_normalized(ϕ) + + C = onGPU ? CUDA.ones(R, 1, 1, 1) : ones(R, 1, 1, 1) + mpo_li = last(ψ.sites) + + d = (depth == 0) ? mpo_li : depth + + Dtemp = Dtemp_multiplier * Dcut + out = copy(ϕ) + env = EnvironmentMixed(out, C, ψ, ϕ) + + for i ∈ reverse(ϕ.sites) + while mpo_li > i + C = contract_matrix_tensor3(ψ[mpo_li], C) + mpo_li = left_nbrs_site(mpo_li, ψ.sites) + end + @assert mpo_li == i "Mismatch between QMpo and QMps sites." + mpo_li = left_nbrs_site(mpo_li, ψ.sites) + + if i > ϕ.sites[1] + CM = CornerTensor(C, ψ[i], out[i]) + + Urs, Srs, Vrs = [], [], [] + for i = 1:iters_rand + Utemp, Stemp, Vtemp = + svd_corner_matrix(CM, method, Dtemp, tol; toGPU = false, kwargs...) + push!(Urs, Utemp) + push!(Srs, Stemp) + push!(Vrs, Vtemp) + end + + Ur = hcat(Urs...) + Vr = hcat(Vrs...) + Sr = vcat(Srs...) ./ iters_rand + QU, RU = qr_fact(Ur, Dtemp * iters_rand, 0.0; toGPU = false, kwargs...) + QV, RV = qr_fact(Vr, Dtemp * iters_rand, 0.0; toGPU = false, kwargs...) + Ur, Sr, Vr = svd_fact(RU * Diagonal(Sr) * RV', Dtemp, tol; kwargs...) + # Ur = QU * Ur + Vr = QV * Vr + + if onGPU + Vr = CuArray(Vr) + end + + for _ = 1:iters_svd + # CM * Vr + x = reshape(Vr, size(CM.C, 2), size(CM.M, 2), :) + x = permutedims(x, (3, 1, 2)) + x = update_env_right(CM.C, x, CM.M, CM.B) + CCC = reshape(permutedims(x, (1, 3, 2)), size(CM.B, 1) * size(CM.M, 1), :) + + Ut, _ = qr_fact(CCC, Dtemp, 0.0; toGPU = ψ.onGPU, kwargs...) 
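+            # One half-step of orthogonal (power) iteration: CM applied to the current right
+            # basis Vr is re-orthonormalized into Ut, approximating the leading left singular
+            # subspace of CM; the adjoint pass below refines Vr in the same way.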
+ + # CM' * Ut + x = reshape(Ut, size(CM.B, 1), size(CM.M, 1), :) + x = permutedims(x, (1, 3, 2)) + yp = project_ket_on_bra(x, CM.B, CM.M, CM.C) + CCC = reshape(permutedims(yp, (2, 3, 1)), size(CM.C, 2) * size(CM.M, 2), :) + + Vr, _ = qr_fact(CCC, Dtemp, 0.0; toGPU = ψ.onGPU, kwargs...) + end + + # CM * Vr + x = reshape(Vr, size(CM.C, 2), size(CM.M, 2), :) + x = permutedims(x, (3, 1, 2)) + x = update_env_right(CM.C, x, CM.M, CM.B) + CCC = reshape(permutedims(x, (1, 3, 2)), size(CM.B, 1) * size(CM.M, 1), :) + CCC ./= norm(CCC) + + V, CCC = qr_fact(CCC', Dcut, tol; toGPU = ψ.onGPU, kwargs...) + V = V' * Vr' + s1, s2 = size(ψ[i]) + @cast CCC[z, x, y] := CCC[z, (x, y)] (y ∈ 1:s1) + C = permutedims(CCC, (2, 1, 3)) + @cast V[x, y, z] := V[x, (y, z)] (z ∈ 1:s2) + out[i] = V + else + L = onGPU ? CUDA.ones(R, 1, 1, 1) : ones(R, 1, 1, 1) + V = project_ket_on_bra(L, out[i], ψ[i], C) + V ./= norm(V) + out[i] = V + C = onGPU ? CUDA.ones(R, 1, 1, 1) : ones(R, 1, 1, 1) + end + + for _ = 1:iters_var + env.site = i + update_env_right!(env, i) + env.C = C + update_env_left!(env, :central) + _left_sweep_var_site!(env, :central; kwargs...) + for k in reverse(ϕ.sites) + if (i - d) <= k < i + _left_sweep_var_site!(env, k; kwargs...) + end + end + for k in ϕ.sites + if (i - d) <= k < i + _right_sweep_var_site!(env, k; kwargs...) + end + end + _right_sweep_var_site!(env, :central; kwargs...) + + for k in ϕ.sites + if (i + d) >= k >= i + _right_sweep_var_site!(env, k; kwargs...) + end + end + for k in reverse(ϕ.sites) + if (i + d) >= k >= i + _left_sweep_var_site!(env, k; kwargs...) + end + end + update_env_right!(env, :central) + C = project_ket_on_bra(env, :central) + end + end + out +end + +function _left_sweep_var_site!(env::EnvironmentMixed, site; kwargs...) # site: Union(Sites, :central) + update_env_right!(env, site) + A = project_ket_on_bra(env, site) + @cast B[l, (r, t)] := A[l, r, t] + _, Q = rq_fact(B; toGPU = env.onGPU, kwargs...) + @cast C[l, r, t] := Q[l, (r, t)] (t ∈ 1:size(A, 3)) + if site == :central + env.C = C + else + env.bra[site] = C + end + clear_env_containing_site!(env, site) +end + +function _right_sweep_var_site!(env::EnvironmentMixed, site; kwargs...) + update_env_left!(env, site) + A = project_ket_on_bra(env, site) + B = permutedims(A, (1, 3, 2)) # [l, t, r] + @cast B[(l, t), r] := B[l, t, r] + Q, _ = qr_fact(B; toGPU = env.onGPU, kwargs...) + @cast C[l, t, r] := Q[(l, t), r] (t ∈ 1:size(A, 3)) + C = permutedims(C, (1, 3, 2)) # [l, r, t] + if site == :central + env.C = C + else + env.bra[site] = C + end + clear_env_containing_site!(env, site) +end + +function Base.Array(CM::CornerTensor) # change name, or be happy with "psvd(Array(Array(CM))" + B, M, C = CM.B, CM.M, CM.C + for v ∈ reverse(M.bot) + B = contract_matrix_tensor3(v, B) + end + Cnew = corner_matrix(C, M.ctr, B) + @cast Cnew[(t1, t2), t3, t4] := Cnew[t1, t2, t3, t4] + for v ∈ reverse(M.top) + Cnew = contract_matrix_tensor3(v, Cnew) + end + @cast Cnew[t12, (t3, t4)] := Cnew[t12, t3, t4] +end + +Base.Array(CM::Adjoint{T,CornerTensor{T}}) where {T} = adjoint(Array(CM.ten)) + +# TODO rethink this function +function svd_corner_matrix( + CM, + method::Symbol, + Dcut::Int, + tol::Real; + toGPU::Bool = true, + kwargs..., +) + if method == :svd + U, Σ, V = svd_fact(Array(Array(CM)), Dcut, tol; kwargs...) + elseif method == :psvd + U, Σ, V = psvd(Array(Array(CM)), rank = Dcut) + elseif method == :psvd_sparse + U, Σ, V = psvd(CM, rank = Dcut) + elseif method == :tsvd + U, Σ, V = tsvd(Array(CM), Dcut; kwargs...) 
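+    # The :tsvd_sparse branch below avoids materializing CM as a dense matrix: it relies on
+    # the mul!, size, eltype and adjoint methods defined for CornerTensor further down in this file.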
+ elseif method == :tsvd_sparse + v0 = 2 .* rand(eltype(CM), size(CM, 1)) .- 1 + U, Σ, V = tsvd(CM, Dcut, initvec = v0; kwargs...) + else + throw(ArgumentError("Wrong method $method")) + end + toGPU && return CuArray.((U, Σ, V)) + U, Σ, V +end + +# this is for psvd to work +LinearAlgebra.ishermitian(ten::CornerTensor) = false +Base.size(ten::CornerTensor) = + (size(ten.B, 1) * size(ten.M, 1), size(ten.C, 2) * size(ten.M, 2)) +Base.size(ten::CornerTensor, n::Int) = size(ten)[n] +Base.eltype(ten::CornerTensor{T}) where {T} = T +Base.adjoint(ten::CornerTensor{T}) where {T} = Adjoint{T}(ten) + +CuArrayifneeded(ten::CornerTensor, x) = typeof(ten.B) <: CuArray ? CuArray(x) : x +CuArrayifneeded(ten::Adjoint{T,CornerTensor{T}}, x) where {T} = + CuArrayifneeded(ten.parent, x) + + +function LinearAlgebra.mul!(y, ten::CornerTensor, x) + x = CuArrayifneeded(ten, x) # CuArray(x) # TODO this an ugly patch + x = reshape(x, size(ten.C, 2), size(ten.M, 2), :) + x = permutedims(x, (3, 1, 2)) + yp = update_env_right(ten.C, x, ten.M, ten.B) + y[:, :] .= + Array(reshape(permutedims(yp, (1, 3, 2)), size(ten.B, 1) * size(ten.M, 1), :)) +end + +function LinearAlgebra.mul!(y, ten::Adjoint{T,CornerTensor{T}}, x) where {T<:Real} + x = CuArrayifneeded(ten, x) # CuArray(x) # TODO this an ugly patch + x = reshape(x, size(ten.parent.B, 1), size(ten.parent.M, 1), :) + x = permutedims(x, (1, 3, 2)) + yp = project_ket_on_bra(x, ten.parent.B, ten.parent.M, ten.parent.C) + y[:, :] .= Array( + reshape( + permutedims(yp, (2, 3, 1)), + size(ten.parent.C, 2) * size(ten.parent.M, 2), + :, + ), + ) +end + +function Base.:(*)(ten::CornerTensor{T}, x) where {T} + x = CuArrayifneeded(ten, x) # CuArray(x) # TODO this an ugly patch + x = reshape(x, 1, size(ten.C, 2), size(ten.M, 2)) + yp = update_env_right(ten.C, x, ten.M, ten.B) + out = reshape(yp, size(ten.B, 1) * size(ten.M, 1)) + Array(out) # TODO this an ugly patch +end + +function Base.:(*)(ten::Adjoint{T,CornerTensor{T}}, x) where {T<:Real} + x = CuArrayifneeded(ten, x) # CuArray(x) # TODO this an ugly patch + x = reshape(x, size(ten.parent.B, 1), 1, size(ten.parent.M, 1)) + yp = project_ket_on_bra(x, ten.parent.B, ten.parent.M, ten.parent.C) + out = reshape(yp, size(ten.parent.C, 2) * size(ten.parent.M, 2)) + Array(out) # TODO this an ugly patch +end diff --git a/test/attic/canonise.jl b/test/attic/canonise.jl new file mode 100644 index 0000000..24330a8 --- /dev/null +++ b/test/attic/canonise.jl @@ -0,0 +1,25 @@ +T = Float64 +D = 16 + +sites = [1, 3 // 2, 2, 5 // 2, 3, 7 // 2, 4] +d = [1, 2, 2, 2, 4, 2, 2] + +id = Dict(j => d[i] for (i, j) in enumerate(sites)) + +@testset "Random QMps" begin + ψ = rand(QMps{T}, id, D) + ϕ = rand(QMps{T}, id, D) + + ψ = move_to_CUDA!(ψ) + ϕ = move_to_CUDA!(ϕ) + + @testset "is left normalized" begin + canonise!(ψ, :left) + #@test is_left_normalized(ψ) + end + + @testset "is right normalized" begin + canonise!(ϕ, :right) + #@test is_right_normalized(ϕ) + end +end diff --git a/test/attic/compressions.jl b/test/attic/compressions.jl new file mode 100644 index 0000000..a73e460 --- /dev/null +++ b/test/attic/compressions.jl @@ -0,0 +1,35 @@ + +@testset "Compressions for sparse mps and mpo works" begin + D = 16 + d = 2 + sites = collect(1:4) + T = Float64 + + Dcut = 8 + max_sweeps = 100 + tol = 1E-10 + + ψ = rand(QMps{T}, sites, D, d) + W = rand(QMpo{T}, [1, 2, 3, 4], 2, 4) + + bra = ψ + ket = ψ + mpo = W + + @testset "Two mps representations are compressed to the same state" begin + χ = W * ψ + @test is_left_normalized(χ) + + ϕ = copy(ψ) + @test 
bond_dimension(bra) == max(D, d) + @test bond_dimensions(bra) == [(1, d, D), (D, d, D), (D, d, D), (D, d, 1)] + canonise!(bra, :left) + bra = QMps(ψ) + + @time overlap, env = variational_compress!(bra, mpo, ket, tol, max_sweeps) + + ϕ = MPS(bra) + @time is_right_normalized(ϕ) + @test norm(χ) ≈ norm(bra) ≈ 1 + end +end diff --git a/test/attic/contractions.jl b/test/attic/contractions.jl new file mode 100644 index 0000000..4f5a3ea --- /dev/null +++ b/test/attic/contractions.jl @@ -0,0 +1,61 @@ +@testset "Contraction" begin + D = 2 + d = 2 + sites = collect(1:2) + T = Float64 + + ψ = random_QMps(sites, D, d) + ϕ = random_QMps(sites, D, d) + O1 = random_QMpo(sites, D, d) + + @testset "dot products of MPS" begin + @testset "is equal to itself" begin + @test dot(ψ, ψ) ≈ dot(ψ, ψ) + end + + @testset "change of arguments results in conjugation" begin + @test dot(ψ, ϕ) ≈ conj(dot(ϕ, ψ)) + end + + @testset "norm is 2-norm" begin + @test norm(ψ) ≈ sqrt(abs(dot(ψ, ψ))) + end + + @testset "renormalizations" begin + ψ.tensors[ψ.sites[end]] *= 1 / norm(ψ) + @test dot(ψ, ψ) ≈ 1 + + ϕ.tensors[ψ.sites[1]] *= 1 / norm(ϕ) + @test dot(ϕ, ϕ) ≈ 1 + end + end + + @testset "dot product of MPS with MPO" begin + B = randn(Float64, 4, 2, 3) + A = randn(Float64, 2, 2) + C = randn(Float64, 2, 2, 2, 2) + O2 = random_QMpo(sites, D, d) + + @testset "contract_left gives correct sizes" begin + @test size(contract_left(B, A)) == (4, 2, 3) + end + + @testset "contract_tensors43 gives correct sizes" begin + @test size(contract_tensors43(C, B)) == (8, 2, 6) + end + + # @testset "dot product of QMpo and QMps" begin + # D = dot(O2, ψ) + # E = dot(O1, ψ) + # @test size(D[1]) == size(E[1]) == (1, 2, 4) + # @test size(D[2]) == size(E[2]) == (4, 2, 1) + # end + + @testset "dot product of QMps and QMpo" begin + F = dot(ψ, O2) + G = dot(ψ, O1) + @test size(F[1]) == size(G[1]) == (1, 2, 4) + @test size(F[2]) == size(G[2]) == (4, 1, 2) + end + end +end diff --git a/test/attic/environment.jl b/test/attic/environment.jl new file mode 100644 index 0000000..d8557cd --- /dev/null +++ b/test/attic/environment.jl @@ -0,0 +1,10 @@ +@testset "Environment" begin + sites = [1, 1 // 2, 2, 3, 7 // 2, 4, 5, 6] + site = 3 + @testset "left_nbrs_site gives correct left neighbor of a given site" begin + @test left_nbrs_site(site, sites) == 2 + end + @testset "left_nbrs_site gives correct right neighbor of a given site" begin + @test right_nbrs_site(site, sites) == 7 // 2 + end +end diff --git a/test/attic/linear_algebra_ext.jl b/test/attic/linear_algebra_ext.jl new file mode 100644 index 0000000..7bbefce --- /dev/null +++ b/test/attic/linear_algebra_ext.jl @@ -0,0 +1,46 @@ +using LowRankApprox + +@testset "Truncation with standard SVD works correctly" begin + D = 100 + Dcut = D - 1 + tol = 1E-8 + + a = rand(D, D) + + U1, Σ1, V1 = svd(a) + + δ = min(Dcut, size(Σ1)...) + U1 = U1[:, 1:δ] + Σ1 = Σ1[1:δ] + V1 = V1[:, 1:δ] + + U2, Σ2, V2 = svd(a) + + δ = min(Dcut, size(Σ2)...) 
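+    # Truncate the second factorization to the same rank δ so that both reconstructions
+    # are compared on equal footing.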
+ U2 = U2[:, 1:δ] + Σ2 = Σ2[1:δ] + V2 = V2[:, 1:δ] + + r1 = U1 * Diagonal(Σ1) * V1' + r2 = U2 * Diagonal(Σ2) * V2' + + @test norm(r1 - r2) < tol +end + + +@testset "Truncation with with random SVD works correctly" begin + + D = 100 + Dcut = D - 1 + tol = 1E-8 + + a = rand(D, D) + + U1, Σ1, V1 = psvd(a, rank = Dcut, atol = 1E-16, rtol = 1E-16) + U2, Σ2, V2 = psvd(a, rank = Dcut, atol = 1E-16, rtol = 1E-16) + + r1 = U1 * Diagonal(Σ1) * V1' + r2 = U2 * Diagonal(Σ2) * V2' + + @test norm(r1 - r2) < tol +end diff --git a/test/memoization.jl b/test/attic/memoization.jl similarity index 100% rename from test/memoization.jl rename to test/attic/memoization.jl diff --git a/test/attic/mps.jl b/test/attic/mps.jl new file mode 100644 index 0000000..f00a778 --- /dev/null +++ b/test/attic/mps.jl @@ -0,0 +1,45 @@ +@testset "QMps" begin + + T = Float64 + D = 16 + + sites = [1, 3 // 2, 2, 5 // 2, 3, 7 // 2, 4] + d = [1, 2, 2, 2, 4, 2, 2] + + id = Dict(j => d[i] for (i, j) in enumerate(sites)) + + @testset "Random QMps with varying physical dimension" begin + ψ = rand(QMps{T}, id, D) + + @testset "has correct number of sites" begin + @test length(ψ) == maximum(sites) + @test size(ψ) == (maximum(sites),) + end + + @testset "has correct type" begin + @test eltype(ψ) == T + end + + @testset "has correct rank" begin + @test rank(ψ) == Tuple(d) + end + + @testset "has correct bonds" begin + @test bond_dimension(ψ) ≈ D + @test bond_dimensions(ψ) == [ + (1, d[1], D), + (D, d[2], D), + (D, d[3], D), + (D, d[4], D), + (D, d[5], D), + (D, d[6], D), + (D, d[7], 1), + ] + @test verify_bonds(ψ) === nothing + end + + @testset "is equal to itself" begin + @test ψ == ψ + end + end +end diff --git a/test/attic/runtests.jl b/test/attic/runtests.jl new file mode 100644 index 0000000..f43ee30 --- /dev/null +++ b/test/attic/runtests.jl @@ -0,0 +1,20 @@ +using SpinGlassTensors +using TensorOperations +using TensorCast +using Logging +using LinearAlgebra + +disable_logging(LogLevel(1)) + +using Test + +my_tests = [ + #"mps.jl", + "canonise.jl", + #"environment.jl" +] + + +for my_test in my_tests + include(my_test) +end diff --git a/test/base.jl b/test/base.jl deleted file mode 100644 index 1f5a36f..0000000 --- a/test/base.jl +++ /dev/null @@ -1,187 +0,0 @@ -@testset "MPS" begin - - D = 10 - d = 4 - sites = 5 - T = ComplexF64 - - @testset "Random MPS with the same physical dimension" begin - - ψ = randn(MPS{T}, sites, D, d) - - @testset "has correct number of sites" begin - @test length(ψ) == sites - @test size(ψ) == (sites,) - end - - @testset "has correct type" begin - @test eltype(ψ) == T - end - - @testset "has correct rank" begin - @test rank(ψ) == Tuple(fill(d, sites)) - end - - @testset "has correct bonds" begin - @test bond_dimension(ψ) ≈ D - @test verify_bonds(ψ) === nothing - end - - @testset "is equal to itself" begin - @test ψ == ψ - @test ψ ≈ ψ - end - - @testset "is equal to its copy" begin - ϕ = copy(ψ) - @test ϕ == ψ - @test ϕ ≈ ψ - end - end - - @testset "Random MPS with varying physical dimension" begin - - dims = (3, 2, 5, 4) - ψ = randn(MPS{T}, D, dims) - - @testset "has correct number of sites" begin - n = length(dims) - @test length(ψ) == n - @test size(ψ) == (n,) - end - - @testset "has correct type" begin - @test eltype(ψ) == T - end - - @testset "has correct rank" begin - @test rank(ψ) == dims - end - - @testset "has correct bonds" begin - @test bond_dimension(ψ) ≈ D - @test verify_bonds(ψ) === nothing - end - - @testset "is equal to itself" begin - @test ψ == ψ - @test ψ ≈ ψ - end - - @testset "is equal to 
its copy" begin - ϕ = copy(ψ) - @test ϕ == ψ - @test ϕ ≈ ψ - end - end - - @testset "Random MPO with the same physical dimension" begin - - W = randn(MPO{T}, sites, D, d) - - @testset "has correct number of sites" begin - @test length(W) == sites - @test size(W) == (sites,) - end - - @testset "has correct type" begin - @test eltype(W) == T - end - - @testset "is equal to itself" begin - @test W == W - @test W ≈ W - end - - @testset "is equal to its copy" begin - U = copy(W) - @test U == W - @test U ≈ W - end - end - - @testset "Random MPO with varying physical dimension" begin - - dims = (3, 2, 5, 4) - W = randn(MPO{T}, D, dims) - - @testset "has correct number of sites" begin - n = length(dims) - @test length(W) == n - @test size(W) == (n,) - end - - @testset "has correct type" begin - @test eltype(W) == T - end - - @testset "is equal to itself" begin - @test W == W - @test W ≈ W - end - - @testset "is equal to its copy" begin - U = copy(W) - @test U == W - @test U ≈ W - end - end - - @testset "MPS from tensor" begin - ϵ = 1E-14 - - dims = (2, 3, 4, 3, 5) - sites = length(dims) - A = randn(T, dims) - - ψ = MPS(A, :right) - - @test norm(ψ) ≈ 1 - @test_nowarn verify_bonds(ψ) - @test_nowarn verify_physical_dims(ψ, dims) - @test is_right_normalized(ψ) - - B = randn(T, dims...) - ϕ = MPS(B, :left) - - @test norm(ϕ) ≈ 1 - @test_nowarn verify_bonds(ϕ) - @test_nowarn verify_physical_dims(ϕ, dims) - @test is_left_normalized(ϕ) - - χ = MPS(A, :left) - - @test norm(χ) ≈ 1 - @test abs(1 - abs(dot(ψ, χ))) < ϵ - end - -end - - -@testset "Objects with equal tensors have the same hash" begin - D = 10 - d = 4 - sites = 5 - T = ComplexF64 - - ψ = randn(MPS{T}, sites, D, d) - ϕ = copy(ψ) - - W = randn(MPO{T}, sites, D, d) - V = copy(W) - - @testset "Equal MPSs have the same hash" begin - @test hash(ψ) == hash(ϕ) - end - - @testset "Equal MPOs have the same hash" begin - @test hash(W) == hash(V) - end - - @testset "Equal tuples with MPS and MPO have the same hash" begin - tuple_1 = (ψ, W, [1, 2, 3]) - tuple_2 = (ϕ, V, [1, 2, 3]) - @test tuple_1 == tuple_2 - @test hash(tuple_1) == hash(tuple_2) - end -end diff --git a/test/canonise.jl b/test/canonise.jl new file mode 100644 index 0000000..89e44d7 --- /dev/null +++ b/test/canonise.jl @@ -0,0 +1,51 @@ +D = 16 + +sites = [1, 3 // 2, 2, 5 // 2, 3, 7 // 2, 4] +d = [1, 2, 2, 2, 4, 2, 2] + +id = Dict(j => d[i] for (i, j) in enumerate(sites)) + +@testset "Random QMps ($T)" for T in (Float32, Float64) + for toCUDA ∈ (true, false) + ψ = rand(QMps{T}, id, D) + ϕ = rand(QMps{T}, id, D) + @test is_consistent(ψ) + @test is_consistent(ϕ) + + if toCUDA + ψ = move_to_CUDA!(ψ) + ϕ = move_to_CUDA!(ϕ) + @test is_consistent(ψ) + @test is_consistent(ϕ) + end + + @testset "is left normalized" begin + canonise!(ψ, :left) + @test is_consistent(ψ) + @test is_left_normalized(ψ) + @test dot(ψ, ψ) ≈ one(T) + end + + @testset "is right normalized" begin + canonise!(ϕ, :right) + @test is_consistent(ϕ) + @test is_right_normalized(ϕ) + @test dot(ϕ, ϕ) ≈ one(T) + end + end +end + +@testset "Measure spectrum($T)" for T in (Float32, Float64) + svd_mps = TensorMap{T}( + 1 => T[ + -0.694389933025747 -0.7195989305943268;;; + 0.7195989305943268 -0.6943899330257469 + ], + 2 => T[0.7071067811865477; 0.0;;; -7.850461536237973e-17; 0.7071067811865477], + ) + ψ = QMps(svd_mps) + @test is_left_normalized(ψ) + A = measure_spectrum(ψ) + @test A[1] ≈ [1.0] + @test A[2] ≈ [0.7071067811865476, 0.7071067811865475] +end diff --git a/test/compressions.jl b/test/compressions.jl deleted file mode 100644 index 
fa36f9e..0000000 --- a/test/compressions.jl +++ /dev/null @@ -1,100 +0,0 @@ -@testset "Canonisation and Compression" begin - - D = 32 - Dcut = 16 - - d = 2 - sites = 100 - - T = Float64 - - var_tol = 1E-10 - var_max_sweeps = 100 - - ψ = randn(MPS{T}, sites, D, d) - ϕ = randn(MPS{T}, sites, D, d) - χ = randn(MPS{T}, sites, D, d) - Φ = randn(MPS{T}, sites, D, d) - - - @testset "Canonisation (left)" begin - b = canonise!(ψ, :left) - @test is_left_normalized(ψ) - @test dot(ψ, ψ) ≈ 1 - end - - - @testset "Canonisation (right)" begin - b = canonise!(ϕ, :right) - @test is_right_normalized(ϕ) - @test dot(ϕ, ϕ) ≈ 1 - end - - @testset "Copy and truncate twice" begin - ψ̃ = copy(ψ) - @test ψ̃ == ψ - for (direction, predicate) ∈ - ((:left, is_left_normalized), (:right, is_right_normalized)) - truncate!(ψ, direction, Dcut) - truncate!(ψ̃, direction, Dcut) - - @test predicate(ψ) - @test predicate(ψ̃) - @test bond_dimension(ψ̃) == bond_dimension(ψ) - @test all(size(A) == size(B) for (A, B) ∈ zip(ψ, ψ̃)) - @test typeof(ψ̃) == typeof(ψ) - @test norm(ψ) ≈ norm(ψ̃) ≈ 1 - @test abs(ψ * ψ̃) ≈ abs(ψ̃ * ψ) ≈ 1 - end - end - - @testset "Cauchy-Schwarz inequality (after truncation)" begin - @test abs(dot(ϕ, ψ)) <= norm(ϕ) * norm(ψ) - end - - @testset "Truncation (SVD, right)" begin - truncate!(ψ, :right, Dcut) - @test is_right_normalized(ψ) - @test norm(ψ) ≈ 1 - end - - @testset "Truncation (SVD, left)" begin - truncate!(ψ, :left, Dcut) - @test is_left_normalized(ψ) - @test norm(ψ) ≈ 1 - end - - - @testset "" begin - ϵ = 1E-10 - ψ = randn(MPS{T}, sites, D, d) - - l = copy(ψ) - r = copy(ψ) - - canonise!(l, :left) - @test is_left_normalized(l) - - canonise!(r, :right) - @test is_right_normalized(r) - - @test dot(l, l) ≈ 1 - @test dot(r, r) ≈ 1 - - @test abs(1 - abs(dot(l, r))) < ϵ - end - - - @testset "Variational compression" begin - Ψ = copy(Φ) - canonise!(Ψ, :left) - - overlap = compress!(Φ, Dcut, var_tol, var_max_sweeps) - #println(overlap) - - @test norm(Φ) ≈ 1 - @test is_left_normalized(Φ) - @test is_right_normalized(Φ) == false - end - -end diff --git a/test/contractions.jl b/test/contractions.jl deleted file mode 100644 index 1079819..0000000 --- a/test/contractions.jl +++ /dev/null @@ -1,111 +0,0 @@ -@testset "contractions" begin - - D = 10 - d = 3 - sites = 5 - T = ComplexF64 - - ψ = randn(MPS{T}, sites, D, d) - ϕ = randn(MPS{T}, sites, D, d) - mpo_ψ = randn(MPO{T}, sites, D, d) - mpo = randn(MPO{T}, 2, 2, 2) - - - Id = fill(I(d), length(ψ)) - - Id_m = MPO(fill(ones(1, 1, 1, d), length(ϕ))) - - @testset "dot products" begin - @testset "is equal to itself" begin - @test dot(ψ, ψ) ≈ dot(ψ, ψ) - end - - @testset "change of arguments results in conjugation" begin - @test dot(ψ, ϕ) ≈ conj(dot(ϕ, ψ)) - @test dot(ψ, Id, ϕ) ≈ conj(dot(ϕ, Id, ψ)) - end - - @testset "dot with identity equal to dot of two MPS" begin - @test dot(ψ, Id, ϕ) ≈ dot(ψ, ϕ) - end - - @testset "norm is 2-norm" begin - @test norm(ψ) ≈ sqrt(abs(dot(ψ, ψ))) - end - - @testset "renormalizations" begin - ψ[end] *= 1 / norm(ψ) - @test dot(ψ, ψ) ≈ 1 - - ϕ[1] *= 1 / norm(ϕ) - @test dot(ϕ, ϕ) ≈ 1 - end - - @testset "dot products of MPO" begin - mpo1 = dot(mpo, mpo) - - @testset "has correct sisze" begin - @test size(mpo1[1]) == (1, 2, 4, 2) - @test size(mpo1[2]) == (4, 2, 1, 2) - end - end - - end - - @testset "left environment returns correct overlap" begin - L = left_env(ϕ, ψ) - @test L[end][1] ≈ dot(ϕ, ψ) - end - - - @testset "right environment returns correct overlap" begin - R = right_env(ϕ, ψ) - @test R[1][end] ≈ dot(ϕ, ψ) - end - - - @testset 
"Cauchy-Schwarz inequality of MPS is OK" begin - @test abs(dot(ϕ, ψ)) <= norm(ϕ) * norm(ψ) - end - - - @testset "left_env correctly contracts MPS for a given configuration" begin - D = 10 - d = 2 - sites = 5 - T = ComplexF64 - - ψ = randn(MPS{T}, sites, D, d) - state = 2 * (rand(sites) .< 0.5) .- 1 - - C = I - for (A, σ) ∈ zip(ψ, state) - C *= A[:, idx(σ), :] - end - - @test tr(C) ≈ left_env(ψ, map(idx, state))[] - end - - - @testset "right_env correctly contracts MPO with MPS for a given configuration" begin - D = 10 - d = 2 - sites = 5 - T = Float64 - - ψ = randn(MPS{T}, sites, D, d) - W = randn(MPO{T}, sites, D, d) - - σ = 2 * (rand(sites) .< 0.5) .- 1 - - ϕ = MPS(T, sites) - for (i, A) ∈ enumerate(W) - m = idx(σ[i]) - @cast B[x, s, y] := A[x, $m, y, s] - ϕ[i] = B - end - - @test dot(ψ, ϕ) ≈ right_env(ψ, W, map(idx, σ))[] - end - -end diff --git a/test/identities.jl b/test/identities.jl deleted file mode 100644 index 7290ca4..0000000 --- a/test/identities.jl +++ /dev/null @@ -1,80 +0,0 @@ -using Random - - -ψ = randn(MPS{Float64}, 4, 3, 2) -O = randn(MPO{Float64}, 4, 3, 2) - -IMPS = IdentityMPS() -IMPO = IdentityMPO() - -@testset "multiplication of IdentityMPO" begin - - @testset "mutlitplication with MPS ψ returns ψ" begin - @test IMPO * ψ == ψ - @test ψ * IMPO == ψ - end - - @testset "mutlitplication with MPO O returns O" begin - @test IMPO * O == O - end -end - -@testset "Multiplication of IdentityMPS by an MPO O" begin - ϕ = O * IMPS - - @testset "result has the correct type" begin - @test typeof(ϕ) == MPS{Float64} - end - - @testset "length of result is the same as O" begin - @test length(ϕ) == length(O) - end - - @testset "the multiplication drops the correct dims" begin - for i ∈ eachindex(O) - @test ϕ[i] == dropdims(sum(O[i], dims = 4), dims = 4) - end - end -end - -@testset "Identities are singletons" begin - @test IMPO === IdentityMPO() - @test IMPS === IdentityMPS() -end - -@testset "Identities have infinite length" begin - @test length(IMPS) == Inf - @test length(IMPO) == Inf -end - -@testset "Indexing identities returns trivial tensors" begin - @testset "Indexing IdentityMPS" begin - A = IMPS[42] - @test length(A) == 1 - @test ndims(A) == 3 - @test norm(A) == 1 - end - - @testset "Indexing IdentityMPO" begin - B = IMPO[666] - @test length(B) == 1 - @test ndims(B) == 4 - @test norm(B) == 1 - end -end - -@testset "IdentityMPS is only equal to itself" begin - @test IdentityMPS() == IdentityMPS() - - true_identity = IdentityMPS() - tensors = [true_identity[i] for i = 1:4] - - @test IdentityMPS() != MPS(tensors) - @test MPS(tensors) != IdentityMPS() - - Random.seed!(123) - another_mps = randn(MPS{Float64}, 5, 3, 4) - - @test IdentityMPS() != another_mps - @test another_mps != IdentityMPS() -end diff --git a/test/projectors.jl b/test/projectors.jl new file mode 100644 index 0000000..d255328 --- /dev/null +++ b/test/projectors.jl @@ -0,0 +1,53 @@ + +@testset "Add and get from pool of projectors" begin + @testset "Start with empty pool and add elements to it" begin + lp = PoolOfProjectors{Int64}() + @test length(lp) == 0 + + p1 = [1, 1, 2, 2, 3, 3] + p2 = [1, 2, 1, 3] + k = add_projector!(lp, p1) + @test k == 1 + @test length(lp) == 1 + + k = add_projector!(lp, p1) + @test k == 1 + @test length(lp) == 1 + + k = add_projector!(lp, p2) + @test k == 2 + @test length(lp) == 2 + + @test get_projector!(lp, 1) == p1 + @test get_projector!(lp, 2) == p2 + + @test length(lp, 1) == 6 + @test length(lp, 2) == 4 + @test size(lp, 1) == 3 + @test size(lp, 2) == 3 + + empty!(lp, lp.default_device) 
+ @test length(lp) == 0 + end + + @testset "Different devices" begin + for T ∈ [Int16, Int32, Int64] + lp = PoolOfProjectors{T}() + p1 = Vector{T}([1, 1, 2, 2, 3, 3]) + p2 = Vector{T}([1, 2, 1, 3]) + k = add_projector!(lp, p1) + k = add_projector!(lp, p2) + + @test typeof(get_projector!(lp, 2, :CPU)) <: Array{T,1} + @test typeof(get_projector!(lp, 1, :GPU)) <: CuArray{T,1} + @test length(lp, :GPU) == 1 + @test length(lp, :CPU) == 2 + + @test typeof(get_projector!(lp, 1, :GPU)) <: CuArray{T,1} + @test length(lp, :GPU) == 1 + + @test typeof(get_projector!(lp, 2, :GPU)) <: CuArray{T,1} + @test length(lp, :GPU) == 2 + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 6a2186e..fb5d8cf 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,16 +3,13 @@ using TensorOperations using TensorCast using Logging using LinearAlgebra +using CUDA disable_logging(LogLevel(1)) using Test -idx(σ::Int) = (σ == -1) ? 1 : σ + 1 - - -my_tests = - ["base.jl", "memoization.jl", "contractions.jl", "compressions.jl", "identities.jl"] +my_tests = ["canonise.jl", "variational.jl", "projectors.jl"] for my_test in my_tests include(my_test) diff --git a/test/variational.jl b/test/variational.jl new file mode 100644 index 0000000..a4e34fc --- /dev/null +++ b/test/variational.jl @@ -0,0 +1,52 @@ +l = 2 +D1 = ([1, 1], [1, 2], [1, 1], [1, 1]) +D2 = ([1, 1], [1, 2], [1, 1], [1, 2]) +S = Float64 +rand_central = rand(CentralTensor{S}, [1, 1, 1, 1, 1, 1, 1, 1]) +map1 = MpoTensor( + TensorMap{S}( + Dict( + -1 // 2 => rand_central, + 0 => rand(SiteTensor{S}, PoolOfProjectors{Integer}(), l, D1), + ), + ), +) +map2 = MpoTensor( + TensorMap{S}( + Dict( + -1 // 2 => rand_central, + 0 => rand(SiteTensor{S}, PoolOfProjectors{Integer}(), l, D2), + ), + ), +) +map3 = MpoTensor( + TensorMap{S}( + Dict( + -1 // 2 => rand_central, + 0 => rand(SiteTensor{S}, PoolOfProjectors{Integer}(), l, D1), + ), + ), +) +mpomap = Dict(1 => map1, 2 => map2, 3 => map3) + +D = 2 +sites = [1, 2, 3] +d = [1, 1, 1] +id = Dict(j => d[i] for (i, j) in enumerate(sites)) + +@testset "Random QMpo with varying physical dimension" begin + W = rand(QMpo{S}, mpomap) + + @test length(W) == 3 + @test bond_dimension(W) == 1 +end + +@testset "Compressions for sparse mps and mpo works" begin + W = rand(QMpo{S}, mpomap) + ψ = rand(QMps{S}, id, D) + canonise!(ψ, :left) + ϕ = rand(QMps{S}, id, D) + canonise!(ϕ, :left) + + +end
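+
+# Illustrative sketch (not part of the original tests): with `W`, `ψ` and `ϕ` prepared as above,
+# a variational compression run would look roughly like the call below; the tolerance and the
+# number of sweeps are assumed values, not ones fixed by this test.
+# overlap, env = variational_compress!(ψ, W, ϕ, 1E-10, 4)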