From ae37147aa770848b426a3326906084d1217d0271 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 22 Mar 2022 13:27:43 -0700 Subject: [PATCH] sync with internal repo (commit d92ae646) --- LICENSE | 2 +- python/.gitignore | 5 + python/LICENSE | 2 +- python/README.md | 28 +- python/cuquantum/__init__.py | 15 +- python/cuquantum/_version.py | 6 +- python/cuquantum/custatevec/custatevec.pxd | 33 +- python/cuquantum/custatevec/custatevec.pyx | 918 +++++++++++++----- python/cuquantum/cutensornet/__init__.py | 3 +- .../cutensornet/_internal/einsum_parser.py | 344 +++++-- .../cutensornet/_internal/formatters.py | 44 + .../cutensornet/_internal/package_ifc.py | 66 ++ .../cutensornet/_internal/package_ifc_cupy.py | 36 + .../_internal/package_ifc_torch.py | 34 + .../cutensornet/_internal/package_wrapper.py | 16 + .../cutensornet/_internal/tensor_ifc_cupy.py | 93 ++ .../cutensornet/_internal/tensor_ifc_numpy.py | 82 ++ .../cutensornet/_internal/tensor_ifc_torch.py | 88 ++ .../cutensornet/_internal/tensor_wrapper.py | 15 +- .../cuquantum/cutensornet/_internal/utils.py | 137 ++- python/cuquantum/cutensornet/configuration.py | 55 +- python/cuquantum/cutensornet/cutensornet.pxd | 45 +- python/cuquantum/cutensornet/cutensornet.pyx | 605 +++++++++--- python/cuquantum/cutensornet/memory.py | 164 ++++ .../cuquantum/cutensornet/tensor_network.py | 419 ++++---- python/cuquantum/utils.pxd | 52 + python/samples/accessor_get.py | 5 +- python/samples/accessor_set.py | 3 + python/samples/coarse/example12.py | 34 + python/samples/coarse/example13.py | 19 + python/samples/coarse/example14.py | 17 + python/samples/coarse/example15.py | 19 + python/samples/coarse/example16.py | 17 + python/samples/coarse/example17.py | 16 + python/samples/coarse/example18.py | 17 + python/samples/coarse/example19.py | 17 + python/samples/coarse/example20.py | 34 + python/samples/diagonal_matrix.py | 16 +- python/samples/expectation.py | 4 +- python/samples/expectation_pauli.py | 4 +- python/samples/exponential_pauli.py | 2 +- python/samples/gate_application.py | 6 +- python/samples/memory_handler.py | 117 +++ python/samples/mgpu_batch_measure.py | 118 +++ python/samples/mgpu_sampler.py | 133 +++ python/samples/permutation_matrix.py | 16 +- python/samples/sampler.py | 2 +- python/samples/swap_index_bits.py | 46 + python/samples/tensornet_example.py | 23 +- python/samples/test_matrix_type.py | 63 ++ python/setup.py | 27 +- .../custatevec_tests/test_custatevec.py | 831 ++++++++++++++-- .../cuquantum_tests/cutensornet_tests/data.py | 11 +- .../cutensornet_tests/test_contract.py | 1 + .../cutensornet_tests/test_cutensornet.py | 404 ++++++-- .../cutensornet_tests/test_network.py | 2 + .../cutensornet_tests/testutils.py | 32 +- samples/custatevec/CMakeLists.txt | 11 + samples/custatevec/Makefile | 12 +- samples/custatevec/accessor_get.cu | 13 +- samples/custatevec/accessor_set.cu | 13 +- samples/custatevec/diagonal_matrix.cu | 2 +- samples/custatevec/expectation.cu | 4 +- samples/custatevec/expectation_pauli.cu | 6 +- samples/custatevec/exponential_pauli.cu | 2 +- samples/custatevec/gate_application.cu | 6 +- samples/custatevec/memory_handler.cu | 189 ++++ samples/custatevec/mgpu_batch_measure.cu | 215 ++++ samples/custatevec/mgpu_sampler.cu | 238 +++++ samples/custatevec/permutation_matrix.cu | 4 +- samples/custatevec/sampler.cu | 13 +- samples/custatevec/swap_index_bits.cu | 104 ++ samples/custatevec/test_matrix_type.cu | 121 +++ samples/cutensornet/CMakeLists.txt | 2 +- samples/cutensornet/README.md | 52 +- 
samples/cutensornet/tensornet_example.cu | 184 ++-- 76 files changed, 5483 insertions(+), 1071 deletions(-) create mode 100644 python/.gitignore create mode 100644 python/cuquantum/cutensornet/_internal/formatters.py create mode 100644 python/cuquantum/cutensornet/_internal/package_ifc.py create mode 100644 python/cuquantum/cutensornet/_internal/package_ifc_cupy.py create mode 100644 python/cuquantum/cutensornet/_internal/package_ifc_torch.py create mode 100644 python/cuquantum/cutensornet/_internal/package_wrapper.py create mode 100644 python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py create mode 100644 python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py create mode 100644 python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py create mode 100644 python/cuquantum/cutensornet/memory.py create mode 100644 python/samples/coarse/example12.py create mode 100644 python/samples/coarse/example13.py create mode 100644 python/samples/coarse/example14.py create mode 100644 python/samples/coarse/example15.py create mode 100644 python/samples/coarse/example16.py create mode 100644 python/samples/coarse/example17.py create mode 100644 python/samples/coarse/example18.py create mode 100644 python/samples/coarse/example19.py create mode 100644 python/samples/coarse/example20.py create mode 100644 python/samples/memory_handler.py create mode 100644 python/samples/mgpu_batch_measure.py create mode 100644 python/samples/mgpu_sampler.py create mode 100644 python/samples/swap_index_bits.py create mode 100644 python/samples/test_matrix_type.py create mode 100644 samples/custatevec/memory_handler.cu create mode 100644 samples/custatevec/mgpu_batch_measure.cu create mode 100644 samples/custatevec/mgpu_sampler.cu create mode 100644 samples/custatevec/swap_index_bits.cu create mode 100644 samples/custatevec/test_matrix_type.cu diff --git a/LICENSE b/LICENSE index 33a89f0..20388a7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD-3-Clause -Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 0000000..2715e49 --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +*.egg-info +*.eggs +*.cpp +*.so diff --git a/python/LICENSE b/python/LICENSE index 33a89f0..20388a7 100644 --- a/python/LICENSE +++ b/python/LICENSE @@ -1,6 +1,6 @@ BSD-3-Clause -Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/python/README.md b/python/README.md index 68eb2b7..e170bdd 100644 --- a/python/README.md +++ b/python/README.md @@ -11,10 +11,11 @@ Please visit the [NVIDIA cuQuantum Python documentation](https://docs.nvidia.com Build-time dependencies of the cuQuantum Python package and some versions that are known to work are as follows: -* CUDA Toolkit 11.4+ -* cuQuantum 0.1.0 -* cuTENSOR 1.4.0+ +* CUDA Toolkit 11.x +* cuQuantum 22.03 +* cuTENSOR 1.5.0+ * Cython - e.g. 
0.29.21 +* [packaging](https://packaging.pypa.io/en/latest/) ### Install cuQuantum Python from conda-forge @@ -24,6 +25,18 @@ conda install -c conda-forge cuquantum-python ``` The Conda solver will install all required dependencies for you. +### Install cuQuantum Python from PyPI + +Alternatively, assuming you already have a Python environment set up (it doesn't matter if it's a Conda env or not), +you can also install cuQuantum Python this way: + +``` +pip install cuquantum-python +``` +The `pip` solver will also install both cuTENSOR and cuQuantum for you. + +Note: To properly install the wheels the environment variable `CUQUANTUM_ROOT` must not be set. + ### Install cuQuantum Python from source To compile and install cuQuantum Python from source, please follow the steps below: @@ -48,9 +61,9 @@ Runtime dependencies of the cuQuantum Python package include: * An NVIDIA GPU with compute capability 7.0+ * Driver: Linux (450.80.02+) -* CUDA Toolkit 11.4+ -* cuQuantum 0.1.0 -* cuTENSOR 1.4.0+ +* CUDA Toolkit 11.x +* cuQuantum 22.03 +* cuTENSOR 1.5.0+ * NumPy v1.17+ * CuPy v9.5.0+ * PyTorch v1.10+ (optional) @@ -77,4 +90,5 @@ library in Python. ## Testing If pytest is installed, run `pytest tests` in the Python source root directory would -run all tests. +run all tests. Some tests would be skipped if `cffi` is not installed or if the environment +variable `CUDA_PATH` is not set. diff --git a/python/cuquantum/__init__.py b/python/cuquantum/__init__.py index 82750fa..80c3820 100644 --- a/python/cuquantum/__init__.py +++ b/python/cuquantum/__init__.py @@ -1,7 +1,7 @@ from cuquantum import custatevec from cuquantum import cutensornet from cuquantum.cutensornet import ( - contract, contract_path, einsum, einsum_path, Network, + contract, contract_path, einsum, einsum_path, Network, BaseCUDAMemoryManager, MemoryPointer, NetworkOptions, OptimizerInfo, OptimizerOptions, PathFinderOptions, ReconfigOptions, SlicerOptions) from cuquantum.utils import ComputeType, cudaDataType, libraryPropertyType from cuquantum._version import __version__ @@ -17,14 +17,11 @@ cutensornet.ContractionOptimizerInfoAttribute, cutensornet.ContractionOptimizerConfigAttribute, cutensornet.ContractionAutotunePreferenceAttribute, + cutensornet.WorksizePref, + cutensornet.Memspace, + cutensornet.GraphAlgo, + cutensornet.MemoryModel, ): cutensornet._internal.enum_utils.add_enum_class_doc(enum, chomp="_ATTRIBUTE|_PREFERENCE_ATTRIBUTE") -# these have yet another convention... -for v in cutensornet.GraphAlgorithm: - v.__doc__ = f"See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_{v.name}`." -cutensornet.MemoryModel.SLICER_HEURISTIC.__doc__ = \ - f"See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_HEURISTIC`." -cutensornet.MemoryModel.SLICER_CUTENSOR.__doc__ = \ - f"See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_CUTENSOR`." -del enum, utils, v +del enum, utils diff --git a/python/cuquantum/_version.py b/python/cuquantum/_version.py index 262d033..57c6515 100644 --- a/python/cuquantum/_version.py +++ b/python/cuquantum/_version.py @@ -1,2 +1,4 @@ -# TODO: find a better approach to sync the version string with the C libs -__version__ = '.'.join(['0.1.0', '1']) # the last digit is for cuQuantum Python only +# Note: cuQuantum Python follows the cuQuantum SDK version, which is now +# switched to YY.MM and is different from individual libraries' (semantic) +# versioning scheme. 
+__version__ = '22.03.0' # the last digit is for cuQuantum Python only diff --git a/python/cuquantum/custatevec/custatevec.pxd b/python/cuquantum/custatevec/custatevec.pxd index 884b917..92275d8 100644 --- a/python/cuquantum/custatevec/custatevec.pxd +++ b/python/cuquantum/custatevec/custatevec.pxd @@ -3,28 +3,42 @@ from libc.stdint cimport intptr_t, int32_t, uint32_t, int64_t +from cuquantum.utils cimport Stream + # The C types are prefixed with an underscore because we are not # yet protected by the module namespaces as done in CUDA Python. # Once we switch over the names would be prettier (in the Cython # layer). + +# Cython limitation: need standalone typedef if we wanna use it for casting +ctypedef int (*DeviceAllocType)(void*, void**, size_t, Stream) +ctypedef int (*DeviceFreeType)(void*, void*, size_t, Stream) + + cdef extern from '' nogil: # cuStateVec types ctypedef void* _Handle 'custatevecHandle_t' ctypedef int64_t _Index 'custatevecIndex_t' ctypedef int _Status 'custatevecStatus_t' - ctypedef struct _SamplerDescriptor 'custatevecSamplerDescriptor_t': - pass - ctypedef struct _AccessorDescriptor 'custatevecAccessorDescriptor': - pass + ctypedef void* _SamplerDescriptor 'custatevecSamplerDescriptor_t' + ctypedef void* _AccessorDescriptor 'custatevecAccessorDescriptor_t' ctypedef enum _ComputeType 'custatevecComputeType_t': pass - # ctypedef void(*custatevecLoggerCallback_t)( - # int32_t logLevel, - # const char* functionName, - # const char* message) - # ctypedef custatevecLoggerCallback_t LoggerCallback + ctypedef struct _DeviceMemHandler 'custatevecDeviceMemHandler_t': + void* ctx + DeviceAllocType device_alloc + DeviceFreeType device_free + + # Cython limitation: cannot use C defines in declaring a static array, + # so we just have to hard-code CUSTATEVEC_ALLOCATOR_NAME_LEN here... 
+ char name[64] + ctypedef void(*LoggerCallbackData 'custatevecLoggerCallbackData_t')( + int32_t logLevel, + const char* functionName, + const char* message, + void* userData) # cuStateVec enums ctypedef enum _Pauli 'custatevecPauli_t': @@ -55,3 +69,4 @@ cdef extern from '' nogil: int CUSTATEVEC_VER_MINOR int CUSTATEVEC_VER_PATCH int CUSTATEVEC_VERSION + int CUSTATEVEC_ALLOCATOR_NAME_LEN diff --git a/python/cuquantum/custatevec/custatevec.pyx b/python/cuquantum/custatevec/custatevec.pyx index 526529a..bcae585 100644 --- a/python/cuquantum/custatevec/custatevec.pyx +++ b/python/cuquantum/custatevec/custatevec.pyx @@ -4,9 +4,11 @@ cimport cython from libc.stdio cimport FILE from libcpp.vector cimport vector cimport cpython -from cpython.mem cimport PyMem_Malloc, PyMem_Free from cuquantum.utils cimport is_nested_sequence +from cuquantum.utils cimport cuqnt_alloc_wrapper +from cuquantum.utils cimport cuqnt_free_wrapper +from cuquantum.utils cimport logger_callback_with_data from enum import IntEnum @@ -15,11 +17,12 @@ import numpy as _numpy cdef extern from * nogil: # from CUDA - ctypedef int Stream 'cudaStream_t' ctypedef enum DataType 'cudaDataType_t': pass ctypedef enum LibPropType 'libraryPropertyType': pass + ctypedef struct int2 'int2': + pass # cuStateVec functions int custatevecCreate(_Handle*) @@ -33,11 +36,12 @@ cdef extern from * nogil: int custatevecSetStream(_Handle, Stream) int custatevecGetStream(_Handle, Stream*) # int custatevecLoggerSetCallback(LoggerCallback) + int custatevecLoggerSetCallbackData(LoggerCallbackData, void*) # int custatevecLoggerSetFile(FILE*) - # int custatevecLoggerOpenFile(const char*) - # int custatevecLoggerSetLevel(int32_t) - # int custatevecLoggerSetMask(int32_t) - # int custatevecLoggerForceDisable() + int custatevecLoggerOpenFile(const char*) + int custatevecLoggerSetLevel(int32_t) + int custatevecLoggerSetMask(int32_t) + int custatevecLoggerForceDisable() int custatevecAbs2SumOnZBasis( _Handle, const void*, DataType, const uint32_t, double*, double*, const int32_t*, const uint32_t) @@ -56,58 +60,77 @@ cdef extern from * nogil: int custatevecBatchMeasure( _Handle, void*, DataType, const uint32_t, int32_t*, const int32_t*, const uint32_t, const double, _CollapseOp) - int custatevecApplyExp( + int custatevecBatchMeasureWithOffset( + _Handle, void*, DataType, const uint32_t, int32_t*, const int32_t*, + const uint32_t, const double, _CollapseOp, const double, const double) + int custatevecApplyPauliRotation( _Handle, void*, DataType, const uint32_t, double, const _Pauli*, const int32_t*, const uint32_t, const int32_t*, const int32_t*, const uint32_t) - int custatevecApplyMatrix_bufferSize( + int custatevecApplyMatrixGetWorkspaceSize( _Handle, DataType, const uint32_t, const void*, DataType, _MatrixLayout, const int32_t, const uint32_t, const uint32_t, _ComputeType, size_t*) int custatevecApplyMatrix( _Handle, void*, DataType, const uint32_t, const void*, DataType, _MatrixLayout, const int32_t, const int32_t*, - const uint32_t, const int32_t*, const uint32_t, const int32_t*, + const uint32_t, const int32_t*, const int32_t*, const uint32_t, _ComputeType, void*, size_t) - int custatevecExpectation_bufferSize( + int custatevecComputeExpectationGetWorkspaceSize( _Handle, DataType, const uint32_t, const void*, DataType, _MatrixLayout, const uint32_t, _ComputeType, size_t*) - int custatevecExpectation( + int custatevecComputeExpectation( _Handle, const void*, DataType, const uint32_t, void*, DataType, double*, const void*, DataType, _MatrixLayout, const 
int32_t*, const uint32_t, _ComputeType, void*, size_t) - int custatevecSampler_create( + int custatevecSamplerCreate( _Handle, const void*, DataType, const uint32_t, _SamplerDescriptor*, uint32_t, size_t*) - int custatevecSampler_preprocess( - _Handle, _SamplerDescriptor*, void*, const size_t) - int custatevecSampler_sample( - _Handle, _SamplerDescriptor*, _Index*, const int32_t*, const uint32_t, + int custatevecSamplerDestroy(_SamplerDescriptor) + int custatevecSamplerPreprocess( + _Handle, _SamplerDescriptor, void*, const size_t) + int custatevecSamplerSample( + _Handle, _SamplerDescriptor, _Index*, const int32_t*, const uint32_t, const double*, const uint32_t, _SamplerOutput) - int custatevecApplyGeneralizedPermutationMatrix_bufferSize( + int custatevecSamplerGetSquaredNorm(_Handle, _SamplerDescriptor, double*) + int custatevecSamplerApplySubSVOffset( + _Handle, _SamplerDescriptor, int32_t, uint32_t, double, double) + int custatevecApplyGeneralizedPermutationMatrixGetWorkspaceSize( _Handle, DataType, const uint32_t, const _Index*, void*, DataType, const int32_t*, const uint32_t, const uint32_t, size_t*) int custatevecApplyGeneralizedPermutationMatrix( _Handle, void*, DataType, const uint32_t, _Index*, const void*, DataType, const int32_t, const int32_t*, const uint32_t, const int32_t*, const int32_t*, const uint32_t, void*, size_t) - int custatevecExpectationsOnPauliBasis( - _Handle, void*, DataType, const uint32_t, double*, const _Pauli**, - const int32_t**, const uint32_t*, const uint32_t) - int custatevecAccessor_create( + int custatevecComputeExpectationsOnPauliBasis( + _Handle, void*, DataType, const uint32_t, double*, const _Pauli**, const uint32_t, + const int32_t**, const uint32_t*) + int custatevecAccessorCreate( _Handle, void*, DataType, const uint32_t, _AccessorDescriptor*, const int32_t*, const uint32_t, const int32_t*, const int32_t*, const uint32_t, size_t*) - int custatevecAccessor_createReadOnly( + int custatevecAccessorCreateView( _Handle, const void*, DataType, const uint32_t, - _AccessorDescriptor*, const int32_t*, const uint32_t, const int32_t*, + _AccessorDescriptor, const int32_t*, const uint32_t, const int32_t*, const int32_t*, const uint32_t, size_t*) - int custatevecAccessor_setExtraWorkspace( - _Handle, _AccessorDescriptor*, void*, size_t) - int custatevecAccessor_get( - _Handle, _AccessorDescriptor*, void*, const _Index, const _Index) - int custatevecAccessor_set( - _Handle, _AccessorDescriptor*, const void*, const _Index, const _Index) + int custatevecAccessorDestroy(_AccessorDescriptor) + int custatevecAccessorSetExtraWorkspace( + _Handle, _AccessorDescriptor, void*, size_t) + int custatevecAccessorGet( + _Handle, _AccessorDescriptor, void*, const _Index, const _Index) + int custatevecAccessorSet( + _Handle, _AccessorDescriptor, const void*, const _Index, const _Index) + int custatevecSwapIndexBits( + _Handle, void*, DataType, const uint32_t, const int2*, const uint32_t, + const int32_t*, const int32_t*, const uint32_t) + int custatevecTestMatrixTypeGetWorkspaceSize( + _Handle, _MatrixType, const void*, DataType, _MatrixLayout, + const uint32_t, const int32_t, _ComputeType, size_t*) + int custatevecTestMatrixType( + _Handle, double*, _MatrixType, const void*, DataType, _MatrixLayout, + const uint32_t, const int32_t, _ComputeType, void*, size_t) + int custatevecGetDeviceMemHandler(_Handle, _DeviceMemHandler*) + int custatevecSetDeviceMemHandler(_Handle, const _DeviceMemHandler*) class cuStateVecError(RuntimeError): @@ -133,7 +156,7 @@ cpdef intptr_t create() 
except*: """Initialize the cuStateVec library and create a handle. Returns: - intptr_t: The opaque library handle (as Python `int`). + intptr_t: The opaque library handle (as Python :class:`int`). .. note:: The returned handle should be tied to the current device. @@ -155,6 +178,12 @@ cpdef destroy(intptr_t handle): .. seealso:: `custatevecDestroy` """ + # reduce the ref counts of user-provided Python objects: + # if Python callables are attached to the handle as the handler, + # we need to decrease the ref count to avoid leaking + if handle in owner_pyobj: + del owner_pyobj[handle] + with nogil: status = custatevecDestroy(<_Handle>handle) check_status(status) @@ -184,7 +213,7 @@ cpdef set_workspace(intptr_t handle, intptr_t workspace, size_t workspace_size): Args: handle (intptr_t): The library handle. - workspace (intptr_t): The pointer address (as Python `int`) to the + workspace (intptr_t): The pointer address (as Python :class:`int`) to the workspace (on device). workspace_size (size_t): The workspace size (in bytes). @@ -231,7 +260,7 @@ cpdef set_stream(intptr_t handle, intptr_t stream): Args: handle (intptr_t): The library handle. stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python - `int`). + :class:`int`). .. seealso:: `custatevecSetStream` """ @@ -249,7 +278,7 @@ cpdef intptr_t get_stream(intptr_t handle): Returns: intptr_t: - The CUDA stream handle (``cudaStream_t`` as Python `int`). + The CUDA stream handle (``cudaStream_t`` as Python :class:`int`). .. seealso:: `custatevecGetStream` """ @@ -261,9 +290,6 @@ cpdef intptr_t get_stream(intptr_t handle): return stream -# TODO(leofang): add logger callback APIs - - cpdef tuple abs2sum_on_z_basis( intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, bint get_parity0, bint get_parity1, @@ -272,7 +298,7 @@ cpdef tuple abs2sum_on_z_basis( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. @@ -282,7 +308,7 @@ cpdef tuple abs2sum_on_z_basis( for parity 1. basis_bits: A host array of Z-basis index bits. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bits n_basis_bits (uint32_t): the number of basis bits. @@ -335,27 +361,27 @@ cpdef abs2sum_array( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. - abs2sum (intptr_t): The pointer address (as Python `int`) to the array + abs2sum (intptr_t): The pointer address (as Python :class:`int`) to the array (on either host or device) that would hold the sums. bit_ordering: A host array of index bit ordering. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bit ordering bit_ordering_len (uint32_t): The length of ``bit_ordering``. mask_bit_string: A host array for a bit string to specify mask. 
It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bit ordering mask_ordering: A host array of mask ordering. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bit ordering mask_len (uint32_t): The length of ``mask_ordering``. @@ -405,14 +431,14 @@ cpdef collapse_on_z_basis( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. parity (int32_t): The parity, 0 or 1. basis_bits: A host array of Z-basis index bits. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bits n_basis_bits (uint32_t): the number of basis bits. @@ -444,18 +470,18 @@ cpdef collapse_by_bitstring( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. bit_string: A host array of a bit string. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of bits bit_ordering: A host array of bit string ordering. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of bit ordering bit_string_len (uint32_t): The length of ``bit_string``. @@ -498,13 +524,13 @@ cpdef int measure_on_z_basis( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. basis_bits: A host array of Z-basis index bits. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bits n_basis_bits (uint32_t): The number of basis bits. @@ -543,15 +569,15 @@ cpdef batch_measure( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. - bit_string (intptr_t): The pointer address (as Python `int`) to a host + bit_string (intptr_t): The pointer address (as Python :class:`int`) to a host array of measured bit string. bit_ordering: A host array of bit string ordering. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of bit ordering bit_string_len (uint32_t): The length of ``bit_string``. 
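(Review note: the argument order above mirrors the C signature one-to-one. A minimal end-to-end sketch of calling `batch_measure` on a device statevector; this is illustrative only and not part of the patch, and it assumes CuPy is available for the device array:)

```python
import numpy as np
import cupy as cp
from cuquantum import cudaDataType
from cuquantum import custatevec as cusv

handle = cusv.create()
# 2-qubit Bell state (|00> + |11>)/sqrt(2), stored on device as complex64
sv = cp.asarray([2**-0.5, 0, 0, 2**-0.5], dtype=cp.complex64)
bit_string = np.empty(2, dtype=np.int32)  # measured bits are written here (host)
cusv.batch_measure(
    handle, sv.data.ptr, cudaDataType.CUDA_C_32F, 2,  # statevector, dtype, n_index_bits
    bit_string.ctypes.data, [1, 0], 2,                # output buffer, bit ordering, length
    0.3,                                              # a random number in [0, 1)
    cusv.Collapse.NORMALIZE_AND_ZERO)                 # collapse and renormalize the state
cusv.destroy(handle)
print(bit_string)  # the two bits always agree for a Bell state
```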
@@ -577,7 +603,53 @@ cpdef batch_measure(
     check_status(status)


-cpdef apply_exp(
+cpdef batch_measure_with_offset(
+        intptr_t handle, intptr_t sv, int sv_data_type,
+        uint32_t n_index_bits, intptr_t bit_string, bit_ordering,
+        const uint32_t bit_string_len, double rand_num, int collapse,
+        double offset, double abs2sum):
+    """Performs measurement (on a partial statevector) of an arbitrary number of
+    single qubits.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sv (intptr_t): The pointer address (as Python :class:`int`) to the partial
+            statevector (on device).
+        sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
+        n_index_bits (uint32_t): The number of index bits.
+        bit_string (intptr_t): The pointer address (as Python :class:`int`) to a host
+            array of measured bit string.
+        bit_ordering: A host array of bit string ordering. It can be
+
+            - an :class:`int` as the pointer address to the array
+            - a Python sequence of bit ordering
+
+        bit_string_len (uint32_t): The length of ``bit_string``.
+        rand_num (double): A random number in [0, 1).
+        collapse (Collapse): Indicates the collapse operation.
+        offset (double): The partial sum of squared absolute values.
+        abs2sum (double): The sum of squared absolute values for the entire statevector.
+
+    .. seealso:: `custatevecBatchMeasureWithOffset`
+    """
+    # bit_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] bitOrderingData
+    cdef int32_t* bitOrderingPtr
+    if cpython.PySequence_Check(bit_ordering):
+        bitOrderingData = bit_ordering
+        bitOrderingPtr = bitOrderingData.data()
+    else:  # a pointer address
+        bitOrderingPtr = bit_ordering
+
+    with nogil:
+        status = custatevecBatchMeasureWithOffset(
+            <_Handle>handle, sv, sv_data_type, n_index_bits,
+            bit_string, bitOrderingPtr, bit_string_len,
+            rand_num, <_CollapseOp>collapse, offset, abs2sum)
+    check_status(status)
+
+
+cpdef apply_pauli_rotation(
         intptr_t handle, intptr_t sv, int sv_data_type,
         uint32_t n_index_bits, double theta, paulis,
         targets, uint32_t n_targets,
@@ -586,35 +658,35 @@ cpdef apply_exp(
     Args:
         handle (intptr_t): The library handle.
-        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+        sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector
            (on device).
         sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
         n_index_bits (uint32_t): The number of index bits.
         theta (double): The rotation angle.
         paulis: A host array of :data:`Pauli` operators. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of :data:`Pauli`

         targets: A host array of target bits. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of target bits

         n_targets (uint32_t): The length of ``targets``.
         controls: A host array of control bits. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of control bits

         control_bit_values: A host array of control bit values. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of control bit values

         n_controls (uint32_t): The length of ``controls``.

-    .. seealso:: `custatevecApplyExp`
+    .. seealso:: `custatevecApplyPauliRotation`
     """
     # paulis can be a pointer address, or a Python sequence
     cdef vector[_Pauli] paulisData
@@ -653,7 +725,7 @@ cpdef apply_exp(
         controlBitValuesPtr = control_bit_values

     with nogil:
-        status = custatevecApplyExp(
+        status = custatevecApplyPauliRotation(
             <_Handle>handle, sv, sv_data_type,
             n_index_bits, theta, paulisPtr,
             targetsPtr, n_targets,
@@ -661,7 +733,7 @@ cpdef apply_exp(
     check_status(status)


-cpdef size_t apply_matrix_buffer_size(
+cpdef size_t apply_matrix_get_workspace_size(
         intptr_t handle, int sv_data_type, uint32_t n_index_bits,
         intptr_t matrix, int matrix_data_type, int layout,
         int32_t adjoint, uint32_t n_targets, uint32_t n_controls,
         int compute_type) except*:
@@ -671,7 +743,7 @@
         handle (intptr_t): The library handle.
         sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
         n_index_bits (uint32_t): The number of index bits.
-        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
            (on either host or device).
         matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
         layout (MatrixLayout): The memory layout of the matrix.
         adjoint (int32_t): Whether the adjoint of the matrix would be applied.
@@ -684,11 +756,11 @@
     Returns:
         size_t: The required workspace size (in bytes).

-    .. seealso:: `custatevecApplyMatrix_bufferSize`
+    .. seealso:: `custatevecApplyMatrixGetWorkspaceSize`
     """
     cdef size_t extraWorkspaceSizeInBytes
     with nogil:
-        status = custatevecApplyMatrix_bufferSize(
+        status = custatevecApplyMatrixGetWorkspaceSize(
             <_Handle>handle, sv_data_type, n_index_bits,
             matrix, matrix_data_type, <_MatrixLayout>layout, adjoint,
             n_targets, n_controls, <_ComputeType>compute_type,
             &extraWorkspaceSizeInBytes)
@@ -700,41 +772,41 @@ cpdef apply_matrix(
         intptr_t handle, intptr_t sv, int sv_data_type,
         uint32_t n_index_bits, intptr_t matrix, int matrix_data_type,
         int layout, int32_t adjoint, targets, uint32_t n_targets,
-        controls, uint32_t n_controls, control_bit_values,
+        controls, control_bit_values, uint32_t n_controls,
         int compute_type, intptr_t workspace, size_t workspace_size):
     """Apply the specified gate matrix.

     Args:
         handle (intptr_t): The library handle.
-        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+        sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector
            (on device).
         sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
         n_index_bits (uint32_t): The number of index bits.
-        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
            (on either host or device).
         matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
         layout (MatrixLayout): The memory layout of the matrix.
         adjoint (int32_t): Whether the adjoint of the matrix would be applied.
         targets: A host array of target bits. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of target bits

         n_targets (uint32_t): The length of ``targets``.
         controls: A host array of control bits. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of control bits

-        n_controls (uint32_t): The length of ``controls``.
         control_bit_values: A host array of control bit values. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of control bit values

+        n_controls (uint32_t): The length of ``controls``.
         compute_type (cuquantum.ComputeType): The compute type of matrix
             multiplication.
-        workspace (intptr_t): The pointer address (as Python `int`) to the
+        workspace (intptr_t): The pointer address (as Python :class:`int`) to the
             workspace (on device).
         workspace_size (size_t): The workspace size (in bytes).
@@ -773,22 +845,21 @@ cpdef apply_matrix(
             matrix, matrix_data_type, <_MatrixLayout>layout,
             adjoint, targetsPtr, n_targets,
-            controlsPtr, n_controls,
-            controlBitValuesPtr, <_ComputeType>compute_type,
-            workspace, workspace_size)
+            controlsPtr, controlBitValuesPtr, n_controls,
+            <_ComputeType>compute_type, workspace, workspace_size)
     check_status(status)


-cpdef size_t expectation_buffer_size(
+cpdef size_t compute_expectation_get_workspace_size(
         intptr_t handle, int sv_data_type, uint32_t n_index_bits,
         intptr_t matrix, int matrix_data_type, int layout,
         uint32_t n_basis_bits, int compute_type) except*:
-    """Computes the required workspace size for :func:`expectation`.
+    """Computes the required workspace size for :func:`compute_expectation`.

     Args:
         handle (intptr_t): The library handle.
         sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
         n_index_bits (uint32_t): The number of index bits.
-        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
            (on either host or device).
         matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
         layout (MatrixLayout): The memory layout of the matrix.
@@ -799,11 +870,11 @@
     Returns:
         size_t: The required workspace size (in bytes).

-    .. seealso:: `custatevecExpectation_bufferSize`
+    .. seealso:: `custatevecComputeExpectationGetWorkspaceSize`
     """
     cdef size_t extraWorkspaceSizeInBytes
     with nogil:
-        status = custatevecExpectation_bufferSize(
+        status = custatevecComputeExpectationGetWorkspaceSize(
             <_Handle>handle, sv_data_type, n_index_bits,
             matrix, matrix_data_type, <_MatrixLayout>layout, n_basis_bits,
             <_ComputeType>compute_type, &extraWorkspaceSizeInBytes)
@@ -811,7 +882,7 @@
     return extraWorkspaceSizeInBytes


-cpdef expectation(
+cpdef compute_expectation(
         intptr_t handle, intptr_t sv, int sv_data_type,
         uint32_t n_index_bits, intptr_t expect, int expect_data_type,
         intptr_t matrix, int matrix_data_type, int layout,
@@ -822,30 +893,30 @@
     Args:
         handle (intptr_t): The library handle.
-        sv (intptr_t): The pointer address (as Python `int`) to the statevector
+        sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector
            (on device).
         sv_data_type (cuquantum.cudaDataType): The data type of the statevector.
         n_index_bits (uint32_t): The number of index bits.
-        expect (intptr_t): The pointer address (as Python `int`) for storing the
+        expect (intptr_t): The pointer address (as Python :class:`int`) for storing the
            expectation value (on host).
         expect_data_type (cuquantum.cudaDataType): The data type of ``expect``.
-        matrix (intptr_t): The pointer address (as Python `int`) to a matrix
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
            (on either host or device).
         matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
         layout (MatrixLayout): The memory layout of the matrix.
basis_bits: A host array of basis index bits. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of basis bits n_basis_bits (uint32_t): The length of ``basis_bits``. compute_type (cuquantum.ComputeType): The compute type of matrix multiplication. - workspace (intptr_t): The pointer address (as Python `int`) to the + workspace (intptr_t): The pointer address (as Python :class:`int`) to the workspace (on device). workspace_size (size_t): The workspace size (in bytes). - .. seealso:: `custatevecExpectation` + .. seealso:: `custatevecComputeExpectation` """ # basis_bits can be a pointer address, or a Python sequence cdef vector[int32_t] basisBitsData @@ -860,7 +931,7 @@ cpdef expectation( # TODO(leofang): check for beta 2 cdef double residualNorm with nogil: - status = custatevecExpectation( + status = custatevecComputeExpectation( <_Handle>handle, sv, sv_data_type, n_index_bits, expect, expect_data_type, &residualNorm, matrix, matrix_data_type, @@ -878,7 +949,7 @@ cpdef tuple sampler_create( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. @@ -888,41 +959,33 @@ cpdef tuple sampler_create( Returns: tuple: A 2-tuple. The first element is the pointer address (as Python - `int`) to the sampler descriptor, and the second element is the + :class:`int`) to the sampler descriptor, and the second element is the amount of required workspace size (in bytes). - .. note:: Unlike its C counterpart, the returned sampler descriptor must - be explicitly cleaned up using :func:`sampler_destroy` when the work - is done. - - .. seealso:: `custatevecSampler_create` + .. seealso:: `custatevecSamplerCreate` """ - cdef _SamplerDescriptor* sampler = <_SamplerDescriptor*>( - PyMem_Malloc(sizeof(_SamplerDescriptor))) + cdef _SamplerDescriptor sampler cdef size_t extraWorkspaceSizeInBytes with nogil: - status = custatevecSampler_create( + status = custatevecSamplerCreate( <_Handle>handle, sv, sv_data_type, n_index_bits, - sampler, n_max_shots, &extraWorkspaceSizeInBytes) + &sampler, n_max_shots, &extraWorkspaceSizeInBytes) check_status(status) return (sampler, extraWorkspaceSizeInBytes) -# TODO(leofang): fix this when the beta 2 (?) APIs are up cpdef sampler_destroy(intptr_t sampler): """Destroy the sampler descriptor. Args: - sampler (intptr_t): The pointer address (as Python `int`) to the + sampler (intptr_t): The pointer address (as Python :class:`int`) to the sampler descriptor. - .. note:: This function has no C counterpart in the current release. - - .. seealso:: :func:`sampler_create` + .. seealso:: `custatevecSamplerDestroy` """ - # This API is unique in Python as we can't pass around structs - # allocated on stack - PyMem_Free(sampler) + with nogil: + status = custatevecSamplerDestroy(<_SamplerDescriptor>sampler) + check_status(status) cpdef sampler_preprocess( @@ -932,17 +995,17 @@ cpdef sampler_preprocess( Args: handle (intptr_t): The library handle. - sampler (intptr_t): The pointer address (as Python `int`) to the + sampler (intptr_t): The pointer address (as Python :class:`int`) to the sampler descriptor. 
-        workspace (intptr_t): The pointer address (as Python `int`) to the
+        workspace (intptr_t): The pointer address (as Python :class:`int`) to the
            workspace (on device).
         workspace_size (size_t): The workspace size (in bytes).

-    .. seealso:: `custatevecSampler_preprocess`
+    .. seealso:: `custatevecSamplerPreprocess`
     """
     with nogil:
-        status = custatevecSampler_preprocess(
-            <_Handle>handle, <_SamplerDescriptor*>sampler,
+        status = custatevecSamplerPreprocess(
+            <_Handle>handle, <_SamplerDescriptor>sampler,
             workspace, workspace_size)
     check_status(status)
@@ -955,25 +1018,25 @@ cpdef sampler_sample(
     Args:
         handle (intptr_t): The library handle.
-        sampler (intptr_t): The pointer address (as Python `int`) to the
+        sampler (intptr_t): The pointer address (as Python :class:`int`) to the
            sampler descriptor.
-        bit_strings (intptr_t): The pointer address (as Python `int`) for
+        bit_strings (intptr_t): The pointer address (as Python :class:`int`) for
            storing the sampled bit strings (on host).
         bit_ordering: A host array of bit string ordering. It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of bit ordering

         bit_string_len (uint32_t): The number of bits in ``bit_ordering``.
         rand_nums: A host array of random numbers in [0, 1). It can be

-            - an `int` as the pointer address to the array
+            - an :class:`int` as the pointer address to the array
            - a Python sequence of random numbers

         n_shots (uint32_t): The number of shots.
         order (SamplerOutput): The order of sampled bit strings.

-    .. seealso:: `custatevecSampler_sample`
+    .. seealso:: `custatevecSamplerSample`
     """
     # bit_ordering can be a pointer address, or a Python sequence
     cdef vector[int32_t] bitOrderingData
@@ -994,17 +1057,62 @@
        randNumsPtr = rand_nums

     with nogil:
-        status = custatevecSampler_sample(
-            <_Handle>handle, <_SamplerDescriptor*>sampler, <_Index*>bit_strings,
+        status = custatevecSamplerSample(
+            <_Handle>handle, <_SamplerDescriptor>sampler, <_Index*>bit_strings,
             bitOrderingPtr, bit_string_len,
             randNumsPtr, n_shots, <_SamplerOutput>order)
     check_status(status)


-cpdef size_t apply_generalized_permutation_matrix_buffer_size(
+cpdef double sampler_get_squared_norm(
+        intptr_t handle, intptr_t sampler) except*:
+    """Get the squared norm of the statevector.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sampler (intptr_t): The pointer address (as Python :class:`int`) to the
+            sampler descriptor.
+
+    Returns:
+        double: The squared norm of the statevector.
+
+    .. seealso:: `custatevecSamplerGetSquaredNorm`
+    """
+    cdef double sq_norm
+    with nogil:
+        status = custatevecSamplerGetSquaredNorm(
+            <_Handle>handle, <_SamplerDescriptor>sampler, &sq_norm)
+    check_status(status)
+    return sq_norm
+
+
+cpdef sampler_apply_sub_sv_offset(
+        intptr_t handle, intptr_t sampler, int32_t sub_sv_id,
+        uint32_t n_sub_sv, double offset, double sq_norm):
+    """Apply the partial norm and norm to the statevector.
+
+    Args:
+        handle (intptr_t): The library handle.
+        sampler (intptr_t): The pointer address (as Python :class:`int`) to the
+            sampler descriptor.
+        sub_sv_id (int32_t): The ordinal of the sub-statevector.
+        n_sub_sv (uint32_t): The number of sub-statevectors.
+        offset (double): The cumulative sum for the sub-statevector.
+        sq_norm (double): The squared norm for all sub-statevectors.
+
+    ..
seealso:: `custatevecSamplerApplySubSVOffset` + """ + with nogil: + status = custatevecSamplerApplySubSVOffset( + <_Handle>handle, <_SamplerDescriptor>sampler, sub_sv_id, + n_sub_sv, offset, sq_norm) + check_status(status) + + +cpdef size_t apply_generalized_permutation_matrix_get_workspace_size( intptr_t handle, int sv_data_type, uint32_t n_index_bits, permutation, intptr_t diagonals, int diagonals_data_type, - basis_bits, uint32_t n_basis_bits, uint32_t mask_len) except*: + targets, uint32_t n_targets, uint32_t n_controls) except*: """Computes the required workspace size for :func:`apply_generalized_permutation_matrix`. Args: @@ -1013,24 +1121,24 @@ cpdef size_t apply_generalized_permutation_matrix_buffer_size( n_index_bits (uint32_t): The number of index bits. permutation: A host or device array for the permutation table. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of permutation elements - diagonals (intptr_t): The pointer address (as Python `int`) to a matrix + diagonals (intptr_t): The pointer address (as Python :class:`int`) to a matrix (on either host or device). diagonals_data_type (cuquantum.cudaDataType): The data type of the matrix. - basis_bits: A host array of permutation matrix basis bits. It can be + targets: A host array of permutation matrix target bits. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of basis bits - n_basis_bits (uint32_t): The length of ``basis_bits``. - mask_len (uint32_t): The length of ``mask_ordering``. + n_targets (uint32_t): The length of ``targets``. + n_controls (uint32_t): The length of ``controls`` and ``control_bit_values``. Returns: size_t: The required workspace size (in bytes). - .. seealso:: `custatevecApplyGeneralizedPermutationMatrix_bufferSize` + .. 
seealso:: `custatevecApplyGeneralizedPermutationMatrixGetWorkspaceSize` """ cdef size_t extraWorkspaceSize @@ -1044,20 +1152,20 @@ cpdef size_t apply_generalized_permutation_matrix_buffer_size( else: # a pointer address permutationPtr = <_Index*>permutation - # basis_bits can be a pointer address, or a Python sequence - cdef vector[int32_t] basisBitsData - cdef int32_t* basisBitsPtr - if cpython.PySequence_Check(basis_bits): - basisBitsData = basis_bits - basisBitsPtr = basisBitsData.data() + # targets can be a pointer address, or a Python sequence + cdef vector[int32_t] targetsData + cdef int32_t* targetsPtr + if cpython.PySequence_Check(targets): + targetsData = targets + targetsPtr = targetsData.data() else: # a pointer address - basisBitsPtr = basis_bits + targetsPtr = targets with nogil: - status = custatevecApplyGeneralizedPermutationMatrix_bufferSize( + status = custatevecApplyGeneralizedPermutationMatrixGetWorkspaceSize( <_Handle>handle, sv_data_type, n_index_bits, permutationPtr, diagonals, diagonals_data_type, - basisBitsPtr, n_basis_bits, mask_len, &extraWorkspaceSize) + targetsPtr, n_targets, n_controls, &extraWorkspaceSize) check_status(status) return extraWorkspaceSize @@ -1065,45 +1173,44 @@ cpdef size_t apply_generalized_permutation_matrix_buffer_size( cpdef apply_generalized_permutation_matrix( intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, permutation, intptr_t diagonals, int diagonals_data_type, - int32_t adjoint, basis_bits, uint32_t n_basis_bits, - mask_bit_string, mask_ordering, uint32_t mask_len, + int32_t adjoint, targets, uint32_t n_targets, + controls, control_bit_values, uint32_t n_controls, intptr_t workspace, size_t workspace_size): """Apply a generalized permutation matrix. Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. permutation: A host or device array for the permutation table. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of permutation elements - diagonals (intptr_t): The pointer address (as Python `int`) to a matrix + diagonals (intptr_t): The pointer address (as Python :class:`int`) to a matrix (on either host or device). diagonals_data_type (cuquantum.cudaDataType): The data type of the matrix. adjoint (int32_t): Whether the adjoint of the matrix would be applied. - basis_bits: A host array of permutation matrix basis bits. It can be + targets: A host array of permutation matrix target bits. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of basis bits - n_basis_bits (uint32_t): The length of ``basis_bits``. - mask_bit_string: A host array for a bit string to specify mask. It can - be + n_targets (uint32_t): The length of ``targets``. + controls: A host array for control bits. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bit ordering - mask_ordering: A host array of mask ordering. It can be + control_bit_values: A host array of control bit values. 
It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bit ordering - mask_len (uint32_t): The length of ``mask_ordering``. - workspace (intptr_t): The pointer address (as Python `int`) to the + n_controls (uint32_t): The length of ``controls`` and ``control_bit_values``. + workspace (intptr_t): The pointer address (as Python :class:`int`) to the workspace (on device). workspace_size (size_t): The workspace size (in bytes). @@ -1119,81 +1226,80 @@ cpdef apply_generalized_permutation_matrix( else: # a pointer address permutationPtr = <_Index*>permutation - # basis_bits can be a pointer address, or a Python sequence - cdef vector[int32_t] basisBitsData - cdef int32_t* basisBitsPtr - if cpython.PySequence_Check(basis_bits): - basisBitsData = basis_bits - basisBitsPtr = basisBitsData.data() + # targets can be a pointer address, or a Python sequence + cdef vector[int32_t] targetsData + cdef int32_t* targetsPtr + if cpython.PySequence_Check(targets): + targetsData = targets + targetsPtr = targetsData.data() else: # a pointer address - basisBitsPtr = basis_bits + targetsPtr = targets - # mask_bit_string can be a pointer address, or a Python sequence - cdef vector[int32_t] maskBitStringData - cdef int32_t* maskBitStringPtr - if cpython.PySequence_Check(mask_bit_string): - maskBitStringData = mask_bit_string - maskBitStringPtr = maskBitStringData.data() + # controls can be a pointer address, or a Python sequence + cdef vector[int32_t] controlsData + cdef int32_t* controlsPtr + if cpython.PySequence_Check(controls): + controlsData = controls + controlsPtr = controlsData.data() else: # a pointer address - maskBitStringPtr = mask_bit_string + controlsPtr = controls - # mask_ordering can be a pointer address, or a Python sequence - cdef vector[int32_t] maskOrderingData - cdef int32_t* maskOrderingPtr - if cpython.PySequence_Check(mask_ordering): - maskOrderingData = mask_ordering - maskOrderingPtr = maskOrderingData.data() + # control_bit_values can be a pointer address, or a Python sequence + cdef vector[int32_t] control_bit_valuesData + cdef int32_t* control_bit_valuesPtr + if cpython.PySequence_Check(control_bit_values): + control_bit_valuesData = control_bit_values + control_bit_valuesPtr = control_bit_valuesData.data() else: # a pointer address - maskOrderingPtr = mask_ordering + control_bit_valuesPtr = control_bit_values with nogil: status = custatevecApplyGeneralizedPermutationMatrix( <_Handle>handle, sv, sv_data_type, n_index_bits, permutationPtr, diagonals, diagonals_data_type, - adjoint, basisBitsPtr, n_basis_bits, - maskBitStringPtr, maskOrderingPtr, mask_len, + adjoint, targetsPtr, n_targets, + controlsPtr, control_bit_valuesPtr, n_controls, workspace, workspace_size) check_status(status) -cpdef expectations_on_pauli_basis( +cpdef compute_expectations_on_pauli_basis( intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, - intptr_t expectations, pauli_ops, - basis_bits, n_basis_bits, uint32_t n_pauli_op_arrays): + intptr_t expectations, pauli_ops, uint32_t n_pauli_op_arrays, + basis_bits, n_basis_bits): """Compute expectation values for multiple multi-qubit Pauli strings. Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. 
n_index_bits (uint32_t): The number of index bits. - expectations (intptr_t): The pointer address (as Python `int`) to store + expectations (intptr_t): The pointer address (as Python :class:`int`) to store the corresponding expectation values on host. The returned values are stored in double (float64). pauli_ops: A host array of :data:`Pauli` operators. It can be - - an `int` as the pointer address to the nested sequence - - a Python sequence of `int`, each of which is a pointer address + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address to the corresponding Pauli string - a nested Python sequence of :data:`Pauli` + n_pauli_op_arrays (uint32_t): The number of Pauli operator arrays. basis_bits: A host array of basis index bits. It can be - - an `int` as the pointer address to the nested sequence - - a Python sequence of `int`, each of which is a pointer address + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address to the corresponding basis bits - a nested Python sequence of basis bits n_basis_bits: A host array of the length of each array in ``basis_bits``. It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` - n_pauli_op_arrays (uint32_t): The number of Pauli operator arrays. - - .. seealso:: `custatevecExpectationsOnPauliBasis` + .. seealso:: `custatevecComputeExpectationsOnPauliBasis` """ # pauli_ops can be: # - a plain pointer address @@ -1257,10 +1363,10 @@ cpdef expectations_on_pauli_basis( nBasisBitsPtr = n_basis_bits with nogil: - status = custatevecExpectationsOnPauliBasis( + status = custatevecComputeExpectationsOnPauliBasis( <_Handle>handle, sv, sv_data_type, n_index_bits, - expectations, pauliOpsPtr, - basisBitsPtr, nBasisBitsPtr, n_pauli_op_arrays) + expectations, pauliOpsPtr, n_pauli_op_arrays, + basisBitsPtr, nBasisBitsPtr) check_status(status) @@ -1273,24 +1379,24 @@ cpdef (intptr_t, size_t) accessor_create( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. bit_ordering: A host array of basis bits for the external buffer. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of basis bits bit_ordering_len (uint32_t): The length of ``bit_ordering``. mask_bit_string: A host array for specifying mask values. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of mask values mask_ordering: A host array of mask ordering. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bit ordering mask_len (uint32_t): The length of ``mask_ordering``. @@ -1298,17 +1404,12 @@ cpdef (intptr_t, size_t) accessor_create( Returns: tuple: A 2-tuple. The first element is the accessor descriptor (as Python - `int`), and the second element is the required workspace size (in + :class:`int`), and the second element is the required workspace size (in bytes). - .. 
note:: Unlike its C counterpart, the returned accessor descriptor must - be explicitly cleaned up using :func:`accessor_destroy` when the work - is done. - - .. seealso:: `custatevecAccessor_create` + .. seealso:: `custatevecAccessorCreate` """ - cdef _AccessorDescriptor* accessor = <_AccessorDescriptor*>( - PyMem_Malloc(sizeof(_AccessorDescriptor))) + cdef _AccessorDescriptor accessor cdef size_t workspace_size # bit_ordering can be a pointer address, or a Python sequence @@ -1339,15 +1440,15 @@ cpdef (intptr_t, size_t) accessor_create( maskOrderingPtr = mask_ordering with nogil: - status = custatevecAccessor_create( + status = custatevecAccessorCreate( <_Handle>handle, sv, sv_data_type, n_index_bits, - accessor, bitOrderingPtr, bit_ordering_len, + &accessor, bitOrderingPtr, bit_ordering_len, maskBitStringPtr, maskOrderingPtr, mask_len, &workspace_size) check_status(status) return (accessor, workspace_size) -cpdef (intptr_t, size_t) accessor_create_readonly( +cpdef (intptr_t, size_t) accessor_create_view( intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, bit_ordering, uint32_t bit_ordering_len, mask_bit_string, mask_ordering, uint32_t mask_len): @@ -1355,24 +1456,24 @@ cpdef (intptr_t, size_t) accessor_create_readonly( Args: handle (intptr_t): The library handle. - sv (intptr_t): The pointer address (as Python `int`) to the statevector + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector (on device). The statevector is read-only. sv_data_type (cuquantum.cudaDataType): The data type of the statevector. n_index_bits (uint32_t): The number of index bits. bit_ordering: A host array of basis bits for the external buffer. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of basis bits bit_ordering_len (uint32_t): The length of ``bit_ordering``. mask_bit_string: A host array for specifying mask values. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of mask values mask_ordering: A host array of mask ordering. It can be - - an `int` as the pointer address to the array + - an :class:`int` as the pointer address to the array - a Python sequence of index bit ordering mask_len (uint32_t): The length of ``mask_ordering``. @@ -1380,17 +1481,12 @@ cpdef (intptr_t, size_t) accessor_create_readonly( Returns: tuple: A 2-tuple. The first element is the accessor descriptor (as Python - `int`), and the second element is the required workspace size (in + :class:`int`), and the second element is the required workspace size (in bytes). - .. note:: Unlike its C counterpart, the returned accessor descriptor must - be explicitly cleaned up using :func:`accessor_destroy` when the work - is done. - - .. seealso:: `custatevecAccessor_createReadOnly` + .. 
seealso:: `custatevecAccessorCreateView` """ - cdef _AccessorDescriptor* accessor = <_AccessorDescriptor*>( - PyMem_Malloc(sizeof(_AccessorDescriptor))) + cdef _AccessorDescriptor accessor cdef size_t workspace_size # bit_ordering can be a pointer address, or a Python sequence @@ -1421,9 +1517,9 @@ cpdef (intptr_t, size_t) accessor_create_readonly( maskOrderingPtr = mask_ordering with nogil: - status = custatevecAccessor_createReadOnly( + status = custatevecAccessorCreateView( <_Handle>handle, sv, sv_data_type, n_index_bits, - accessor, bitOrderingPtr, bit_ordering_len, + &accessor, bitOrderingPtr, bit_ordering_len, maskBitStringPtr, maskOrderingPtr, mask_len, &workspace_size) check_status(status) return (accessor, workspace_size) @@ -1435,13 +1531,11 @@ cpdef accessor_destroy(intptr_t accessor): Args: accessor (intptr_t): The accessor descriptor. - .. note:: This function has no C counterpart in the current release. - - .. seealso:: :func:`accessor_create` + .. seealso:: :func:`custatevecAccessorDestroy` """ - # This API is unique in Python as we can't pass around structs - # allocated on stack - PyMem_Free(accessor) + with nogil: + status = custatevecAccessorDestroy(<_AccessorDescriptor>accessor) + check_status(status) cpdef accessor_set_extra_workspace( @@ -1455,11 +1549,11 @@ cpdef accessor_set_extra_workspace( workspace (intptr_t): The pointer address to the workspace (on device). workspace_size (size_t): The size of workspace (in bytes). - .. seealso:: `custatevecAccessor_setExtraWorkspace` + .. seealso:: `custatevecAccessorSetExtraWorkspace` """ with nogil: - status = custatevecAccessor_setExtraWorkspace( - <_Handle>handle, <_AccessorDescriptor*>accessor, + status = custatevecAccessorSetExtraWorkspace( + <_Handle>handle, <_AccessorDescriptor>accessor, workspace, workspace_size) check_status(status) @@ -1476,11 +1570,11 @@ cpdef accessor_get( begin (int): The beginning index. end (int): The end index. - .. seealso:: `custatevecAccessor_get` + .. seealso:: `custatevecAccessorGet` """ with nogil: - status = custatevecAccessor_get( - <_Handle>handle, <_AccessorDescriptor*>accessor, buf, + status = custatevecAccessorGet( + <_Handle>handle, <_AccessorDescriptor>accessor, buf, begin, end) check_status(status) @@ -1497,15 +1591,360 @@ cpdef accessor_set( begin (int): The beginning index. end (int): The end index. - .. seealso:: `custatevecAccessor_set` + .. seealso:: `custatevecAccessorSet` """ with nogil: - status = custatevecAccessor_set( - <_Handle>handle, <_AccessorDescriptor*>accessor, buf, + status = custatevecAccessorSet( + <_Handle>handle, <_AccessorDescriptor>accessor, buf, begin, end) check_status(status) +cpdef swap_index_bits( + intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_index_bits, + swapped_bits, uint32_t n_swapped_bits, + mask_bit_string, mask_ordering, uint32_t mask_len): + """Swap index bits and reorder statevector elements on the device. + + Args: + handle (intptr_t): The library handle. + sv (intptr_t): The pointer address (as Python :class:`int`) to the statevector + (on device). + sv_data_type (cuquantum.cudaDataType): The data type of the statevector. + n_index_bits (uint32_t): The number of index bits. + swapped_bits: A host array of pairs of swapped index bits. It can be + + - an :class:`int` as the pointer address to the nested sequence + - a nested Python sequence of swapped index bits + + n_swapped_bits (uint32_t): The number of pairs of swapped index bits. + mask_bit_string: A host array for a bit string to specify mask. 
It can
+            be
+
+            - an :class:`int` as the pointer address to the array
+            - a Python sequence of mask values
+
+        mask_ordering: A host array of mask ordering. It can be
+
+            - an :class:`int` as the pointer address to the array
+            - a Python sequence of index bit ordering
+
+        mask_len (uint32_t): The length of ``mask_ordering``.
+
+    .. seealso:: `custatevecSwapIndexBits`
+    """
+    # swapped_bits can be:
+    #   - a plain pointer address
+    #   - a nested Python sequence (ex: a list of 2-tuples)
+    # Note: it cannot be a mix of sequences and ints. It also cannot be a
+    # 1D sequence (of ints), because it's inefficient.
+    cdef vector[intptr_t] swappedBitsCData
+    cdef int2* swappedBitsPtr
+    if is_nested_sequence(swapped_bits):
+        try:
+            # direct conversion
+            data = _numpy.asarray(swapped_bits, dtype=_numpy.int32)
+            data = data.reshape(-1)
+        except:
+            # unlikely, but fall back to a slow element-wise copy
+            data = _numpy.empty(2*n_swapped_bits, dtype=_numpy.int32)
+            for i, (first, second) in enumerate(swapped_bits):
+                data[2*i] = first
+                data[2*i+1] = second
+        assert data.size == 2*n_swapped_bits
+        swappedBitsPtr = <int2*><intptr_t>(data.ctypes.data)
+    elif isinstance(swapped_bits, int):
+        # a pointer address, take it as is
+        swappedBitsPtr = <int2*><intptr_t>swapped_bits
+    else:
+        raise ValueError("swapped_bits is provided in an "
+                         "unrecognized format")
+
+    # mask_bit_string can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskBitStringData
+    cdef int32_t* maskBitStringPtr
+    if cpython.PySequence_Check(mask_bit_string):
+        maskBitStringData = mask_bit_string
+        maskBitStringPtr = maskBitStringData.data()
+    else:  # a pointer address
+        maskBitStringPtr = <int32_t*><intptr_t>mask_bit_string
+
+    # mask_ordering can be a pointer address, or a Python sequence
+    cdef vector[int32_t] maskOrderingData
+    cdef int32_t* maskOrderingPtr
+    if cpython.PySequence_Check(mask_ordering):
+        maskOrderingData = mask_ordering
+        maskOrderingPtr = maskOrderingData.data()
+    else:  # a pointer address
+        maskOrderingPtr = <int32_t*><intptr_t>mask_ordering
+
+    with nogil:
+        status = custatevecSwapIndexBits(
+            <_Handle>handle, <void*>sv, <DataType>sv_data_type, n_index_bits,
+            swappedBitsPtr, n_swapped_bits,
+            maskBitStringPtr, maskOrderingPtr, mask_len)
+    check_status(status)
+
+
+cpdef size_t test_matrix_type_get_workspace_size(
+        intptr_t handle, int matrix_type,
+        intptr_t matrix, int matrix_data_type, int layout, uint32_t n_targets,
+        int32_t adjoint, int compute_type) except*:
+    """Compute the required workspace size for :func:`test_matrix_type`.
+
+    Args:
+        handle (intptr_t): The library handle.
+        matrix_type (cuquantum.MatrixType): The matrix type of the gate matrix.
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
+            (on either host or device).
+        matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
+        layout (MatrixLayout): The memory layout of the matrix.
+        n_targets (uint32_t): The length of ``targets``.
+        adjoint (int32_t): Whether the adjoint of the matrix would be applied.
+        compute_type (cuquantum.ComputeType): The compute type of matrix
+            multiplication.
+
+    Returns:
+        size_t: The required workspace size (in bytes).
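+
+    A minimal sketch of the intended query-then-test pattern (illustrative;
+    all argument names are placeholders, and CuPy is assumed only for the
+    workspace allocation)::
+
+        size = test_matrix_type_get_workspace_size(
+            handle, matrix_type, matrix, matrix_data_type, layout,
+            n_targets, 0, compute_type)
+        workspace = cupy.cuda.alloc(size) if size > 0 else None
+        residual = test_matrix_type(
+            handle, matrix_type, matrix, matrix_data_type, layout,
+            n_targets, 0, compute_type,
+            workspace.ptr if workspace else 0, size)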
+
+    .. seealso:: `custatevecTestMatrixTypeGetWorkspaceSize`
+    """
+    cdef size_t extraWorkspaceSizeInBytes
+    with nogil:
+        status = custatevecTestMatrixTypeGetWorkspaceSize(
+            <_Handle>handle, <_MatrixType>matrix_type, <void*>matrix,
+            <DataType>matrix_data_type, <_MatrixLayout>layout, n_targets,
+            adjoint, <_ComputeType>compute_type, &extraWorkspaceSizeInBytes)
+    check_status(status)
+    return extraWorkspaceSizeInBytes
+
+
+cpdef double test_matrix_type(
+        intptr_t handle, int matrix_type,
+        intptr_t matrix, int matrix_data_type, int layout, uint32_t n_targets,
+        int32_t adjoint, int compute_type, intptr_t workspace,
+        size_t workspace_size) except*:
+    """Test the deviation of a given matrix from a certain matrix type
+    (Hermitian or unitary).
+
+    Args:
+        handle (intptr_t): The library handle.
+        matrix_type (cuquantum.MatrixType): The matrix type of the gate matrix.
+        matrix (intptr_t): The pointer address (as Python :class:`int`) to a matrix
+            (on either host or device).
+        matrix_data_type (cuquantum.cudaDataType): The data type of the matrix.
+        layout (MatrixLayout): The memory layout of the matrix.
+        n_targets (uint32_t): The length of ``targets``.
+        adjoint (int32_t): Whether the adjoint of the matrix would be applied.
+        compute_type (cuquantum.ComputeType): The compute type of matrix
+            multiplication.
+        workspace (intptr_t): The pointer address (as Python :class:`int`) to the
+            workspace (on device).
+        workspace_size (size_t): The workspace size (in bytes).
+
+    Returns:
+        double: The residual norm for the deviation from the given matrix type.
+
+    .. seealso:: `custatevecTestMatrixType`
+    """
+    cdef double residualNorm
+    with nogil:
+        status = custatevecTestMatrixType(
+            <_Handle>handle, &residualNorm, <_MatrixType>matrix_type,
+            <void*>matrix, <DataType>matrix_data_type, <_MatrixLayout>layout,
+            n_targets, adjoint, <_ComputeType>compute_type,
+            <void*>workspace, workspace_size)
+    check_status(status)
+    return residualNorm
+
+
+cpdef set_device_mem_handler(intptr_t handle, handler):
+    """ Set the device memory handler for cuStateVec.
+
+    The ``handler`` object can be passed in multiple ways:
+
+      - If ``handler`` is an :class:`int`, it refers to the address of a fully
+        initialized `custatevecDeviceMemHandler_t` struct.
+      - If ``handler`` is a Python sequence:
+
+        - If ``handler`` is a sequence of length 4, it is interpreted as ``(ctx, device_alloc,
+          device_free, name)``, where the first three elements are the pointer
+          addresses (:class:`int`) of the corresponding members. ``name`` is a
+          :class:`str` as the name of the handler.
+        - If ``handler`` is a sequence of length 3, it is interpreted as ``(malloc, free,
+          name)``, where the first two objects are Python *callables* with the
+          following calling convention:
+
+            - ``ptr = malloc(size, stream)``
+            - ``free(ptr, size, stream)``
+
+          with all arguments and the return value (``ptr``) being Python :class:`int`.
+          ``name`` is the same as above.
+
+    .. note:: Only when ``handler`` is a length-3 sequence will the GIL be
+        held whenever a routine requires memory allocation and deallocation,
+        so for all other cases be sure your ``handler`` does not manipulate
+        any Python objects.
+
+    Args:
+        handle (intptr_t): The library handle.
+        handler: The memory handler object, see above.
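+
+    A minimal sketch of a length-3 handler (illustrative; assumes CuPy, and
+    ``my_malloc``, ``my_free``, and the handler name are placeholders)::
+
+        pool = cupy.cuda.MemoryPool()
+        allocations = {}  # keep allocations alive until freed
+
+        def my_malloc(size, stream):
+            mem = pool.malloc(size)
+            allocations[mem.ptr] = mem
+            return mem.ptr
+
+        def my_free(ptr, size, stream):
+            del allocations[ptr]
+
+        set_device_mem_handler(handle, (my_malloc, my_free, 'cupy_pool'))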
+
+    .. seealso:: `custatevecSetDeviceMemHandler`
+    """
+    cdef bytes name
+    cdef _DeviceMemHandler our_handler
+    cdef _DeviceMemHandler* handlerPtr = &our_handler
+
+    if isinstance(handler, int):
+        handlerPtr = <_DeviceMemHandler*><intptr_t>handler
+    elif cpython.PySequence_Check(handler):
+        name = handler[-1].encode('ascii')
+        if len(name) > CUSTATEVEC_ALLOCATOR_NAME_LEN:
+            raise ValueError("the handler name is too long")
+        our_handler.name[:len(name)] = name
+        our_handler.name[len(name)] = 0
+
+        if len(handler) == 4:
+            # handler = (ctx_ptr, malloc_ptr, free_ptr, name)
+            assert (isinstance(handler[1], int) and isinstance(handler[2], int))
+            our_handler.ctx = <void*><intptr_t>(handler[0])
+            our_handler.device_alloc = <DeviceAllocType><intptr_t>(handler[1])
+            our_handler.device_free = <DeviceFreeType><intptr_t>(handler[2])
+        elif len(handler) == 3:
+            # handler = (malloc, free, name)
+            assert (callable(handler[0]) and callable(handler[1]))
+            ctx = (handler[0], handler[1])
+            owner_pyobj[handle] = ctx  # keep it alive
+            our_handler.ctx = <void*>ctx
+            our_handler.device_alloc = cuqnt_alloc_wrapper
+            our_handler.device_free = cuqnt_free_wrapper
+        else:
+            raise ValueError("handler must be a sequence of length 3 or 4, "
+                             "see the documentation for detail")
+    else:
+        raise NotImplementedError("handler format not recognized")
+
+    with nogil:
+        status = custatevecSetDeviceMemHandler(<_Handle>handle, handlerPtr)
+    check_status(status)
+
+
+cpdef tuple get_device_mem_handler(intptr_t handle):
+    """ Get the device memory handler for cuStateVec.
+
+    Args:
+        handle (intptr_t): The library handle.
+
+    Returns:
+        tuple:
+            The ``handler`` object, which has two forms:
+
+              - If ``handler`` is a 3-tuple, it is interpreted as ``(malloc, free,
+                name)``, where the first two objects are Python *callables*, and ``name``
+                is the name of the handler. This 3-tuple handler would compare equal
+                (element-wise) to the one previously passed to :func:`set_device_mem_handler`.
+              - If ``handler`` is a 4-tuple, it is interpreted as ``(ctx, device_alloc,
+                device_free, name)``, where the first three elements are the pointer
+                addresses (:class:`int`) of the corresponding members. ``name`` is the
+                same as above.
+
+    .. seealso:: `custatevecGetDeviceMemHandler`
+    """
+    cdef _DeviceMemHandler handler
+    with nogil:
+        status = custatevecGetDeviceMemHandler(<_Handle>handle, &handler)
+    check_status(status)
+
+    cdef tuple ctx
+    cdef bytes name = handler.name
+    if (handler.device_alloc == cuqnt_alloc_wrapper and
+            handler.device_free == cuqnt_free_wrapper):
+        ctx = <object>(handler.ctx)
+        return (ctx[0], ctx[1], name.decode('ascii'))
+    else:
+        # TODO: consider other possibilities?
+        return (<intptr_t>handler.ctx,
+                <intptr_t>handler.device_alloc,
+                <intptr_t>handler.device_free,
+                name.decode('ascii'))
+
+
+# can't be cpdef because args & kwargs can't be handled in a C signature
+def logger_set_callback_data(callback, *args, **kwargs):
+    """Set the logger callback along with arguments.
+
+    Args:
+        callback: A Python callable with the following signature (no return):
+
+          - ``callback(log_level, func_name, message, *args, **kwargs)``
+
+          where ``log_level`` (:class:`int`), ``func_name`` (:class:`str`), and
+          ``message`` (:class:`str`) are provided by the logger API.
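+
+    A minimal sketch (illustrative; ``my_logger`` is a placeholder name)::
+
+        def my_logger(log_level, func_name, message):
+            print(f"[{log_level}] {func_name}: {message}")
+
+        logger_set_callback_data(my_logger)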
+
+    .. seealso:: `custatevecLoggerSetCallbackData`
+    """
+    func_arg = (callback, args, kwargs)
+    # if only set once, the callback lifetime should be as long as this module,
+    # because we don't know when the logger is done using it
+    owner_pyobj['callback'] = func_arg
+    with nogil:
+        status = custatevecLoggerSetCallbackData(
+            logger_callback_with_data, <void*>(func_arg))
+    check_status(status)
+
+
+cpdef logger_open_file(filename):
+    """Set the filename for the logger to write to.
+
+    Args:
+        filename (str): The log filename.
+
+    .. seealso:: `custatevecLoggerOpenFile`
+    """
+    cdef bytes name = filename.encode()
+    cdef char* name_ptr = name
+    with nogil:
+        status = custatevecLoggerOpenFile(name_ptr)
+    check_status(status)
+
+
+cpdef logger_set_level(int level):
+    """Set the logging level.
+
+    Args:
+        level (int): The logging level.
+
+    .. seealso:: `custatevecLoggerSetLevel`
+    """
+    with nogil:
+        status = custatevecLoggerSetLevel(level)
+    check_status(status)
+
+
+cpdef logger_set_mask(int mask):
+    """Set the logging mask.
+
+    Args:
+        mask (int): The logging mask.
+
+    .. seealso:: `custatevecLoggerSetMask`
+    """
+    with nogil:
+        status = custatevecLoggerSetMask(mask)
+    check_status(status)
+
+
+cpdef logger_force_disable():
+    """Disable the logger.
+
+    .. seealso:: `custatevecLoggerForceDisable`
+    """
+    with nogil:
+        status = custatevecLoggerForceDisable()
+    check_status(status)
+
+
 class Pauli(IntEnum):
     """See `custatevecPauli_t`."""
     I = CUSTATEVEC_PAULI_I
@@ -1518,7 +1957,6 @@ class MatrixLayout(IntEnum):
     COL = CUSTATEVEC_MATRIX_LAYOUT_COL
     ROW = CUSTATEVEC_MATRIX_LAYOUT_ROW
 
-# unused in beta 1
 class MatrixType(IntEnum):
     """See `custatevecMatrixType_t`."""
     GENERAL = CUSTATEVEC_MATRIX_TYPE_GENERAL
@@ -1544,3 +1982,7 @@ MAJOR_VER = CUSTATEVEC_VER_MAJOR
 MINOR_VER = CUSTATEVEC_VER_MINOR
 PATCH_VER = CUSTATEVEC_VER_PATCH
 VERSION = CUSTATEVEC_VERSION
+
+
+# who owns a reference to user-provided Python objects (k: owner, v: object)
+cdef dict owner_pyobj = {}
diff --git a/python/cuquantum/cutensornet/__init__.py b/python/cuquantum/cutensornet/__init__.py
index c5eecc9..93d6a78 100644
--- a/python/cuquantum/cutensornet/__init__.py
+++ b/python/cuquantum/cutensornet/__init__.py
@@ -1,3 +1,4 @@
 from cuquantum.cutensornet.cutensornet import *
-from cuquantum.cutensornet.tensor_network import *
 from cuquantum.cutensornet.configuration import *
+from cuquantum.cutensornet.memory import *
+from cuquantum.cutensornet.tensor_network import *
diff --git a/python/cuquantum/cutensornet/_internal/einsum_parser.py b/python/cuquantum/cutensornet/_internal/einsum_parser.py
index 2780510..81466d1 100644
--- a/python/cuquantum/cutensornet/_internal/einsum_parser.py
+++ b/python/cuquantum/cutensornet/_internal/einsum_parser.py
@@ -2,33 +2,100 @@
 A collection of functions for parsing Einsum expressions.
 """
 
+from collections import Counter
+from itertools import chain
+
 import numpy as np
 
+from . import formatters
 from .tensor_wrapper import wrap_operands
 
 
-def parse_einsum_str(expr):
+native_to_str = lambda native : "'" + ''.join(s if s is not Ellipsis else '...' for s in native) + "'"
+
+def select_morpher(interleaved, mapper=None):
     """
-    Parse einsum expression. Note that no validity checks are performed.
+    Select the appropriate function for mode label representation based on string or interleaved format.
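+
+    For example (illustrative)::
+
+        >>> select_morpher(False)(('a', Ellipsis, 'b'))
+        "'a...b'"
+        >>> select_morpher(True)(('a', Ellipsis, 'b'))
+        ('a', Ellipsis, 'b')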
+ """ + if mapper is None: + return (lambda s : tuple(s)) if interleaved else native_to_str + + return (lambda s : tuple(mapper(s))) if interleaved else lambda s : native_to_str(mapper(s)) - Return operand as well as output indices if explicit mode or None for implicit mode. + +class ModeLabelMapper(object): + """ + Map mode labels, with special treatment for Ellipsis characters. """ - inputs, output = expr.split('->') if "->" in expr else (expr, None) + def __init__(self, _map): + """ + Args: + _map = dict-like object to map mode labels. + """ + self._map = _map - ellipses = '...' in inputs - if ellipses: - raise ValueError("Ellipsis broadcasting is not supported.") + def __call__(self, sequence): + return tuple(s if s is Ellipsis else self._map[s] for s in sequence) + + +def parse_einsum_str(expr): + """ + Parse einsum expression in string format, retaining ellipses if present. - inputs = tuple(tuple(_input) for _input in inputs.split(",")) + Return operand as well as output mode labels if explicit form or None for implicit form. + """ + disallowed_labels = set(['.', '-', '>']) + + inputs, output, *rest = expr.split('->') if "->" in expr else (expr, None) + if rest: + raise ValueError("""Invalid expression. +It is not permitted to specify more than one '->' in the Einstein summation expression.""") + + def parse_single(single): + """ + Parse single operand mode labels considering ellipsis. Leading or trailing whitespace, if present, is removed. + """ + subexpr = single.strip().split('...') + n = len(subexpr) + expr = [[Ellipsis]] * (2*n - 1) + expr[::2] = subexpr + + return tuple(chain(*expr)) + + def check_single(single): + """ + Check for disallowed characters used as mode labels for a single operand. + """ + for s in single: + if s is Ellipsis: + continue + if s.isspace() or s in disallowed_labels: + return False + + return True + + inputs = list(parse_single(_input) for _input in inputs.split(",")) + if output is not None: + output = parse_single(output) + + checks = [check_single(_input) for _input in inputs] + if not all(checks): + incorrect = [f"{location}: {native_to_str(inputs[location])}" + for location, predicate in enumerate(checks) if predicate is False] + incorrect = formatters.array2string(incorrect) + message = f"""Incorrect term. +Whitespace characters and characters from the set {disallowed_labels} cannot be used as mode labels in a summation expression. +The incorrectly specified terms as a sequence of "position: term" are: \n{incorrect}""" + raise ValueError(message) return inputs, output def parse_einsum_interleaved(operand_sublists): """ - Parse einsum expression in interleaved format. Note that no validity checks are performed. + Parse einsum expression in interleaved format, retaining ellipses if present. - Return operands as well as output indices if explicit mode or None for implicit mode. + Return operands as well as output mode labels if explicit form or None for implicit form. 
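+
+    For example (illustrative), ``parse_einsum_interleaved([a, ('i', 'j'), b, ('j', 'k'), ('i', 'k')])``
+    returns ``([a, b], [('i', 'j'), ('j', 'k')], ('i', 'k'))``.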
""" inputs = list() operands = list() @@ -37,22 +104,67 @@ def parse_einsum_interleaved(operand_sublists): for i in range(N): operands.append(operand_sublists[2*i]) inputs.append(operand_sublists[2*i + 1]) - + N = len(operand_sublists) output = operand_sublists[N-1] if N % 2 == 1 else None - ellipses = [Ellipsis in _input for _input in inputs] - if any(ellipses): - raise ValueError("Ellipsis broadcasting is not supported.") - return operands, inputs, output -def map_modes(user_inputs, user_output): +def check_ellipses(user_inputs, morpher): """ - Map modes in user-defined inputs and output to ordinals. Create the forward as well as inverse maps. + Check ellipsis specification for validity. - Return mapped inputs and output along with the forward and reverse maps. + Args: + user_inputs: Einsum expression in "neutral format" (sequence of sequences) before mapping. + morpher: A callable that transforms a term in neutral format (sequence) to string or interleaved format. + """ + + checks = [user_input.count(Ellipsis) <= 1 for user_input in user_inputs] + if not all(checks): + incorrect = [f"{location}: {morpher(user_inputs[location])}" + for location, predicate in enumerate(checks) if predicate is False] + incorrect = formatters.array2string(incorrect) + message = f"""Incorrect ellipsis use. +There must not be more than one ellipsis present in each term. +The incorrectly specified terms as a sequence of "position: term" are: \n{incorrect}""" + raise ValueError(message) + + +def check_einsum_with_operands(user_inputs, operands, morpher): + """ + Check that the number of modes in each Einsum term is consistent with the shape of the corresponding operand. + + Args: + operands: Wrapped operands. + user_inputs: Einsum expression in "neutral format" (sequence of sequences) before mapping. + morpher: A callable that transforms a term in neutral format (sequence) to string or interleaved format. + """ + + checks = [len(i) - 1 <= len(o.shape) if Ellipsis in i else len(i) == len(o.shape) for i, o in zip(user_inputs, operands)] + if not all(checks): + mismatch = [f"{location}: {morpher(user_inputs[location])} <=> {operands[location].shape}" + for location, predicate in enumerate(checks) if predicate is False] + mismatch = formatters.array2string(mismatch) + message = f"""Term-operand shape mismatch. +The number of mode labels in each term of the expression must match the shape of the corresponding operand. +The mismatch as a sequence of "position: mode labels in term <=> operand shape" is: \n{mismatch}""" + raise ValueError(message) + + +def map_modes(user_inputs, user_output, num_extra_labels, morpher): + """ + Map modes in user-defined inputs and output to ordinals, leaving ellipsis for later processing. Create extra mode labels + in anticipation of ellipsis replacement. Create the forward as well as inverse maps. + + Args: + user_inputs: Einsum expression in "neutral format" (sequence of sequences) before mapping. + user_output: The output mode labels before mapping as a sequence or None. + num_extra_labels: The number of extra mode labels to generate to use in ellipsis expansion later. + morpher: A callable that transforms a term in neutral format (sequence) to string or interleaved format. + + Returns: + tuple: A 5-tuple containing (mapped input, mapped output, forward map, reverse map, largest label). 
""" ordinal = 0 @@ -63,46 +175,36 @@ def map_modes(user_inputs, user_output): mode_map_user_to_ord[mode] = ordinal ordinal += 1 - mode_map_ord_to_user = { v : k for k, v in mode_map_user_to_ord.items() } + mode_map_user_to_ord.update((f'__{i}__', i) for i in range(ordinal, ordinal+num_extra_labels)) + label_end = ordinal + num_extra_labels + + mode_map_ord_to_user = {v : k for k, v in mode_map_user_to_ord.items()} - inputs = tuple(tuple(mode_map_user_to_ord[m] for m in modes) for modes in user_inputs) + inputs = list(tuple(m if m is Ellipsis else mode_map_user_to_ord[m] for m in modes) for modes in user_inputs) output = None if user_output is not None: - extra = set(user_output) - set(mode_map_user_to_ord.keys()) + extra = set(user_output) - set(mode_map_user_to_ord.keys()) - set([Ellipsis]) if extra: - output_modes = "'{}'".format(user_output) if isinstance(user_output, str) else user_output + output_modes = morpher(user_output) message = f"""Extra modes in output. The specified output modes {output_modes} contain the extra modes: {extra}""" raise ValueError(message) - output = tuple(mode_map_user_to_ord[m] for m in user_output) + output = tuple(m if m is Ellipsis else mode_map_user_to_ord[m] for m in user_output) - return inputs, output, mode_map_user_to_ord, mode_map_ord_to_user + return inputs, output, mode_map_user_to_ord, mode_map_ord_to_user, label_end -def check_einsum_with_operands(user_inputs, operands, interleaved): - """ - Check that the number of modes in each Einsum term is consistent with the shape of the corresponding operand. - operands == wrapped - user_inputs = *before* mapping +def create_size_dict(inputs, operands): """ + Create size dictionary (mode label to extent map) capturing the extent of each mode. - checks = [len(i) == len(o.shape) for i, o in zip(user_inputs, operands)] - if not all(checks): - morpher = (lambda s : tuple(s)) if interleaved else lambda s : "'" + ''.join(s) + "'" - mismatch = [f"{location}: {morpher(user_inputs[location])} <=> {operands[location].shape}" - for location, predicate in enumerate(checks) if predicate is False] - mismatch = np.array2string(np.array(mismatch, dtype='object'), separator=', ', formatter={'object': lambda s: s}) - message = f"""Term-operand shape mismatch. -The number of modes in each term of the expression must match the shape of the corresponding operand. -The mismatch in the number of modes as a sequence of "operand position: modes in term <=> operand shape" is: \n{mismatch}""" - raise ValueError(message) - + Args: + inputs: Einsum expression in "neutral format" (sequence of sequences) after relabelling modes. + operands: Wrapped operands. -def create_size_dict(inputs, operands): - """ - Create size dictionary capturing the extent of each mode. - inputs = based on renumbered modes. + Returns: + size_dict: size dictionary. """ size_dict = dict() @@ -112,7 +214,7 @@ def create_size_dict(inputs, operands): if mode in size_dict: if size_dict[mode] == 1: # Handle broadcasting size_dict[mode] = shape[m] - elif size_dict[mode] != shape[m]: + elif size_dict[mode] != shape[m] and shape[m] != 1: message = f"""Extent mismatch. The extent ({shape[m]}) of mode {m} for operand {i} does not match the extent ({size_dict[mode]}) of the same mode found in previous operand(s).""" @@ -123,76 +225,85 @@ def create_size_dict(inputs, operands): return size_dict -def calculate_mode_frequency(inputs): - """ - Calculate the number of times a mode appears in the operand list. 
+def infer_output_mode_labels(inputs): """ - from collections import defaultdict - mode_frequency = defaultdict(int) + Infer output mode labels (those that appear exactly once). - for index, modes in enumerate(inputs): - for mode in modes: - mode_frequency[mode] += 1 + Args: + inputs: Einsum expression in "neutral format" (sequence of sequences) after relabelling modes. + """ + mode_label_freq = Counter(chain(*inputs)) + del mode_label_freq[Ellipsis] - return mode_frequency + return tuple(sorted(m for m, c in mode_label_freq.items() if c == 1)) -def check_classical_einsum(mode_frequency, output, mode_map_user_to_ord, mode_map_ord_to_user): +def process_ellipses(inputs, output, operands, label_end, mapping_morpher): """ - Check if classical Einsum. Also infer output indices (all the modes that appear exactly once). + Replace ellipses by generated mode labels, using 'label_end' and aligning shapes from the right. Infer or update + output mode labels. + + Args: + inputs: Einsum expression in "neutral format" (sequence of sequences) after relabelling modes. + output: The output mode labels after relabelling as a sequence or None. + operands: Wrapped operands. + label_end: One past the largest mode label (int), including modes resulting from Ellipsis expansion. + mapping_morpher: A callable that transforms a term in neutral format (sequence) to string or interleaved format, + while converting internal labels to user labels. + + Returns: + tuple: a 2-tuple (inputs, output) after ellipsis expansion and inferring output mode labels if needed. """ - single_modes = set() - double_modes = set() - rest = set() - for mode, frequency in mode_frequency.items(): - if frequency == 1: - single_modes.add(mode) - elif frequency == 2: - double_modes.add(mode) - else: - rest.add(mode) - - if rest: - rest = tuple(mode_map_ord_to_user[r] for r in rest) - message = f"""No generalized Einsum support. -These modes appear more than twice: {rest}""" - raise ValueError(message) - + inferred = False if output is None: - # Implicit mode: lexical sort based on user mode labels. - output = sorted(mode_map_ord_to_user[m] for m in single_modes) - output = tuple(mode_map_user_to_ord[m] for m in output) - return output + output = infer_output_mode_labels(inputs) + inferred = True - output_set = set(output) + shortest, longest = label_end, 0 + for i, _input in enumerate(inputs): + if Ellipsis not in _input: + continue - missing = set(mode_map_ord_to_user[m] for m in single_modes - output_set) - if missing: - message = f"""No generalized Einsum support. -These single modes must appear in the output: {missing}""" - raise ValueError(message) + n = len(operands[i].shape) - (len(_input) - 1) + assert n >= 0, "Internal error" - common = set(mode_map_ord_to_user[c] for c in output_set & double_modes) - if common: - message = f"""No generalized Einsum support. -These double modes must not appear in the output: {common}""" - raise ValueError(message) + s = _input.index(Ellipsis) + shortest, longest = min(shortest, n), max(longest, n) + inputs[i] = _input[:s] + tuple(range(label_end-n, label_end)) + _input[s+1:] - return output + if not inferred: + count = output.count(Ellipsis) + if count > 1: + message = f"""Incorrect ellipsis use. +The output term cannot have more than one ellipsis. Specified term = {mapping_morpher(output)}""" + raise ValueError(message) + if count == 1: # Replace ellipsis by the longest sequence of labels. 
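+            # Expand the output ellipsis to the widest extent seen among the
+            # inputs; right-aligned expansion preserves broadcast dimensions.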
+ s = output.index(Ellipsis) + output = output[:s] + tuple(range(label_end-longest, label_end)) + output[s+1:] + else: # If all ellipses expand to the same number of mode labels, the latter are reduced. + if shortest != longest: + message = f"""Ellipsis length mismatch for reduction. +The ellipses specified in the expression do not expand to the same number of mode labels and thus cannot be reduced. The +expanded number of dimensions ranges from {shortest} to {longest}.""" + raise ValueError(message) + else: # The mode labels corresponding to ellipsis expansion followed by the inferred mode labels. + output = tuple(range(label_end-longest, label_end)) + output + + return inputs, output def parse_einsum(*operands): """ - Classical Einsum definition: modes that appear twice are summed over and those that appear once must appear in the output. - Recognizes both string and interleaved formats. Any hashable type is accepted in interleaved format for mode specification, - and unicode strings are accepted. If the output is not provided (implicit form or missing output sublist), it will be - inferred from the expression. + Parse the generalized Einstein summation expression in both string and interleaved formats. Any hashable and comparable + object is accepted in the interleaved format for mode label specification, and unicode strings are accepted. If the + output is not provided (implicit form or missing output sublist), it will be inferred from the expression. - Returns wrapped operands, mapped inputs and output, size dictionary based on internal mode numbers, and the forward as + Returns wrapped operands, mapped inputs and output, size dictionary based on internal mode numbers, and the forward as well as the reverse mode maps. """ + # Parse einsum keeping ellipses. interleaved = False if isinstance(operands[0], str): inputs, output = parse_einsum_str(operands[0]) @@ -207,28 +318,49 @@ def parse_einsum(*operands): The number of operands ({num_operand}) must match the number of inputs ({num_input}) specified in the Einsum expression.""" raise ValueError(message) - if num_operand < 2: - message = "The network must consist of at least two tensors." + morpher = select_morpher(interleaved) + + if num_operand < 1: + message = "The network must consist of at least one tensor." raise ValueError(message) # First wrap operands. operands = wrap_operands(operands) - # Basic check to ensure that the number of modes is consistent with the operand shape. - check_einsum_with_operands(inputs, operands, interleaved) + # Preliminary checks, before mode label remapping. + + ellipses = any(Ellipsis in _input for _input in inputs) + + # Ensure at most one ellipsis per operand. + if ellipses: + check_ellipses(inputs, morpher) + + # Ensure that ellipsis is not present only in the output. + if not ellipses and output is not None and Ellipsis in output: + message = f"""Invalid ellipsis specification. +The output term {morpher(output)} contains ellipsis while none of the input terms do.""" + raise ValueError(message) + + # Ensure that the number of modes is consistent with the operand shape. + check_einsum_with_operands(inputs, operands, morpher) + + # Calculate the maximum number of extra mode labels that will be needed. + num_extra_labels = max(len(o.shape) for o in operands) if ellipses else 0 # Map data to ordinals for cutensornet. 
- inputs, output, mode_map_user_to_ord, mode_map_ord_to_user = map_modes(inputs, output) + inputs, output, mode_map_user_to_ord, mode_map_ord_to_user, label_end = map_modes(inputs, output, num_extra_labels, morpher) - # Create mode-extent map based on internal mode numbers. - size_dict = create_size_dict(inputs, operands) + mapper = ModeLabelMapper(mode_map_ord_to_user) + mapping_morpher = select_morpher(interleaved, mapper) - # Create output modes if not specified. - mode_frequency = calculate_mode_frequency(inputs) + # Ellipsis expansion. + if ellipses: + inputs, output = process_ellipses(inputs, output, operands, label_end, mapping_morpher) + elif output is None: + output = infer_output_mode_labels(inputs) - # Finally, check if the expression is a classical Einsum. Calculate output indices in implicit mode (output=None). - output = check_classical_einsum(mode_frequency, output, mode_map_user_to_ord, mode_map_ord_to_user) + # Create mode-extent map based on internal mode numbers. + size_dict = create_size_dict(inputs, operands) return operands, inputs, output, size_dict, mode_map_user_to_ord, mode_map_ord_to_user - diff --git a/python/cuquantum/cutensornet/_internal/formatters.py b/python/cuquantum/cutensornet/_internal/formatters.py new file mode 100644 index 0000000..93e294d --- /dev/null +++ b/python/cuquantum/cutensornet/_internal/formatters.py @@ -0,0 +1,44 @@ +""" +Formatters for printing data. +""" + +import numpy as np + +class MemoryStr(object): + """ + A simple type to pretty-print memory-like values. + """ + + def __init__(self, memory, base_unit='B'): + self.memory = memory + self.base_unit = base_unit + self.base = 1024 + + def __str__(self): + """ + Convert large values to powers of 1024 for readability. + """ + + base, base_unit, memory = self.base, self.base_unit, self.memory + + if memory < base: + value, unit = memory, base_unit + elif memory < base**2: + value, unit = memory/base, f'Ki{base_unit}' + elif memory < base**3: + value, unit = memory/base**2, f'Mi{base_unit}' + else: + value, unit = memory/base**3, f'Gi{base_unit}' + + return f"{value:0.2f} {unit}" + + +def array2string(array_like): + """ + String representation of an array-like object with possible truncation of "interior" values to limit string size. + + The NumPy function "set_printoptions" can be used to control the display of the array. + """ + + return np.array2string(np.asanyarray(array_like, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + diff --git a/python/cuquantum/cutensornet/_internal/package_ifc.py b/python/cuquantum/cutensornet/_internal/package_ifc.py new file mode 100644 index 0000000..2bd4f0a --- /dev/null +++ b/python/cuquantum/cutensornet/_internal/package_ifc.py @@ -0,0 +1,66 @@ +""" +An abstract interface to certain package-provided operations. +""" + +__all__ = ['Package'] + +from abc import ABC, abstractmethod + + +class Package(ABC): + + @staticmethod + @abstractmethod + def get_current_stream(device_id): + """ + Obtain the current stream on the device. + + Args: + device_id: The id (ordinal) of the device. + """ + raise NotImplementedError + + @staticmethod + @abstractmethod + def to_stream_pointer(stream): + """ + Obtain the stream pointer. + + Args: + stream: The stream object. + """ + raise NotImplementedError + + @staticmethod + @abstractmethod + def to_stream_context(stream): + """ + Create a context manager from the stream. + + Args: + stream: The stream object. 
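+
+        Returns:
+            A package-native object that can be used as a context manager in a ``with`` statement.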
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def create_external_stream(device_id, stream_ptr):
+        """
+        Wrap a stream pointer into an external stream object.
+
+        Args:
+            device_id: The id (ordinal) of the device.
+            stream_ptr: The stream pointer (int) to be wrapped.
+        """
+        raise NotImplementedError
+
+    @staticmethod
+    @abstractmethod
+    def create_stream(device_id):
+        """
+        Create a new stream on the specified device.
+
+        Args:
+            device_id: The id (ordinal) of the device.
+        """
+        raise NotImplementedError
diff --git a/python/cuquantum/cutensornet/_internal/package_ifc_cupy.py b/python/cuquantum/cutensornet/_internal/package_ifc_cupy.py
new file mode 100644
index 0000000..b64f71e
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/package_ifc_cupy.py
@@ -0,0 +1,36 @@
+"""
+Interface to CuPy operations.
+"""
+
+__all__ = ['CupyPackage']
+
+import cupy as cp
+
+from .package_ifc import Package
+
+
+class CupyPackage(Package):
+
+    @staticmethod
+    def get_current_stream(device_id):
+        with cp.cuda.Device(device_id):
+            stream = cp.cuda.get_current_stream()
+        return stream
+
+    @staticmethod
+    def to_stream_pointer(stream):
+        return stream.ptr
+
+    @staticmethod
+    def to_stream_context(stream):
+        return stream
+
+    @staticmethod
+    def create_external_stream(device_id, stream_ptr):
+        return cp.cuda.ExternalStream(stream_ptr)
+
+    @staticmethod
+    def create_stream(device_id):
+        with cp.cuda.Device(device_id):
+            stream = cp.cuda.Stream(null=False, non_blocking=False, ptds=False)
+        return stream
diff --git a/python/cuquantum/cutensornet/_internal/package_ifc_torch.py b/python/cuquantum/cutensornet/_internal/package_ifc_torch.py
new file mode 100644
index 0000000..91281a5
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/package_ifc_torch.py
@@ -0,0 +1,34 @@
+"""
+Interface to Torch operations.
+"""
+
+__all__ = ['TorchPackage']
+
+import torch
+
+from .package_ifc import Package
+
+
+class TorchPackage(Package):
+
+    @staticmethod
+    def get_current_stream(device_id):
+        return torch.cuda.current_stream(device=device_id)
+
+    @staticmethod
+    def to_stream_pointer(stream):
+        return stream.cuda_stream
+
+    @staticmethod
+    def to_stream_context(stream):
+        return torch.cuda.stream(stream)
+
+    @staticmethod
+    def create_external_stream(device_id, stream_ptr):
+        return torch.cuda.ExternalStream(stream_ptr, device=device_id)
+
+    @staticmethod
+    def create_stream(device_id):
+        stream = torch.cuda.Stream(device=device_id)
+        return stream
+
diff --git a/python/cuquantum/cutensornet/_internal/package_wrapper.py b/python/cuquantum/cutensornet/_internal/package_wrapper.py
new file mode 100644
index 0000000..f91217a
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/package_wrapper.py
@@ -0,0 +1,16 @@
+"""
+Create map from package names to package interface objects.
+"""
+
+__all__ = ['PACKAGE']
+
+from .package_ifc_cupy import CupyPackage
+
+PACKAGE = {'cupy': CupyPackage}
+try:
+    import torch
+    from .package_ifc_torch import TorchPackage
+    PACKAGE['torch'] = TorchPackage
+except ImportError:
+    pass
+
diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py b/python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py
new file mode 100644
index 0000000..9c566fb
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/tensor_ifc_cupy.py
@@ -0,0 +1,93 @@
+"""
+Interface to seamlessly use Cupy ndarray objects.
+"""
+
+__all__ = ['CupyTensor']
+
+import cupy
+import numpy
+
+from .tensor_ifc import Tensor
+
+
+class CupyTensor(Tensor):
+    """
+    Tensor wrapper for cupy ndarrays.
+    """
+    name = 'cupy'
+    module = cupy
+    name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: cupy.dtype(name), exception_type=TypeError)
+
+    def __init__(self, tensor):
+        super().__init__(tensor)
+
+    @property
+    def data_ptr(self):
+        return self.tensor.data.ptr
+
+    @property
+    def device(self):
+        return 'cuda'
+
+    @property
+    def device_id(self):
+        return self.tensor.device.id
+
+    @property
+    def dtype(self):
+        """Name of the data type"""
+        return self.tensor.dtype.name
+
+    @property
+    def shape(self):
+        return tuple(self.tensor.shape)
+
+    @property
+    def strides(self):
+        return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides)
+
+    def numpy(self):
+        return self.tensor.get()
+
+    @classmethod
+    def empty(cls, shape, **context):
+        """
+        Create an empty tensor of the specified shape and data type.
+        """
+        name = context.get('dtype', 'float32')
+        dtype = CupyTensor.name_to_dtype[name]
+        device = context.get('device', None)
+        with cupy.cuda.Device(device=device):
+            tensor = cupy.empty(shape, dtype=dtype)
+
+        return tensor
+
+    def to(self, device='cpu'):
+        """
+        Create a copy of the tensor on the specified device (integer or
+        'cpu'). Copy to a Numpy ndarray if CPU, otherwise return a Cupy ndarray.
+        """
+        if device == 'cpu':
+            return self.numpy()
+
+        if not isinstance(device, int):
+            raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.")
+
+        with cupy.cuda.Device(device):
+            tensor_device = cupy.asarray(self.tensor)
+
+        return tensor_device
+
+    def copy_(self, src):
+        """
+        Inplace copy of src (copy the data from src into self).
+        """
+
+        cupy.copyto(self.tensor, src)
+
+    def istensor(self):
+        """
+        Check if the object is ndarray-like.
+        """
+        return isinstance(self.tensor, cupy.ndarray)
+
diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py b/python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py
new file mode 100644
index 0000000..3286c24
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/tensor_ifc_numpy.py
@@ -0,0 +1,82 @@
+"""
+Interface to seamlessly use Numpy ndarray objects.
+"""
+
+__all__ = ['NumpyTensor']
+
+import cupy
+import numpy
+
+from .tensor_ifc import Tensor
+
+class NumpyTensor(Tensor):
+    """
+    Tensor wrapper for numpy ndarrays.
+    """
+    name = 'numpy'
+    module = numpy
+    name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: numpy.dtype(name), exception_type=TypeError)
+
+    def __init__(self, tensor):
+        super().__init__(tensor)
+
+    @property
+    def data_ptr(self):
+        return self.tensor.ctypes.data
+
+    @property
+    def device(self):
+        return 'cpu'
+
+    @property
+    def device_id(self):
+        return None
+
+    @property
+    def dtype(self):
+        """Name of the data type"""
+        return self.tensor.dtype.name
+
+    @property
+    def shape(self):
+        return tuple(self.tensor.shape)
+
+    @property
+    def strides(self):
+        return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides)
+
+    def numpy(self):
+        return self.tensor
+
+    @classmethod
+    def empty(cls, shape, **context):
+        """
+        Create an empty tensor of the specified shape and data type.
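+        The ``context`` keyword arguments may include ``dtype``, the name of a
+        data type (default ``'float32'``); any device key is ignored for NumPy.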
+        """
+        name = context.get('dtype', 'float32')
+        dtype = NumpyTensor.name_to_dtype[name]
+        return numpy.empty(shape, dtype=dtype)
+
+    def to(self, device='cpu'):
+        """
+        Create a copy of the tensor on the specified device (integer or
+        'cpu'). Copy to a Cupy ndarray on the specified device if it
+        is not CPU. Otherwise, return self.
+        """
+        if device == 'cpu':
+            return self
+
+        if not isinstance(device, int):
+            raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.")
+
+        with cupy.cuda.Device(device):
+            tensor_device = cupy.asarray(self.tensor)
+
+        return tensor_device
+
+    def istensor(self):
+        """
+        Check if the object is ndarray-like.
+        """
+        return isinstance(self.tensor, numpy.ndarray)
+
diff --git a/python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py b/python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py
new file mode 100644
index 0000000..234dee6
--- /dev/null
+++ b/python/cuquantum/cutensornet/_internal/tensor_ifc_torch.py
@@ -0,0 +1,88 @@
+"""
+Interface to seamlessly use Torch tensor objects.
+"""
+
+__all__ = ['TorchTensor']
+
+import torch
+
+from . import typemaps
+from .tensor_ifc import Tensor
+
+
+class TorchTensor(Tensor):
+    """
+    Tensor wrapper for Torch Tensors.
+    """
+    name = 'torch'
+    module = torch
+    name_to_dtype = Tensor.create_name_dtype_map(conversion_function=lambda name: getattr(torch, name), exception_type=AttributeError)
+
+    def __init__(self, tensor):
+        super().__init__(tensor)
+
+    @property
+    def data_ptr(self):
+        return self.tensor.data_ptr()
+
+    @property
+    def device(self):
+        return str(self.tensor.device).split(':')[0]
+
+    @property
+    def device_id(self):
+        return self.tensor.device.index
+
+    @property
+    def dtype(self):
+        """Name of the data type"""
+        return str(self.tensor.dtype).split('.')[-1]
+
+    @property
+    def shape(self):
+        return tuple(self.tensor.shape)
+
+    @property
+    def strides(self):
+        return self.tensor.stride()
+
+    def numpy(self):
+        return self.tensor.cpu().numpy()
+
+    @classmethod
+    def empty(cls, shape, **context):
+        """
+        Create an empty tensor of the specified shape and data type on the specified device (None, 'cpu', or device id).
+        """
+        name = context.get('dtype', 'float32')
+        dtype = TorchTensor.name_to_dtype[name]
+        device = context.get('device', None)
+        tensor = torch.empty(shape, dtype=dtype, device=device)
+
+        return tensor
+
+    def to(self, device='cpu'):
+        """
+        Create a copy of the tensor on the specified device (integer or
+        'cpu') and return it as a Torch tensor.
+        """
+        if not(device == 'cpu' or isinstance(device, int)):
+            raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device}'.")
+
+        tensor_device = self.tensor.to(device=device)
+
+        return tensor_device
+
+    def copy_(self, src):
+        """
+        Inplace copy of src (copy the data from src into self).
+        """
+
+        self.tensor.copy_(src)
+
+    def istensor(self):
+        """
+        Check if the object is ndarray-like.
+        """
+        return isinstance(self.tensor, torch.Tensor)
+
diff --git a/python/cuquantum/cutensornet/_internal/tensor_wrapper.py b/python/cuquantum/cutensornet/_internal/tensor_wrapper.py
index 257ef70..05e2a6c 100644
--- a/python/cuquantum/cutensornet/_internal/tensor_wrapper.py
+++ b/python/cuquantum/cutensornet/_internal/tensor_wrapper.py
@@ -6,8 +6,9 @@
 
 import numpy as np
 
-from .cupy_ifc import CupyTensor
-from .numpy_ifc import NumpyTensor
+from . 
import formatters +from .tensor_ifc_cupy import CupyTensor +from .tensor_ifc_numpy import NumpyTensor _TENSOR_TYPES = { @@ -18,7 +19,7 @@ # Optional modules try: import torch - from .torch_ifc import TorchTensor + from .tensor_ifc_torch import TorchTensor _TENSOR_TYPES['torch'] = TorchTensor except ImportError as e: pass @@ -51,9 +52,9 @@ def check_valid_package(native_operands): checks = [p in _SUPPORTED_PACKAGES for p in operands_pkg] if not all(checks): unknown = [f"{location}: {operands_pkg[location]}" for location, predicate in enumerate(checks) if predicate is False] - unknown = np.array2string(np.array(unknown, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + unknown = formatters.array2string(unknown) message = f"""The operands should be ndarray-like objects from one of {_SUPPORTED_PACKAGES} packages. -The unsupported operands as a sequence of "zero-based operand ordinal: package" is: \n{unknown}""" +The unsupported operands as a sequence of "position: package" is: \n{unknown}""" raise ValueError(message) return operands_pkg @@ -66,9 +67,9 @@ def check_valid_operand_type(wrapped_operands): if not all(istensor): unknown = [f"{location}: {type(wrapped_operands[location].tensor)}" for location, predicate in enumerate(istensor) if predicate is False] - unknown = np.array2string(np.array(unknown, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + unknown = formatters.array2string(unknown) message = f"""The operands should be ndarray-like objects from one of {_SUPPORTED_PACKAGES} packages. -The unsupported operands as a sequence of "zero-based operand ordinal: type" is: \n{unknown}""" +The unsupported operands as a sequence of "position: type" is: \n{unknown}""" raise ValueError(message) diff --git a/python/cuquantum/cutensornet/_internal/utils.py b/python/cuquantum/cutensornet/_internal/utils.py index 1c741ac..d795ec6 100644 --- a/python/cuquantum/cutensornet/_internal/utils.py +++ b/python/cuquantum/cutensornet/_internal/utils.py @@ -8,8 +8,10 @@ import cupy as cp import numpy as np -from . import tensor_wrapper +from . import formatters from . import mem_limit +from . import package_wrapper +from . import tensor_wrapper def infer_object_package(obj): """ @@ -37,38 +39,54 @@ def check_or_create_options(cls, options, options_description): return options -def get_or_create_stream(device, stream): +def _create_stream_ctx_ptr_cupy_stream(package_ifc, stream): """ - Create a stream object from a stream pointer or extract the stream pointer from a stream object. - Return the stream object as well as the stream pointer. + Utility function to create a stream context as a "package-native" object, get stream pointer as well as + create a cupy stream object. """ + stream_ctx = package_ifc.to_stream_context(stream) + stream_ptr = package_ifc.to_stream_pointer(stream) + stream = cp.cuda.ExternalStream(stream_ptr) - if stream is None: - with device: - stream = cp.cuda.get_current_stream() - stream_ptr = stream.ptr - return stream, stream_ptr + return stream, stream_ctx, stream_ptr - if isinstance(stream, int): - stream_ptr = stream - stream = cp.cuda.ExternalStream(stream_ptr) - return stream, stream_ptr +def get_or_create_stream(device, stream, op_package): + """ + Create a stream object from a stream pointer or extract the stream pointer from a stream object, or + use the current stream. - module = infer_object_package(stream) + Args: + device: The device (CuPy object) for the stream. + stream: A stream object, stream pointer, or None. 
+ op_package: The package the tensor network operands belong to. - if module not in ['cupy', 'torch']: - raise TypeError("The CUDA stream must be specified as a CuPy or Torch stream object. " - "Alternatively, the stream pointer can be directly provided as an int.") + Returns: + tuple: CuPy stream object, package stream context, stream pointer. + """ - if module == 'cupy': - stream_ptr = stream.ptr + device_id = device.id + op_package_ifc = package_wrapper.PACKAGE[op_package] + if stream is None: + stream = op_package_ifc.get_current_stream(device_id) + return _create_stream_ctx_ptr_cupy_stream(op_package_ifc, stream) - if module == 'torch': - stream_ptr = stream.cuda_stream + if isinstance(stream, int): + stream_ptr = stream + if op_package == 'torch': + message = "A stream object must be provided for PyTorch operands, not stream pointer." + raise TypeError(message) + stream_ctx = op_package_ifc.to_stream_context(stream) stream = cp.cuda.ExternalStream(stream_ptr) - return stream, stream_ptr + return stream, stream_ctx, stream_ptr + + stream_package = infer_object_package(stream) + if stream_package != op_package: + message = "The stream object must belong to the same package as the tensor network operands." + raise TypeError(message) + + return _create_stream_ctx_ptr_cupy_stream(op_package_ifc, stream) def get_memory_limit(memory_limit, device): @@ -119,24 +137,39 @@ def get_operands_data(operands): return op_data, alignments -def create_empty_tensor(cls, extents, dtype, device): +def create_empty_tensor(cls, extents, dtype, device_id, stream_ctx): """ Create a wrapped tensor of the same type as (the wrapped) cls on the specified device having the specified extents and dtype. + + The tensor is created within a stream context to allow for asynchronous memory allocators like + CuPy's MemoryAsyncPool. """ - tensor = cls.empty(extents, dtype=dtype, device=device) + with stream_ctx: + tensor = cls.empty(extents, dtype=dtype, device=device_id) tensor = tensor_wrapper.wrap_operand(tensor) return tensor -def create_output_tensor(cls, output, size_dict, device_id, data_type): +def create_output_tensor(cls, package, output, size_dict, device, data_type): """ - Create output tensor and associated data (modes, extents, strides, alignment) + Create output tensor and associated data (modes, extents, strides, alignment). This operation is + blocking and is safe to use with asynchronous memory pools. """ modes = tuple(m for m in output) extents = tuple(size_dict[m] for m in output) - output = create_empty_tensor(cls, extents, data_type, device_id) + package_ifc = package_wrapper.PACKAGE[package] + device_id = device.id + + stream = package_ifc.create_stream(device_id) + stream, stream_ctx, _ = _create_stream_ctx_ptr_cupy_stream(package_ifc, stream) + + with device: + start = stream.record() + output = create_empty_tensor(cls, extents, data_type, device_id, stream_ctx) + end = stream.record() + end.synchronize() strides = output.strides alignment = get_maximal_alignment(output.data_ptr) @@ -178,6 +211,17 @@ def get_maximal_alignment(address): return alignment +def get_operands_package(operands): + """ + Return the package name of the tensors. + """ + package = infer_object_package(operands[0].tensor) + if not all (infer_object_package(operand.tensor) == package for operand in operands): + packages = set(infer_object_package(operand.tensor) for operand in operands) + raise TypeError(f"All tensors in the network must be from the same library package. 
Packages found = {packages}.") + return package + + def check_operands_match(orig_operands, new_operands, attribute, description): """ Check if the specified attribute matches between the corresponding new and old operands, and raise an exception if it @@ -188,9 +232,9 @@ def check_operands_match(orig_operands, new_operands, attribute, description): if not all(checks): mismatch = [f"{location}: {getattr(orig_operands[location], attribute)} => {getattr(new_operands[location], attribute)}" for location, predicate in enumerate(checks) if predicate is False] - mismatch = np.array2string(np.array(mismatch, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + mismatch = formatters.array2string(mismatch) message = f"""The {description} of each new operand must match the {description} of the corresponding original operand. -The mismatch in {description} as a sequence of "operand position: original {description} => new {description}" is: \n{mismatch}""" +The mismatch in {description} as a sequence of "position: original {description} => new {description}" is: \n{mismatch}""" raise ValueError(message) @@ -203,30 +247,12 @@ def check_alignments_match(orig_alignments, new_alignments): if not all(checks): mismatch = [f"{location}: {orig_alignments[location]} => {new_alignments[location]}" for location, predicate in enumerate(checks) if predicate is False] - mismatch = np.array2string(np.array(mismatch, dtype='object'), separator=', ', formatter={'object': lambda s: s}) + mismatch = formatters.array2string(mismatch) message = f"""The data alignment of each new operand must match the data alignment of the corresponding original operand. -The mismatch in data alignment as a sequence of "operand position: original alignment => new alignment" is: \n{mismatch}""" +The mismatch in data alignment as a sequence of "position: original alignment => new alignment" is: \n{mismatch}""" raise ValueError(message) -def convert_memory_with_units(memory): - """ - Convert the provided memory value into a form suitable for printing. - """ - base = 1024 - - if memory < base: - value, unit = memory, 'B' - elif memory < base**2: - value, unit = memory/base, 'KiB' - elif memory < base**3: - value, unit = memory/base**2, 'MiB' - else: - value, unit = memory/base**3, 'GiB' - - return value, unit - - def check_autotune_params(iterations): """ Check if the autotune parameters are of the correct type and within range. @@ -242,6 +268,19 @@ def check_autotune_params(iterations): return message +def get_ptr_from_memory_pointer(mem_ptr): + """ + Access the value associated with one of the attributes 'device_ptr', 'device_pointer', 'ptr'. + """ + attributes = ('device_ptr', 'device_pointer', 'ptr') + for attr in attributes: + if hasattr(mem_ptr, attr): + return getattr(mem_ptr, attr) + + message = f"Memory pointer objects should have one of the following attributes specifying the device pointer: {attributes}" + raise AttributeError(message) + + # Decorator definitions def atomic(handler: Callable[[Optional[object]], None], method: bool = False) -> Callable: diff --git a/python/cuquantum/cutensornet/configuration.py b/python/cuquantum/cutensornet/configuration.py index d3945a0..e5e6cc8 100644 --- a/python/cuquantum/cutensornet/configuration.py +++ b/python/cuquantum/cutensornet/configuration.py @@ -1,4 +1,4 @@ -""" +""" A collection of types for defining options to cutensornet. 
""" @@ -7,13 +7,16 @@ import collections from dataclasses import dataclass from logging import Logger -from typing import Dict, Hashable, Iterable, Mapping, Optional, Tuple, Type, Union +from typing import Dict, Hashable, Iterable, Mapping, Optional, Tuple, Union import cupy as cp +import cuquantum from cuquantum import cutensornet as cutn from ._internal import enum_utils +from ._internal import formatters from ._internal.mem_limit import MEM_LIMIT_RE_PCT, MEM_LIMIT_RE_VAL, MEM_LIMIT_DOC +from .memory import BaseCUDAMemoryManager @dataclass @@ -25,18 +28,25 @@ class NetworkOptions(object): device_id: CUDA device ordinal (used if the tensor network resides on the CPU). Device 0 will be used if not specified. handle: cuTensorNet library handle. A handle will be created if one is not provided. logger (logging.Logger): Python Logger object. The root logger will be used if a logger object is not provided. - memory_limit: Maximum memory available to cuTensorNet. It can be specified as a value (with optional suffix like + memory_limit: Maximum memory available to cuTensorNet. It can be specified as a value (with optional suffix like K[iB], M[iB], G[iB]) or as a percentage. The default is 80%. + allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used to draw device memory. If an + allocator is not provided, a memory allocator from the library package will be used + (:func:`torch.cuda.caching_allocator_alloc` for PyTorch operands, :func:`cupy.cuda.alloc` otherwise). """ compute_type : Optional[int] = None device_id : Optional[int] = None handle : Optional[int] = None - logger : Optional[Type[Logger]] = None + logger : Optional[Logger] = None memory_limit : Optional[Union[int, str]] = r'80%' + allocator : Optional[BaseCUDAMemoryManager] = None def __post_init__(self): # Defer creating handle as well as computing the memory limit till we know the device the network is on. + if self.compute_type is not None: + self.compute_type = cuquantum.ComputeType(self.compute_type) + if self.device_id is None: self.device_id = 0 @@ -50,6 +60,8 @@ def __post_init__(self): if not (m1 or m2): raise ValueError(MEM_LIMIT_DOC % self.memory_limit) + if self.allocator is not None and not isinstance(self.allocator, BaseCUDAMemoryManager): + raise TypeError("The allocator must be an object of type that fulfils the BaseCUDAMemoryManager protocol.") # Generate the options dataclasses from ContractionOptimizerConfigAttributes. @@ -77,20 +89,22 @@ class OptimizerOptions(object): Attributes: samples: Number of samples for hyperoptimization. See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES`. + threads: Number of threads for the hyperoptimizer. See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_THREADS`. path: Options for the path finder (:class:`~cuquantum.PathFinderOptions` object or dict containing the ``(parameter, value)`` items for ``PathFinderOptions``). Alternatively, the path can be provided as a sequence of pairs in the :func:`numpy.einsum_path` format. - slicing: Options for the slicer (:class:`~cuquantum.SlicerOptions` object or dict containing the ``(parameter, value)`` items for - ``SlicerOptions``). Alternatively, a sequence of sliced modes or sequence of ``(sliced mode, sliced extent)`` pairs + slicing: Options for the slicer (:class:`~cuquantum.SlicerOptions` object or dict containing the ``(parameter, value)`` items for + ``SlicerOptions``). Alternatively, a sequence of sliced modes or sequence of ``(sliced mode, sliced extent)`` pairs can be directly provided. 
-        reconfiguration: Options for the reconfiguration algorithm as a :class:`~cuquantum.ReconfigOptions` object or dict containing the
+        reconfiguration: Options for the reconfiguration algorithm as a :class:`~cuquantum.ReconfigOptions` object or dict containing the
            ``(parameter, value)`` items for ``ReconfigOptions``.
         seed: Optional seed for the random number generator. See `CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED`.
     """
     samples : Optional[int] = None
-    path : Optional[Union[Type[PathFinderOptions], PathType]] = None
-    slicing : Optional[Union[Type[SlicerOptions], ModeSequenceType, ModeExtentSequenceType]] = None
-    reconfiguration : Optional[Type[ReconfigOptions]] = None
+    threads : Optional[int] = None
+    path : Optional[Union[PathFinderOptions, PathType]] = None
+    slicing : Optional[Union[SlicerOptions, ModeSequenceType, ModeExtentSequenceType]] = None
+    reconfiguration : Optional[ReconfigOptions] = None
     seed : Optional[int] = None

     def _check_option(self, option, option_class, checker=None):
@@ -129,7 +143,7 @@ def _check_specified_slices(self):
             raise TypeError("Slicing must be specified as a sequence of modes or as a sequence of (mode, extent) pairs.")

     def _check_int(self, attribute, name):
-        message = f"Invalid value ({attribute}) for '{name}'. Expect positive integer or None."
+        message = f"Invalid value ({attribute}) for '{name}'. Expect a non-negative integer or None."
         if not isinstance(attribute, (type(None), int)):
             raise ValueError(message)
         if isinstance(attribute, int) and attribute < 0:
@@ -154,8 +168,23 @@ class OptimizerInfo(object):
         slices: A sequence of ``(sliced mode, sliced extent)`` pairs.
     """
     largest_intermediate : float
-    opt_cost : float
+    opt_cost : float
     path : PathType
     slices : ModeExtentSequenceType
-
+    def __str__(self):
+        path = [str(p) for p in self.path]
+        slices = [str(s) for s in self.slices]
+        s = f"""Optimizer Information:
+    Largest intermediate = {formatters.MemoryStr(self.largest_intermediate, base_unit='Elements')}
+    Optimized cost = {self.opt_cost:.3e} FLOPS
+    Path = {formatters.array2string(path)}"""
+        if len(slices):
+            s += f"""
+    Number of slices = {len(slices)}
+    Slices = {formatters.array2string(slices)}"""
+        else:
+            s += """
+    Slicing not needed."""
+
+        return s
diff --git a/python/cuquantum/cutensornet/cutensornet.pxd b/python/cuquantum/cutensornet/cutensornet.pxd
index a847c88..e5b3763 100644
--- a/python/cuquantum/cutensornet/cutensornet.pxd
+++ b/python/cuquantum/cutensornet/cutensornet.pxd
@@ -6,6 +6,20 @@
 # Once we switch over the names would be prettier (in the Cython
 # layer).
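As an illustrative aside (not part of the patch), the new ``threads`` field and the ``OptimizerInfo`` pretty-printer added above can be exercised through the public ``Network`` API that the package re-exports; the shapes and option values below are made-up placeholders::

    import numpy as np
    from cuquantum import Network, OptimizerOptions

    a, b = np.random.rand(8, 8), np.random.rand(8, 8)

    # 'threads' maps to CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_THREADS.
    opt = OptimizerOptions(samples=32, threads=4, seed=0)

    with Network('ij,jk', a, b) as n:       # the context manager frees resources
        path, info = n.contract_path(optimize=opt)
        print(info)                         # rendered by OptimizerInfo.__str__
        result = n.contract()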
+from libc.stdint cimport int32_t + +cdef extern from * nogil: + # from CUDA + ctypedef int Stream 'cudaStream_t' + ctypedef enum DataType 'cudaDataType_t': + pass + + +# Cython limitation: need standalone typedef if we wanna use it for casting +ctypedef int (*DeviceAllocType)(void*, void**, size_t, Stream) +ctypedef int (*DeviceFreeType)(void*, void*, size_t, Stream) + + cdef extern from '' nogil: # cuTensorNet types ctypedef void* _Handle 'cutensornetHandle_t' @@ -15,6 +29,7 @@ cdef extern from '' nogil: ctypedef void* _ContractionOptimizerConfig 'cutensornetContractionOptimizerConfig_t' ctypedef void* _ContractionOptimizerInfo 'cutensornetContractionOptimizerInfo_t' ctypedef void* _ContractionAutotunePreference 'cutensornetContractionAutotunePreference_t' + ctypedef void* _WorkspaceDescriptor 'cutensornetWorkspaceDescriptor_t' ctypedef enum _ComputeType 'cutensornetComputeType_t': pass @@ -25,15 +40,27 @@ cdef extern from '' nogil: ctypedef struct _ContractionPath 'cutensornetContractionPath_t': int numContractions _NodePair *data + ctypedef struct _DeviceMemHandler 'cutensornetDeviceMemHandler_t': + void* ctx + DeviceAllocType device_alloc + DeviceFreeType device_free + # Cython limitation: cannot use C defines in declaring a static array, + # so we just have to hard-code CUTENSORNET_ALLOCATOR_NAME_LEN here... + char name[64] + ctypedef void(*LoggerCallbackData 'cutensornetLoggerCallbackData_t')( + int32_t logLevel, + const char* functionName, + const char* message, + void* userData) # cuTensorNet enums ctypedef enum _GraphAlgo 'cutensornetGraphAlgo_t': - CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_RB - CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_KWAY + CUTENSORNET_GRAPH_ALGO_RB + CUTENSORNET_GRAPH_ALGO_KWAY ctypedef enum _MemoryModel 'cutensornetMemoryModel_t': - CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_HEURISTIC - CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_CUTENSOR + CUTENSORNET_MEMORY_MODEL_HEURISTIC + CUTENSORNET_MEMORY_MODEL_CUTENSOR ctypedef enum _ContractionOptimizerConfigAttribute 'cutensornetContractionOptimizerConfigAttributes_t': CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_NUM_PARTITIONS @@ -52,6 +79,7 @@ cdef extern from '' nogil: CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_THREADS ctypedef enum _ContractionOptimizerInfoAttribute 'cutensornetContractionOptimizerInfoAttributes_t': CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES @@ -67,8 +95,17 @@ cdef extern from '' nogil: ctypedef enum _ContractionAutotunePreferenceAttribute 'cutensornetContractionAutotunePreferenceAttributes_t': CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS + ctypedef enum _WorksizePref 'cutensornetWorksizePref_t': + CUTENSORNET_WORKSIZE_PREF_MIN + CUTENSORNET_WORKSIZE_PREF_RECOMMENDED + CUTENSORNET_WORKSIZE_PREF_MAX + + ctypedef enum _Memspace 'cutensornetMemspace_t': + CUTENSORNET_MEMSPACE_DEVICE + # cuTensorNet consts int CUTENSORNET_MAJOR int CUTENSORNET_MINOR int CUTENSORNET_PATCH int CUTENSORNET_VERSION + int CUTENSORNET_ALLOCATOR_NAME_LEN diff --git a/python/cuquantum/cutensornet/cutensornet.pyx b/python/cuquantum/cutensornet/cutensornet.pyx index e94868e..e43d098 100644 --- a/python/cuquantum/cutensornet/cutensornet.pyx +++ b/python/cuquantum/cutensornet/cutensornet.pyx @@ -7,6 +7,9 @@ from libc.stdint cimport intptr_t, int32_t, uint32_t, int64_t, 
uint64_t, uintptr from libcpp.vector cimport vector from cuquantum.utils cimport is_nested_sequence +from cuquantum.utils cimport cuqnt_alloc_wrapper +from cuquantum.utils cimport cuqnt_free_wrapper +from cuquantum.utils cimport logger_callback_with_data from enum import IntEnum import warnings @@ -15,11 +18,6 @@ import numpy as _numpy cdef extern from * nogil: - # from CUDA - ctypedef int Stream 'cudaStream_t' - ctypedef enum DataType 'cudaDataType_t': - pass - # cuTensorNet functions # library int cutensornetCreate(_Handle*) @@ -35,6 +33,26 @@ cdef extern from * nogil: int32_t, const int64_t[], const int64_t[], const int32_t[], uint32_t, DataType, _ComputeType, _NetworkDescriptor*) int cutensornetDestroyNetworkDescriptor(_NetworkDescriptor) + int cutensornetGetOutputTensorDetails( + const _Handle, const _NetworkDescriptor, + int32_t*, size_t*, int32_t*, int64_t*, int64_t*) + + # workspace descriptor + int cutensornetCreateWorkspaceDescriptor( + const _Handle, _WorkspaceDescriptor*) + int cutensornetWorkspaceComputeSizes( + const _Handle, const _NetworkDescriptor, + const _ContractionOptimizerInfo, _WorkspaceDescriptor) + int cutensornetWorkspaceGetSize( + const _Handle, const _WorkspaceDescriptor, + _WorksizePref, _Memspace, uint64_t*) + int cutensornetWorkspaceSet( + const _Handle, _WorkspaceDescriptor, _Memspace, + void* const, uint64_t) + int cutensornetWorkspaceGet( + const _Handle, const _WorkspaceDescriptor, _Memspace, + void**, uint64_t*) + int cutensornetDestroyWorkspaceDescriptor(_WorkspaceDescriptor) # optimizer info int cutensornetCreateContractionOptimizerInfo( @@ -61,23 +79,28 @@ cdef extern from * nogil: const _Handle, _ContractionOptimizerConfig, _ContractionOptimizerConfigAttribute, const void*, size_t) - # contraction - int cutensornetContractionGetWorkspaceSize( - const _Handle, const _NetworkDescriptor, - const _ContractionOptimizerInfo, - uint64_t* workspaceSize) + # pathfinder int cutensornetContractionOptimize( const _Handle, const _NetworkDescriptor, const _ContractionOptimizerConfig, uint64_t, _ContractionOptimizerInfo) + + # contraction plan int cutensornetCreateContractionPlan( const _Handle, const _NetworkDescriptor, const _ContractionOptimizerInfo, - const uint64_t, _ContractionPlan) + const _WorkspaceDescriptor, _ContractionPlan) int cutensornetDestroyContractionPlan(_ContractionPlan) int cutensornetContractionAutotune( const _Handle, _ContractionPlan, const void* const[], - void*, void*, uint64_t, _ContractionAutotunePreference, Stream) + void*, const _WorkspaceDescriptor, + _ContractionAutotunePreference, Stream) + int cutensornetContraction( + const _Handle, const _ContractionPlan, const void* const[], + void*, const _WorkspaceDescriptor, + int64_t, Stream) + + # autotune pref int cutensornetCreateContractionAutotunePreference( const _Handle, _ContractionAutotunePreference*) int cutensornetDestroyContractionAutotunePreference( @@ -88,9 +111,19 @@ cdef extern from * nogil: int cutensornetContractionAutotunePreferenceSetAttribute( const _Handle, _ContractionAutotunePreference, _ContractionAutotunePreferenceAttribute, const void*, size_t) - int cutensornetContraction( - const _Handle, const _ContractionPlan, const void* const[], - void*, void*, uint64_t, int64_t, Stream) + + # memory handlers + int cutensornetGetDeviceMemHandler(const _Handle, _DeviceMemHandler*) + int cutensornetSetDeviceMemHandler(_Handle, const _DeviceMemHandler*) + + # logger + #int cutensornetLoggerSetCallback(LoggerCallback) + int cutensornetLoggerSetCallbackData(LoggerCallbackData, 
void*) + #int cutensornetLoggerSetFile(FILE*) + int cutensornetLoggerOpenFile(const char*) + int cutensornetLoggerSetLevel(int32_t) + int cutensornetLoggerSetMask(int32_t) + int cutensornetLoggerForceDisable() class cuTensorNetError(RuntimeError): @@ -112,7 +145,7 @@ cpdef intptr_t create() except*: """Create a cuTensorNet handle. Returns: - intptr_t: the opaque library handle (as Python `int`). + intptr_t: the opaque library handle (as Python :class:`int`). .. seealso:: `cutensornetCreate` """ @@ -129,6 +162,12 @@ cpdef destroy(intptr_t handle): .. seealso:: `cutensornetDestroy` """ + # reduce the ref counts of user-provided Python objects: + # if Python callables are attached to the handle as the handler, + # we need to decrease the ref count to avoid leaking + if handle in owner_pyobj: + del owner_pyobj[handle] + with nogil: status = cutensornetDestroy(<_Handle>handle) check_status(status) @@ -147,7 +186,7 @@ cpdef size_t get_version() except*: cpdef size_t get_cudart_version() except*: - """Query the version of the CUDA runtime. + """Query the version of the CUDA runtime used to build cuTensorNet. Returns: size_t: the CUDA runtime version (ex: 11040 for CUDA 11.4). @@ -173,51 +212,54 @@ cpdef intptr_t create_network_descriptor( n_modes_in: A host array of the number of modes for each input tensor. It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` extents_in: A host array of extents for each input tensor. It can be - - an `int` as the pointer address to the nested sequence - - a Python sequence of `int`, each of which is a pointer address + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address to the corresponding tensor's extents - - a nested Python sequence of `int` + - a nested Python sequence of :class:`int` strides_in: A host array of strides for each input tensor. It can be - - an `int` as the pointer address to the nested sequence - - a Python sequence of `int`, each of which is a pointer address + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address to the corresponding tensor's strides - - a nested Python sequence of `int` + - a nested Python sequence of :class:`int` modes_in: A host array of modes for each input tensor. It can be - - an `int` as the pointer address to the nested sequence - - a Python sequence of `int`, each of which is a pointer address + - an :class:`int` as the pointer address to the nested sequence + - a Python sequence of :class:`int`, each of which is a pointer address to the corresponding tensor's modes - - a nested Python sequence of `int` + - a nested Python sequence of :class:`int` alignments_in: A host array of alignments for each input tensor. It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` - n_modes_out (int32_t): The number of modes of the output tensor. + n_modes_out (int32_t): The number of modes of the output tensor. If + this is set to -1 and ``modes_out`` is set to 0 (not provided), + the output modes will be inferred. If this is set to 0, the + network is force reduced. extents_out: The extents of the output tensor (on host). 
It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` strides_out: The strides of the output tensor (on host). It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` modes_out: The modes of the output tensor (on host). It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` alignment_out (uint32_t): The alignment for the output tensor. data_type (cuquantum.cudaDataType): The data type of the input and @@ -226,7 +268,7 @@ cpdef intptr_t create_network_descriptor( contraction. Returns: - intptr_t: An opaque descriptor handle (as Python `int`). + intptr_t: An opaque descriptor handle (as Python :class:`int`). .. note:: If ``strides_in`` (``strides_out``) is set to 0 (`NULL`), it means @@ -380,16 +422,184 @@ cpdef destroy_network_descriptor(intptr_t tn_desc): check_status(status) +cpdef tuple get_output_tensor_details(intptr_t handle, intptr_t tn_desc): + """Get the output tensor's metadata. + + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): The tensor network descriptor. + + Returns: + tuple: + The metadata of the output tensor: ``(num_modes, modes, extents, + strides)``. + + .. seealso:: `cutensornetGetOutputTensorDetails` + """ + cdef int32_t numModesOut = 0 + with nogil: + status = cutensornetGetOutputTensorDetails( + <_Handle>handle, <_NetworkDescriptor>tn_desc, + &numModesOut, NULL, NULL, NULL, NULL) + check_status(status) + modes = _numpy.empty(numModesOut, dtype=_numpy.int32) + extents = _numpy.empty(numModesOut, dtype=_numpy.int64) + strides = _numpy.empty(numModesOut, dtype=_numpy.int64) + cdef int32_t* mPtr = modes.ctypes.data + cdef int64_t* ePtr = extents.ctypes.data + cdef int64_t* sPtr = strides.ctypes.data + with nogil: + status = cutensornetGetOutputTensorDetails( + <_Handle>handle, <_NetworkDescriptor>tn_desc, + &numModesOut, NULL, mPtr, ePtr, sPtr) + check_status(status) + return (numModesOut, modes, extents, strides) + + +cpdef intptr_t create_workspace_descriptor(intptr_t handle) except*: + """Create a workspace descriptor. + + Args: + handle (intptr_t): The library handle. + + Returns: + intptr_t: An opaque workspace descriptor (as Python :class:`int`). + + .. seealso:: `cutensornetCreateWorkspaceDescriptor` + """ + cdef _WorkspaceDescriptor workspace + with nogil: + status = cutensornetCreateWorkspaceDescriptor( + <_Handle>handle, <_WorkspaceDescriptor*>&workspace) + check_status(status) + return workspace + + +cpdef destroy_workspace_descriptor(intptr_t workspace): + """Destroy a workspace descriptor. + + Args: + workspace (intptr_t): The workspace descriptor. + + .. seealso:: `cutensornetDestroyWorkspaceDescriptor` + """ + with nogil: + status = cutensornetDestroyWorkspaceDescriptor( + <_WorkspaceDescriptor>workspace) + check_status(status) + + +cpdef workspace_compute_sizes( + intptr_t handle, intptr_t tn_desc, intptr_t info, intptr_t workspace): + """Compute the required workspace sizes. + + Args: + handle (intptr_t): The library handle. + tn_desc (intptr_t): The tensor network descriptor. + info (intptr_t): The optimizer info handle. + workspace (intptr_t): The workspace descriptor. + + .. 
seealso:: `cutensornetWorkspaceComputeSizes`
+    """
+    with nogil:
+        status = cutensornetWorkspaceComputeSizes(
+            <_Handle>handle, <_NetworkDescriptor>tn_desc,
+            <_ContractionOptimizerInfo>info,
+            <_WorkspaceDescriptor>workspace)
+    check_status(status)
+
+
+cpdef uint64_t workspace_get_size(
+        intptr_t handle, intptr_t workspace, int pref, int mem_space) except*:
+    """Get the workspace size for the corresponding preference and memory
+    space. Must be called after :func:`workspace_compute_sizes`.
+
+    Args:
+        handle (intptr_t): The library handle.
+        workspace (intptr_t): The workspace descriptor.
+        pref (WorksizePref): The preference for the workspace size.
+        mem_space (Memspace): The memory space for the workspace being
+            queried.
+
+    Returns:
+        uint64_t: The computed workspace size.
+
+    .. seealso:: `cutensornetWorkspaceGetSize`.
+    """
+    cdef uint64_t workspaceSize
+    with nogil:
+        status = cutensornetWorkspaceGetSize(
+            <_Handle>handle, <_WorkspaceDescriptor>workspace,
+            <_WorksizePref>pref, <_Memspace>mem_space,
+            &workspaceSize)
+    check_status(status)
+    return workspaceSize
+
+
+cpdef workspace_set(
+        intptr_t handle, intptr_t workspace, int mem_space,
+        intptr_t workspace_ptr, uint64_t workspace_size):
+    """Set the workspace pointer and size for the corresponding memory space
+    in the workspace descriptor for later use.
+
+    Args:
+        handle (intptr_t): The library handle.
+        workspace (intptr_t): The workspace descriptor.
+        mem_space (Memspace): The memory space for the workspace being
+            set.
+        workspace_ptr (intptr_t): The pointer address to the workspace.
+        workspace_size (uint64_t): The size of the workspace.
+
+    .. seealso:: `cutensornetWorkspaceSet`
+    """
+    with nogil:
+        status = cutensornetWorkspaceSet(
+            <_Handle>handle, <_WorkspaceDescriptor>workspace,
+            <_Memspace>mem_space,
+            <void*>workspace_ptr, workspace_size)
+    check_status(status)
+
+
+cpdef tuple workspace_get(
+        intptr_t handle, intptr_t workspace, int mem_space):
+    """Get the workspace pointer and size for the corresponding memory space
+    that are set in a workspace descriptor.
+
+    Args:
+        handle (intptr_t): The library handle.
+        workspace (intptr_t): The workspace descriptor.
+        mem_space (Memspace): The memory space for the workspace being
+            queried.
+
+    Returns:
+        tuple:
+            A 2-tuple ``(workspace_ptr, workspace_size)`` for the pointer
+            address to the workspace and the size of it.
+
+    .. seealso:: `cutensornetWorkspaceGet`
+    """
+    cdef void* workspace_ptr
+    cdef uint64_t workspace_size
+
+    with nogil:
+        status = cutensornetWorkspaceGet(
+            <_Handle>handle, <_WorkspaceDescriptor>workspace,
+            <_Memspace>mem_space,
+            &workspace_ptr, &workspace_size)
+    check_status(status)
+    return (<intptr_t>workspace_ptr, workspace_size)
+
+
 cpdef intptr_t create_contraction_optimizer_info(
         intptr_t handle, intptr_t tn_desc) except*:
     """Create a contraction optimizer info object.

     Args:
         handle (intptr_t): The library handle.
-        tn_desc (intptr_t): the tensor network descriptor.
+        tn_desc (intptr_t): The tensor network descriptor.

     Returns:
-        intptr_t: An opaque optimizer info handle (as Python `int`).
+        intptr_t: An opaque optimizer info handle (as Python :class:`int`).

     .. seealso:: `cutensornetCreateContractionOptimizerInfo`
     """
@@ -482,7 +692,7 @@ cpdef contraction_optimizer_info_get_attribute(
         handle (intptr_t): The library handle.
         info (intptr_t): The optimizer info handle.
         attr (ContractionOptimizerInfoAttribute): The attribute to query.
- buf (intptr_t): The pointer address (as Python `int`) for storing + buf (intptr_t): The pointer address (as Python :class:`int`) for storing the returned attribute value. size (size_t): The size of ``buf`` (in bytes). @@ -511,7 +721,7 @@ cpdef contraction_optimizer_info_set_attribute( handle (intptr_t): The library handle. info (intptr_t): The optimizer info handle. attr (ContractionOptimizerInfoAttribute): The attribute to set. - buf (intptr_t): The pointer address (as Python `int`) to the attribute data. + buf (intptr_t): The pointer address (as Python :class:`int`) to the attribute data. size (size_t): The size of ``buf`` (in bytes). .. note:: To compute ``size``, use the itemsize of the corresponding data @@ -538,7 +748,7 @@ cpdef intptr_t create_contraction_optimizer_config( handle (intptr_t): The library handle. Returns: - intptr_t: An opaque optimizer config handle (as Python `int`). + intptr_t: An opaque optimizer config handle (as Python :class:`int`). .. seealso:: `cutensornetCreateContractionOptimizerConfig` """ @@ -583,6 +793,7 @@ cdef dict contract_opti_cfg_sizes = { CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES: _numpy.int32, CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR: _numpy.int32, CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED: _numpy.int32, + CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_THREADS: _numpy.int32, } cpdef contraction_optimizer_config_get_attribute_dtype(int attr): @@ -619,7 +830,7 @@ cpdef contraction_optimizer_config_get_attribute( handle (intptr_t): The library handle. config (intptr_t): The optimizer config handle. attr (ContractionOptimizerConfigAttribute): The attribute to set. - buf (intptr_t): The pointer address (as Python `int`) for storing + buf (intptr_t): The pointer address (as Python :class:`int`) for storing the returned attribute value. size (size_t): The size of ``buf`` (in bytes). @@ -645,7 +856,7 @@ cpdef contraction_optimizer_config_set_attribute( handle (intptr_t): The library handle. config (intptr_t): The optimizer config handle. attr (ContractionOptimizerConfigAttribute): The attribute to set. - buf (intptr_t): The pointer address (as Python `int`) to the attribute data. + buf (intptr_t): The pointer address (as Python :class:`int`) to the attribute data. size (size_t): The size of ``buf`` (in bytes). .. note:: To compute ``size``, use the itemsize of the corresponding data @@ -661,35 +872,6 @@ cpdef contraction_optimizer_config_set_attribute( check_status(status) -cpdef uint64_t contraction_get_workspace_size( - intptr_t handle, intptr_t tn_desc, intptr_t info) except*: - """Compute the required workspace size for contracting the input tensor - network. - - Args: - handle (intptr_t): The library handle. - tn_desc (intptr_t): the tensor network descriptor. - info (intptr_t): The optimizer info handle. - - Returns: - uint64_t: The workspace size (in bytes). - - .. note:: This function should be called either after a contraction path - is manually set, or after :func:`contraction_optimize` is called. - - .. 
seealso:: `cutensornetContractionGetWorkspaceSize` - """ - # TODO(leofang): note in the docstring that the API name deviates - # from its C counterpart in beta 2 - cdef uint64_t workspaceSize - with nogil: - status = cutensornetContractionGetWorkspaceSize( - <_Handle>handle, <_NetworkDescriptor>tn_desc, - <_ContractionOptimizerInfo>info, &workspaceSize) - check_status(status) - return workspaceSize - - cpdef contraction_optimize( intptr_t handle, intptr_t tn_desc, intptr_t config, uint64_t size_limit, intptr_t info): @@ -697,7 +879,7 @@ cpdef contraction_optimize( Args: handle (intptr_t): The library handle. - tn_desc (intptr_t): the tensor network descriptor. + tn_desc (intptr_t): The tensor network descriptor. config (intptr_t): The optimizer config handle. size_limit (uint64_t): Maximal device memory that is available to the user. @@ -720,7 +902,7 @@ cpdef contraction_optimize( cpdef intptr_t create_contraction_plan( intptr_t handle, intptr_t tn_desc, intptr_t info, - uint64_t workspace_size) except*: + intptr_t workspace) except*: """Create a contraction plan for the given tensor network and the associated path. @@ -729,21 +911,23 @@ cpdef intptr_t create_contraction_plan( Args: handle (intptr_t): The library handle. - tn_desc (intptr_t): the tensor network descriptor. + tn_desc (intptr_t): The tensor network descriptor. info (intptr_t): The optimizer info handle. - workspace_size (uint64_t): The workspace size (in bytes). + workspace (intptr_t): The workspace descriptor. Returns: - intptr_t: An opaque contraction plan handle (as Python `int`). + intptr_t: An opaque contraction plan handle (as Python :class:`int`). .. seealso:: `cutensornetCreateContractionPlan` """ cdef _ContractionPlan plan + # we always release gil here, because we don't need to allocate + # memory at this point yet with nogil: status = cutensornetCreateContractionPlan( <_Handle>handle, <_NetworkDescriptor>tn_desc, <_ContractionOptimizerInfo>info, - workspace_size, &plan) + <_WorkspaceDescriptor>workspace, &plan) check_status(status) return plan @@ -764,7 +948,7 @@ cpdef destroy_contraction_plan(intptr_t plan): cpdef contraction_autotune( intptr_t handle, intptr_t plan, raw_data_in, intptr_t raw_data_out, intptr_t workspace, - uint64_t workspace_size, intptr_t pref, intptr_t stream): + intptr_t pref, intptr_t stream): """Autotune the contraction plan to find the best kernels for each pairwise tensor contraction. @@ -774,20 +958,18 @@ cpdef contraction_autotune( Args: handle (intptr_t): The library handle. plan (intptr_t): The contraction plan handle. - raw_data_in: A host array of pointer addresses (as Python `int`) for + raw_data_in: A host array of pointer addresses (as Python :class:`int`) for each input tensor (on device). It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` - raw_data_out (intptr_t): The pointer address (as Python `int`) to the + raw_data_out (intptr_t): The pointer address (as Python :class:`int`) to the output tensor (on device). - workspace (intptr_t): The pointer address (as Python `int`) to the - workspace (on device). - workspace_size (uint64_t): The workspace size (in bytes). + workspace (intptr_t): The workspace descriptor. pref (intptr_t): The autotune preference handle. stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python - `int`). + :class:`int`). .. 
seealso:: `cutensornetContractionAutotune` """ @@ -803,8 +985,9 @@ cpdef contraction_autotune( with nogil: status = cutensornetContractionAutotune( <_Handle>handle, <_ContractionPlan>plan, - rawDataInPtr, raw_data_out, workspace, - workspace_size, <_ContractionAutotunePreference>pref, + rawDataInPtr, raw_data_out, + <_WorkspaceDescriptor>workspace, + <_ContractionAutotunePreference>pref, stream) check_status(status) @@ -876,7 +1059,7 @@ cpdef contraction_autotune_preference_get_attribute( handle (intptr_t): The library handle. autotune_preference (intptr_t): The autotune preference handle. attr (ContractionAutotunePreferenceAttribute): The attribute to query. - buf (intptr_t): The pointer address (as Python `int`) for storing + buf (intptr_t): The pointer address (as Python :class:`int`) for storing the returned attribute value. size (size_t): The size of ``buf`` (in bytes). @@ -899,7 +1082,7 @@ cpdef contraction_autotune_preference_set_attribute( handle (intptr_t): The library handle. autotune_preference (intptr_t): The autotune preference handle. attr (ContractionAutotunePreferenceAttribute): The attribute to query. - buf (intptr_t): The pointer address (as Python `int`) to the attribute data. + buf (intptr_t): The pointer address (as Python :class:`int`) to the attribute data. size (size_t): The size of ``buf`` (in bytes). .. note:: To compute ``size``, use the itemsize of the corresponding data @@ -918,7 +1101,7 @@ cpdef contraction_autotune_preference_set_attribute( cpdef contraction( intptr_t handle, intptr_t plan, raw_data_in, intptr_t raw_data_out, intptr_t workspace, - uint64_t workspace_size, int64_t slice_id, intptr_t stream): + int64_t slice_id, intptr_t stream): """Perform the contraction of the input tensors. The input tensors should form a tensor network that is prescribed by the @@ -927,20 +1110,18 @@ cpdef contraction( Args: handle (intptr_t): The library handle. plan (intptr_t): The contraction plan handle. - raw_data_in: A host array of pointer addresses (as Python `int`) for + raw_data_in: A host array of pointer addresses (as Python :class:`int`) for each input tensor (on device). It can be - - an `int` as the pointer address to the array - - a Python sequence of `int` + - an :class:`int` as the pointer address to the array + - a Python sequence of :class:`int` - raw_data_out (intptr_t): The pointer address (as Python `int`) to the + raw_data_out (intptr_t): The pointer address (as Python :class:`int`) to the output tensor (on device). - workspace (intptr_t): The pointer address (as Python `int`) to the - workspace (on device). - workspace_size (uint64_t): The workspace size (in bytes). + workspace (intptr_t): The workspace descriptor. slice_id (int64_t): The slice ID. stream (intptr_t): The CUDA stream handle (``cudaStream_t`` as Python - `int`). + :class:`int`). .. note:: The number of slices can be queried by :func:`contraction_optimizer_info_get_attribute`. @@ -958,8 +1139,197 @@ cpdef contraction( with nogil: status = cutensornetContraction( <_Handle>handle, <_ContractionPlan>plan, - rawDataInPtr, raw_data_out, workspace, - workspace_size, slice_id, stream) + rawDataInPtr, raw_data_out, + <_WorkspaceDescriptor>workspace, + slice_id, stream) + check_status(status) + + +cpdef set_device_mem_handler(intptr_t handle, handler): + """ Set the device memory handler for cuTensorNet. + + The ``handler`` object can be passed in multiple ways: + + - If ``handler`` is an :class:`int`, it refers to the address of a fully + initialized `cutensornetDeviceMemHandler_t` struct. 
+      - If ``handler`` is a Python sequence:
+
+        - If ``handler`` is a sequence of length 4, it is interpreted as ``(ctx, device_alloc,
+          device_free, name)``, where the first three elements are the pointer
+          addresses (:class:`int`) of the corresponding members. ``name`` is a
+          :class:`str` as the name of the handler.
+        - If ``handler`` is a sequence of length 3, it is interpreted as ``(malloc, free,
+          name)``, where the first two objects are Python *callables* with the
+          following calling convention:
+
+            - ``ptr = malloc(size, stream)``
+            - ``free(ptr, size, stream)``
+
+          with all arguments and return value (``ptr``) being Python :class:`int`.
+          ``name`` is the same as above.
+
+    .. note:: Only when ``handler`` is a length-3 sequence will the GIL be
+        held whenever a routine requires memory allocation and deallocation,
+        so for all other cases be sure your ``handler`` does not manipulate
+        any Python objects.
+
+    Args:
+        handle (intptr_t): The library handle.
+        handler: The memory handler object, see above.
+
+    .. seealso:: `cutensornetSetDeviceMemHandler`
+    """
+    cdef bytes name
+    cdef _DeviceMemHandler our_handler
+    cdef _DeviceMemHandler* handlerPtr = &our_handler
+
+    if isinstance(handler, int):
+        handlerPtr = <_DeviceMemHandler*><intptr_t>handler
+    elif cpython.PySequence_Check(handler):
+        name = handler[-1].encode('ascii')
+        if len(name) >= CUTENSORNET_ALLOCATOR_NAME_LEN:
+            raise ValueError("the handler name is too long")
+        our_handler.name[:len(name)] = name
+        our_handler.name[len(name)] = 0
+
+        if len(handler) == 4:
+            # handler = (ctx_ptr, malloc_ptr, free_ptr, name)
+            assert (isinstance(handler[1], int) and isinstance(handler[2], int))
+            our_handler.ctx = <void*><intptr_t>(handler[0])
+            our_handler.device_alloc = <DeviceAllocType><intptr_t>(handler[1])
+            our_handler.device_free = <DeviceFreeType><intptr_t>(handler[2])
+        elif len(handler) == 3:
+            # handler = (malloc, free, name)
+            assert (callable(handler[0]) and callable(handler[1]))
+            ctx = (handler[0], handler[1])
+            owner_pyobj[handle] = ctx  # keep it alive
+            our_handler.ctx = <void*>ctx
+            our_handler.device_alloc = cuqnt_alloc_wrapper
+            our_handler.device_free = cuqnt_free_wrapper
+        else:
+            raise ValueError("handler must be a sequence of length 3 or 4, "
+                             "see the documentation for detail")
+    else:
+        raise NotImplementedError("handler format not recognized")
+
+    with nogil:
+        status = cutensornetSetDeviceMemHandler(<_Handle>handle, handlerPtr)
+    check_status(status)
+
+
+cpdef tuple get_device_mem_handler(intptr_t handle):
+    """ Get the device memory handler for cuTensorNet.
+
+    Args:
+        handle (intptr_t): The library handle.
+
+    Returns:
+        tuple:
+            The ``handler`` object, which has two forms:
+
+            - If ``handler`` is a 3-tuple, it is interpreted as ``(malloc, free,
+              name)``, where the first two objects are Python *callables*, and ``name``
+              is the name of the handler. This 3-tuple handler would be compared equal
+              (element-wise) to the one previously passed to :func:`set_device_mem_handler`.
+            - If ``handler`` is a 4-tuple, it is interpreted as ``(ctx, device_alloc,
+              device_free, name)``, where the first three elements are the pointer
+              addresses (:class:`int`) of the corresponding members. ``name`` is the
+              same as above.
+
+    ..
seealso:: `cutensornetGetDeviceMemHandler` + """ + cdef _DeviceMemHandler handler + with nogil: + status = cutensornetGetDeviceMemHandler(<_Handle>handle, &handler) + check_status(status) + + cdef tuple ctx + cdef bytes name = handler.name + if (handler.device_alloc == cuqnt_alloc_wrapper and + handler.device_free == cuqnt_free_wrapper): + ctx = (handler.ctx) + return (ctx[0], ctx[1], name.decode('ascii')) + else: + # TODO: consider other possibilities? + return (handler.ctx, + handler.device_alloc, + handler.device_free, + name.decode('ascii')) + + +# can't be cpdef because args & kwargs can't be handled in a C signature +def logger_set_callback_data(callback, *args, **kwargs): + """Set the logger callback along with arguments. + + Args: + callback: A Python callable with the following signature (no return): + + - ``callback(log_level, func_name, message, *args, **kwargs)`` + + where ``log_level`` (:py:`int`), ``func_name`` (`str`), and + ``message`` (`str`) are provided by the logger API. + + .. seealso:: `cutensornetLoggerSetCallbackData` + """ + func_arg = (callback, args, kwargs) + # if only set once, the callback lifetime should be as long as this module, + # because we don't know when the logger is done using it + global logger_callback_holder + logger_callback_holder = func_arg + with nogil: + status = cutensornetLoggerSetCallbackData( + logger_callback_with_data, (func_arg)) + check_status(status) + + +cpdef logger_open_file(filename): + """Set the filename for the logger to write to. + + Args: + filename (str): The log filename. + + .. seealso:: `cutensornetLoggerOpenFile` + """ + cdef bytes name = filename.encode() + cdef char* name_ptr = name + with nogil: + status = cutensornetLoggerOpenFile(name_ptr) + check_status(status) + + +cpdef logger_set_level(int level): + """Set the logging level. + + Args: + level (int): The logging level. + + .. seealso:: `cutensornetLoggerSetLevel` + """ + with nogil: + status = cutensornetLoggerSetLevel(level) + check_status(status) + + +cpdef logger_set_mask(int mask): + """Set the logging mask. + + Args: + level (int): The logging mask. + + .. seealso:: `cutensornetLoggerSetMask` + """ + with nogil: + status = cutensornetLoggerSetMask(mask) + check_status(status) + + +cpdef logger_force_disable(): + """Disable the logger. + + .. seealso:: `cutensornetLoggerForceDisable` + """ + with nogil: + status = cutensornetLoggerForceDisable() check_status(status) @@ -987,7 +1357,7 @@ cdef class ContractionPath: Args: num_contractions (int): The number of contractions in the provided path. - data (uintptr_t): The pointer address (as Python `int`) to the provided path. + data (uintptr_t): The pointer address (as Python :class:`int`) to the provided path. .. 
note:: Users are responsible for managing the lifetime of the underlying path data
@@ -1029,15 +1399,15 @@ cdef class ContractionPath:
         return sizeof(_ContractionPath)

-class GraphAlgorithm(IntEnum):
+class GraphAlgo(IntEnum):
     """See `cutensornetGraphAlgo_t`."""
-    RB = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_RB
-    KWAY = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_ALGORITHM_KWAY
+    RB = CUTENSORNET_GRAPH_ALGO_RB
+    KWAY = CUTENSORNET_GRAPH_ALGO_KWAY

 class MemoryModel(IntEnum):
     """See `cutensornetMemoryModel_t`."""
-    SLICER_HEURISTIC = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_HEURISTIC
-    SLICER_CUTENSOR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SLICER_MEMORY_MODEL_CUTENSOR
+    HEURISTIC = CUTENSORNET_MEMORY_MODEL_HEURISTIC
+    CUTENSOR = CUTENSORNET_MEMORY_MODEL_CUTENSOR

 class ContractionOptimizerConfigAttribute(IntEnum):
     """See `cutensornetContractionOptimizerConfigAttributes_t`."""
@@ -1057,6 +1427,7 @@ class ContractionOptimizerConfigAttribute(IntEnum):
     HYPER_NUM_SAMPLES = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES
     SIMPLIFICATION_DISABLE_DR = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SIMPLIFICATION_DISABLE_DR
     SEED = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED
+    HYPER_NUM_THREADS = CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_THREADS

 class ContractionOptimizerInfoAttribute(IntEnum):
     """See `cutensornetContractionOptimizerInfoAttributes_t`."""
@@ -1074,6 +1445,16 @@ class ContractionAutotunePreferenceAttribute(IntEnum):
     """See `cutensornetContractionAutotunePreferenceAttributes_t`."""
     MAX_ITERATIONS = CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS

+class WorksizePref(IntEnum):
+    """See `cutensornetWorksizePref_t`."""
+    MIN = CUTENSORNET_WORKSIZE_PREF_MIN
+    RECOMMENDED = CUTENSORNET_WORKSIZE_PREF_RECOMMENDED
+    MAX = CUTENSORNET_WORKSIZE_PREF_MAX
+
+class Memspace(IntEnum):
+    """See `cutensornetMemspace_t`."""
+    DEVICE = CUTENSORNET_MEMSPACE_DEVICE
+
 del IntEnum
@@ -1082,3 +1463,7 @@
 MAJOR_VER = CUTENSORNET_MAJOR
 MINOR_VER = CUTENSORNET_MINOR
 PATCH_VER = CUTENSORNET_PATCH
 VERSION = CUTENSORNET_VERSION
+
+
+# who owns a reference to user-provided Python objects (k: owner, v: object)
+cdef dict owner_pyobj = {}
diff --git a/python/cuquantum/cutensornet/memory.py b/python/cuquantum/cutensornet/memory.py
new file mode 100644
index 0000000..097a313
--- /dev/null
+++ b/python/cuquantum/cutensornet/memory.py
@@ -0,0 +1,164 @@
+""" Interface for pluggable memory handlers.
+"""
+
+__all__ = ['BaseCUDAMemoryManager', 'MemoryPointer']
+
+from abc import abstractmethod
+from typing_extensions import Protocol, runtime_checkable
+import weakref
+
+import cupy as cp
+
+class MemoryPointer:
+    """
+    An RAII class for a device memory buffer.
+
+    Args:
+        device_ptr: The address of the device memory buffer.
+        size: The size of the memory buffer in bytes.
+        finalizer: A nullary callable that will be called when the buffer is to be freed.
+
+    .. seealso:: :class:`numba.cuda.MemoryPointer`
+    """
+
+    def __init__(self, device_ptr, size, finalizer):
+        self.device_ptr = device_ptr
+        self.size = size
+        if finalizer is not None:
+            self._finalizer = weakref.finalize(self, finalizer)
+        else:
+            self._finalizer = None
+
+    def free(self):
+        """
+        "Frees" the memory buffer by calling the finalizer.
+        """
+        if self._finalizer is None:
+            return
+
+        if not self._finalizer.alive:
+            raise RuntimeError("The buffer has already been freed.")
+        self._finalizer()
+
+
+@runtime_checkable
+class BaseCUDAMemoryManager(Protocol):
+    """
+    Protocol for memory manager plugins.
+
+    ..
seealso:: :class:`numba.cuda.BaseCUDAMemoryManager` + """ + + @abstractmethod + def memalloc(self, size): + """ + Allocate device memory. + + Args: + size: The size of the memory buffer in bytes. + + Returns: + An object that owns the allocated memory and is responsible for releasing it (to the OS or a pool). The object must + have an attribute named ``device_ptr``, ``device_pointer``, or ``ptr`` specifying the pointer to the allocated memory + buffer. See :class:`MemoryPointer` for an example interface. + + Note: + Objects of type :class:`numba.cuda.MemoryPointer` as well as :class:`cupy.cuda.MemoryPointer` meet the requirements + listed above for the device memory pointer object. + """ + raise NotImplementedError + + +class _RawCUDAMemoryManager(BaseCUDAMemoryManager): + """ + Raw device memory allocator. + + Args: + device_id: The ID (int) of the device on which memory is to be allocated. + logger (logging.Logger): Python Logger object. + """ + + def __init__(self, device_id, logger): + """ + __init__(device_id) + """ + self.device = cp.cuda.Device(device_id) + self.logger = logger + + def memalloc(self, size): + with self.device: + device_ptr = cp.cuda.runtime.malloc(size) + + self.logger.debug(f"_RawCUDAMemoryManager (allocate memory): size = {size}, ptr = {device_ptr}, " + f"device = {self.device}, stream={cp.cuda.get_current_stream()}") + + def create_finalizer(): + def finalizer(): + with self.device: + cp.cuda.runtime.free(device_ptr) + self.logger.debug(f"_RawCUDAMemoryManager (release memory): ptr = {device_ptr}") + return finalizer + + return MemoryPointer(device_ptr, size, finalizer=create_finalizer()) + + +class _CupyCUDAMemoryManager(BaseCUDAMemoryManager): + """ + CuPy device memory allocator. + + Args: + device_id: The ID (int) of the device on which memory is to be allocated. + logger (logging.Logger): Python Logger object. + """ + + def __init__(self, device_id, logger): + """ + __init__(device_id) + """ + self.device = cp.cuda.Device(device_id) + self.logger = logger + + def memalloc(self, size): + with self.device: + cp_mem_ptr = cp.cuda.alloc(size) + device_ptr = cp_mem_ptr.ptr + + self.logger.debug(f"_CupyCUDAMemoryManager (allocate memory): size = {size}, ptr = {device_ptr}, " + f"device = {self.device}, stream={cp.cuda.get_current_stream()}") + + return cp_mem_ptr + + +class _TorchCUDAMemoryManager(BaseCUDAMemoryManager): + """ + Torch caching memory allocator. + + Args: + device_id: The ID (int) of the device on which memory is to be allocated. + logger (logging.Logger): Python Logger object. 
+ """ + + def __init__(self, device_id, logger): + """ + __init__(device_id) + """ + self.device_id = device_id + self.logger = logger + + def memalloc(self, size): + from torch.cuda import caching_allocator_alloc, caching_allocator_delete, current_stream + + device_ptr = caching_allocator_alloc(size, device=self.device_id) + + self.logger.debug(f"_TorchCUDAMemoryManager (allocate memory): size = {size}, ptr = {device_ptr}, " + f"device_id = {self.device_id}, stream={current_stream()}") + + def create_finalizer(): + def finalizer(): + caching_allocator_delete(device_ptr) + self.logger.debug(f"_TorchCUDAMemoryManager (release memory): ptr = {device_ptr}") + return finalizer + + return MemoryPointer(device_ptr, size, finalizer=create_finalizer()) + + +_MEMORY_MANAGER = {'_raw' : _RawCUDAMemoryManager, 'cupy' : _CupyCUDAMemoryManager, 'torch' : _TorchCUDAMemoryManager} + diff --git a/python/cuquantum/cutensornet/tensor_network.py b/python/cuquantum/cutensornet/tensor_network.py index 0033a49..f526f82 100644 --- a/python/cuquantum/cutensornet/tensor_network.py +++ b/python/cuquantum/cutensornet/tensor_network.py @@ -16,7 +16,9 @@ from cuquantum import cutensornet as cutn from . import configuration +from . import memory from ._internal import einsum_parser +from ._internal import formatters from ._internal import optimizer_ifc from ._internal import tensor_wrapper from ._internal import typemaps @@ -32,44 +34,43 @@ class Network: """ Network(subscripts, *operands, options=None) - Create a tensor network object specified as an einsum expression. + Create a tensor network object specified as an Einstein summation expression. - The Einstein summation convention provides an elegant way of representing many tensor network operations. - This object allows the user to invest - considerable effort into computing the best contraction path as well as autotuning the contraction upfront - for repeated contractions over the same network *topology* (different input tensors, or "operands", with the same Einstein - summation expression). Also see :meth:`~Network.contract_path` and :meth:`autotune`. + The Einstein summation convention provides an elegant way of representing many tensor network operations. This object + allows the user to invest considerable effort into computing the best contraction path as well as autotuning the contraction + upfront for repeated contractions over the same network *topology* (different input tensors, or "operands", with the same + Einstein summation expression). Also see :meth:`~Network.contract_path` and :meth:`autotune`. For the Einstein summation expression, both the explicit and implicit forms are supported. - In the implicit form, the output indices are inferred from the summation expression and *reordered lexicographically*. - An example is the expression ``'ij,jh'``, for which the output indices are ``'hi'``. (This corresponds to a matrix + In the implicit form, the output mode labels are inferred from the summation expression and *reordered lexicographically*. + An example is the expression ``'ij,jh'``, for which the output mode labels are ``'hi'``. (This corresponds to a matrix multiplication followed by a transpose.) - In the explicit form, output indices can be directly stated following the identifier ``'->'`` in the summation expression. + In the explicit form, output mode labels can be directly stated following the identifier ``'->'`` in the summation expression. An example is the expression ``'ij,jh->ih'`` (which corresponds to a matrix multiplication). 
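As an illustrative aside (not part of the patch), a minimal user-defined allocator that satisfies the ``BaseCUDAMemoryManager`` protocol above might look as follows; the class name is hypothetical, and it simply mirrors the internal ``_RawCUDAMemoryManager``. Such an object can be passed as the ``allocator`` in ``NetworkOptions``::

    import cupy as cp
    from cuquantum import NetworkOptions
    from cuquantum.cutensornet.memory import MemoryPointer

    class RawAllocator:
        """Allocate with the raw CUDA runtime; free via a finalizer."""

        def __init__(self, device_id=0):
            self.device = cp.cuda.Device(device_id)

        def memalloc(self, size):
            with self.device:
                ptr = cp.cuda.runtime.malloc(size)

            def finalizer():
                with self.device:
                    cp.cuda.runtime.free(ptr)

            return MemoryPointer(ptr, size, finalizer=finalizer)

    options = NetworkOptions(allocator=RawAllocator())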
- To specify an Einstein summation expression, both the subscript format (as shown above) and the ""interleaved" format + To specify an Einstein summation expression, both the subscript format (as shown above) and the interleaved format are supported. - The interleaved format is an alternative way for specifying the operands and their modes as + The interleaved format is an alternative way for specifying the operands and their mode labels as ``Network(op0, modes0, op1, modes1, ..., [modes_out])``, where ``opN`` - is the N-th operand and ``modesN`` is a sequence of hashable object (strings, integers, etc) representing the N-th operand's - modes. + is the N-th operand and ``modesN`` is a sequence of hashable and comparable objects (strings, integers, etc) representing the + N-th operand's mode labels. - Ellipsis broadcasting is currently *not supported*. + Ellipsis broadcasting is supported. Additional information on various operations on the network can be obtained by passing in a :class:`logging.Logger` object - to :class:`NetworkOptions` or by setting the appropriate options in the root logger object, which is used by default:: + to :class:`NetworkOptions` or by setting the appropriate options in the root logger object, which is used by default: - import logging - logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') + >>> import logging + >>> logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S') Args: - subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are - allowed in the expression thereby expanding the size of the tensor network that can be specified using the - Einstein summation convention. - operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + subscripts : The mode labels (subscripts) defining the Einstein summation expression as a comma-separated sequence of + characters. Unicode characters are allowed in the expression thereby expanding the size of the tensor network that + can be specified using the Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`. options : Specify options for the tensor network as a :class:`~cuquantum.NetworkOptions` object. Alternatively, a `dict` containing the parameters for the ``NetworkOptions`` constructor can also be provided. If not specified, @@ -78,11 +79,6 @@ class Network: See Also: :meth:`~Network.contract_path`, :meth:`autotune`, :meth:`~Network.contract`, :meth:`reset_operands` - Note: - In this release, only the *classical* Einstein summation is supported -- an index (mode) must appear exactly once or - twice. An index that appears twice represents an inner product on that dimension. If an index appears once, - it must appear in the output. - Examples: >>> from cuquantum import Network @@ -98,15 +94,15 @@ class Network: >>> operands = [np.random.rand(*shape) for shape in shapes] Create a :class:`Network` object: - + >>> n = Network(expr, *operands) Find the best contraction order: - + >>> path, info = n.contract_path({'samples': 500}) Autotune the network: - + >>> n.autotune(iterations=5) Perform the contraction. 
The result is of the same type and on the same device as the operands: @@ -125,14 +121,14 @@ class Network: >>> np.allclose(r2, factorial(len(operands))*r1) True - Finally, free network resources. If this call isn't made, it may hinder further operations (especially if the - network is large) as it causes **memory leak**. (*To avoid having to explicitly make this call, it is recommended - to use the* :class:`Network` *object as a context manager*.) + Finally, free network resources. If this call isn't made, it may hinder further operations (especially if the + network is large) since the memory will be released only when the object goes out of scope. (*To avoid having + to explicitly make this call, it is recommended to use the* :class:`Network` *object as a context manager*.) >>> n.free() - If the operands are on the GPU, they can also be updated using in-place operations. In this case, the call - to :meth:`reset_operands` can be skipped -- subsequent :meth:`~Network.contract` calls will use the same + If the operands are on the GPU, they can also be updated using in-place operations. In this case, the call + to :meth:`reset_operands` can be skipped -- subsequent :meth:`~Network.contract` calls will use the same operands (with updated contents). The following example illustrates this using CuPy operands and also demonstrates the usage of a :class:`Network` context (so as to skip calling :meth:`free`): @@ -175,8 +171,10 @@ def __init__(self, *operands, options=None): options = utils.check_or_create_options(configuration.NetworkOptions, options, "network options") self.options = options - # Logger + # Logger. self.logger = options.logger if options.logger is not None else logging.getLogger() + self.logger.info(f"CUDA runtime version = {cutn.get_cudart_version()}") + self.logger.info(f"cuTensorNet version = {cutn.MAJOR_VER}.{cutn.MINOR_VER}.{cutn.PATCH_VER}") self.logger.info("Beginning network creation...") # Parse Einsum expression. @@ -190,21 +188,31 @@ def __init__(self, *operands, options=None): self.device_id = options.device_id self.operands = tensor_wrapper.to(self.operands, self.device_id) + # Infer the library package the operands belong to. + self.package = utils.get_operands_package(self.operands) + # The output class is that of the first wrapped device operand. self.output_class = self.operands[0].__class__ - - # Ensure all the operands are on the same device + self.device = cp.cuda.Device(self.device_id) - # Set memory limit - self.memory_limit = utils.get_memory_limit(self.options.memory_limit, self.device) - self.logger.info(f"The memory limit is {self.memory_limit} bytes.") + # Set memory allocator. + self.allocator = options.allocator if options.allocator is not None else memory._MEMORY_MANAGER[self.package](self.device_id, self.logger) + + # Set memory limit. + self.memory_limit = utils.get_memory_limit(self.options.memory_limit, self.device) + self.logger.info(f"The memory limit is {formatters.MemoryStr(self.memory_limit)}.") - # Define data types + # Define data types. self.data_type = utils.get_operands_dtype(self.operands) + if self.data_type not in typemaps.NAME_TO_COMPUTE_TYPE: + message = f"""Unsupported data type. +The data type '{self.data_type}' is currently not supported. +""" + raise ValueError(message) self.compute_type = options.compute_type if options.compute_type is not None else typemaps.NAME_TO_COMPUTE_TYPE[self.data_type] - # Prepare data for cutensornet + # Prepare data for cutensornet. 
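        # The mode labels, extents, strides, and pointer alignments gathered
        # below are handed to create_network_descriptor(), mirroring the C
        # signature documented earlier.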
num_inputs = len(self.inputs)
         num_modes_out = len(self.output)
@@ -215,9 +223,9 @@ def __init__(self, *operands, options=None):
         num_modes_in = tuple(len(m) for m in modes_in)

         self.contraction, modes_out, extents_out, strides_out, alignment_out = utils.create_output_tensor(
-            self.output_class, self.output, self.size_dict, self.device_id, self.data_type)
+            self.output_class, self.package, self.output, self.size_dict, self.device, self.data_type)

-        # Create/set handle
+        # Create/set handle.
         if options.handle is not None:
             self.own_handle = False
             self.handle = options.handle
@@ -226,7 +234,7 @@ def __init__(self, *operands, options=None):
             with self.device:
                 self.handle = cutn.create()

-        # Network definition
+        # Network definition.
         self.network = cutn.create_network_descriptor(self.handle,
             num_inputs, num_modes_in, extents_in, strides_in, modes_in, alignments_in,     # inputs
             num_modes_out, extents_out, strides_out, modes_out, alignment_out,             # output
@@ -235,11 +243,19 @@ def __init__(self, *operands, options=None):
         # Keep output extents for creating new tensors, if needed.
         self.extents_out = extents_out

+        # Path optimization attributes.
         self.optimizer_config_ptr, self.optimizer_info_ptr = None, None
-        self.workspace, self.workspace_size = None, None
+        self.optimized = False
+
+        # Workspace attributes.
+        self.workspace_desc = cutn.create_workspace_descriptor(self.handle)
+        self.workspace_ptr, self.workspace_size = None, None
+
+        # Contraction plan attributes.
         self.plan = None
+
+        # Autotuning attributes.
         self.autotune_pref_ptr = None
-        self.optimized = False
         self.autotuned = False

         self.valid_state = True
@@ -276,17 +292,11 @@ def _free_plan_resources(self, exception=None):

         return True

-    def _free_workspace_resources(self, exception=None):
+    def _free_workspace_memory(self, exception=None):
         """
-        Free resources related to network workspace.
+        Free workspace by releasing the MemoryPointer object.
         """
-
-        if self.workspace is not None:
-            with self.device:
-                cp.cuda.runtime.free(self.workspace)
-            self.workspace = None
-
-        self.workspace_size = None
+        self.workspace_ptr = None

         return True

@@ -303,34 +313,65 @@ def _free_path_resources(self, exception=None):
             cutn.destroy_contraction_optimizer_info(self.optimizer_info_ptr)
             self.optimizer_info_ptr = None

-        self._free_workspace_resources()
+        self._free_workspace_memory()
+        self.workspace_size = None
         self._free_plan_resources()

         return True

     @utils.precondition(_check_valid_network)
-    @utils.precondition(_check_optimized, "Workspace allocation")
-    @utils.atomic(_free_workspace_resources, method=True)
-    def _allocate_workspace(self):
+    @utils.precondition(_check_optimized, "Workspace memory allocation")
+    @utils.atomic(_free_workspace_memory, method=True)
+    def _allocate_workspace_memory_perhaps(self, stream_ctx):
+        if self.workspace_ptr is not None:
+            return
+
+        assert self.workspace_size is not None, "Internal Error."
+
+        self.logger.debug("Allocating memory for contracting the tensor network...")
+        with self.device, stream_ctx:
+            try:
+                self.workspace_ptr = self.allocator.memalloc(self.workspace_size)
+            except TypeError as e:
+                message = "The method 'memalloc' in the allocator object must conform to the interface in the "\
+                          "'BaseCUDAMemoryManager' protocol."
+ raise TypeError(message) from e + self.logger.debug(f"Finished allocating memory of size {formatters.MemoryStr(self.workspace_size)} for contraction.") + + device_ptr = utils.get_ptr_from_memory_pointer(self.workspace_ptr) + cutn.workspace_set(self.handle, self.workspace_desc, cutn.Memspace.DEVICE, device_ptr, self.workspace_size) + self.logger.debug(f"The workspace memory (device pointer = {device_ptr}) has been set in the workspace descriptor.") + + @utils.precondition(_check_valid_network) + @utils.precondition(_check_optimized, "Workspace size calculation") + def _calculate_workspace_size(self): """ Allocate workspace for cutensornet. """ - workspace_size = cutn.contraction_get_workspace_size(self.handle, self.network, self.optimizer_info_ptr) - self.workspace_size = workspace_size + # Release workspace already allocated, if any, because the new requirements are likely different. + self.workspace_ptr = None - self.logger.debug("Allocating workspace for contraction...") + cutn.workspace_compute_sizes(self.handle, self.network, self.optimizer_info_ptr, self.workspace_desc) - with self.device: - if self.workspace: - cp.cuda.runtime.free(self.workspace) + min_size = cutn.workspace_get_size(self.handle, self.workspace_desc, cutn.WorksizePref.MIN, cutn.Memspace.DEVICE) + max_size = cutn.workspace_get_size(self.handle, self.workspace_desc, cutn.WorksizePref.MAX, cutn.Memspace.DEVICE) + + if self.memory_limit < min_size: + message = f"""Insufficient memory. +The memory limit specified is {self.memory_limit}, while the minimum workspace size needed is {min_size}. +""" + raise RuntimeError(message) + + self.workspace_size = max_size if max_size < self.memory_limit else self.memory_limit + self.logger.info(f"The workspace size requirements range from {formatters.MemoryStr(min_size)} to "\ + f"{formatters.MemoryStr(max_size)}.") + self.logger.info(f"The workspace size has been set to {formatters.MemoryStr(self.workspace_size)}.") - self.workspace = cp.cuda.runtime.malloc(workspace_size) + # Set workspace size to enable contraction planning. The device pointer will be set later during allocation. + cutn.workspace_set(self.handle, self.workspace_desc, cutn.Memspace.DEVICE, 0, self.workspace_size) - value, unit = utils.convert_memory_with_units(workspace_size) - value = f"{value}" if unit == "B" else f"{value:0.2f}" - self.logger.debug(f"Finished allocating workspace of size {value} {unit} for contraction.") @utils.precondition(_check_valid_network) @utils.precondition(_check_optimized, "Planning") @@ -345,10 +386,10 @@ def _create_plan(self): if self.plan: cutn.destroy_contraction_plan(self.plan) - self.plan = cutn.create_contraction_plan(self.handle, self.network, self.optimizer_info_ptr, self.workspace_size) + self.plan = cutn.create_contraction_plan(self.handle, self.network, self.optimizer_info_ptr, self.workspace_desc) self.logger.debug("Finished creating contraction plan.") - + def _set_opt_config_options(self, options): """ Set ContractionOptimizerConfig options if the value is not None. @@ -389,47 +430,49 @@ def _set_optimizer_options(self, optimize): assert isinstance(optimize.path, configuration.PathFinderOptions), "Internal error." - # PathFinder options + # PathFinder options. self._set_opt_config_options(optimize.path) - # Slicer options + # Slicer options. if isinstance(optimize.slicing, configuration.SlicerOptions): self._set_opt_config_options(optimize.slicing) - # Reconfiguration options + # Reconfiguration options. 
self._set_opt_config_options(optimize.reconfiguration) - # The "global" options + # The "global" options. ConfEnum = cutn.ContractionOptimizerConfigAttribute enum = ConfEnum.HYPER_NUM_SAMPLES self._set_opt_config_option('samples', enum, optimize.samples) + enum = ConfEnum.HYPER_NUM_THREADS + self._set_opt_config_option('threads', enum, optimize.threads) + enum = ConfEnum.SEED self._set_opt_config_option('seed', enum, optimize.seed) @utils.precondition(_check_valid_network) @utils.atomic(_free_path_resources, method=True) def contract_path(self, optimize=None): - """Compute the best contraction path together with any slicing that is needed to ensure that the contraction can be + """Compute the best contraction path together with any slicing that is needed to ensure that the contraction can be performed within the specified memory limit. Args: - optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a - dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not + optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a + dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed ``OptimizerOptions`` object. - Returns: + Returns: tuple: A 2-tuple (``path``, ``opt_info``): - - ``path`` : A sequence of pairs of operand indices representing the best contraction order in the + - ``path`` : A sequence of pairs of operand ordinals representing the best contraction order in the :func:`numpy.einsum_path` format. - ``opt_info`` : An object of type :class:`OptimizerInfo` containing information about the best contraction order. Notes: - If the path is provided, the user has to set the sliced modes too if slicing is desired. - - If the path or sliced modes are provided, the metrics in :class:`OptimizerInfo` may not be correct. """ optimize = utils.check_or_create_options(configuration.OptimizerOptions, optimize, "path optimizer options") @@ -441,7 +484,7 @@ def contract_path(self, optimize=None): opt_info_ifc = optimizer_ifc.OptimizerInfoInterface(self) - # Compute path (or set provided path) + # Compute path (or set provided path). if isinstance(optimize.path, configuration.PathFinderOptions): # Set optimizer options. self._set_optimizer_options(optimize) @@ -454,7 +497,7 @@ def contract_path(self, optimize=None): opt_info_ifc.path = optimize.path self.logger.info("Finished setting user-provided path.") - # Set slicing if provided + # Set slicing if provided. if not isinstance(optimize.slicing, configuration.SlicerOptions): self.logger.info("Setting user-provided sliced modes...") opt_info_ifc.sliced_mode_extent = optimize.slicing @@ -463,20 +506,21 @@ def contract_path(self, optimize=None): self.num_slices = opt_info_ifc.num_slices assert self.num_slices > 0 - # Create OptimizerInfo object here + # Create OptimizerInfo object. largest_intermediate = opt_info_ifc.largest_intermediate opt_cost = opt_info_ifc.flop_count path = opt_info_ifc.path slices = opt_info_ifc.sliced_mode_extent opt_info = configuration.OptimizerInfo(largest_intermediate, opt_cost, path, slices) + self.logger.info(f"{opt_info}") self.optimized = True - # Allocate workspace - self._allocate_workspace() + # Calculate workspace size required. + self._calculate_workspace_size() - # Create plan + # Create plan. 
self._create_plan() return opt_info.path, opt_info @@ -522,7 +566,7 @@ def autotune(self, *, iterations=3, stream=None): Args: iterations: The number of iterations for autotuning. See `CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS`. stream: Provide the CUDA stream to use for the autotuning operation. Acceptable inputs include ``cudaStream_t`` - (as Python `int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, + (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, the current stream will be used. """ @@ -535,16 +579,19 @@ def autotune(self, *, iterations=3, stream=None): options = {'iterations': (AutoEnum.MAX_ITERATIONS, iterations)} self._set_autotune_options(options) + # Allocate device memory (in stream context) if needed. + stream, stream_ctx, stream_ptr = utils.get_or_create_stream(self.device, stream, self.package) + self._allocate_workspace_memory_perhaps(stream_ctx) + # Check if we still hold an output tensor; if not, create a new one. if self.contraction is None: - self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id) + self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id, stream_ctx) self.logger.info(f"Starting autotuning...") - stream, stream_ptr = utils.get_or_create_stream(self.device, stream) with self.device: start = stream.record() - cutn.contraction_autotune(self.handle, self.plan, self.operands_data, self.contraction.data_ptr, - self.workspace, self.workspace_size, self.autotune_pref_ptr, stream_ptr) + cutn.contraction_autotune(self.handle, self.plan, self.operands_data, self.contraction.data_ptr, + self.workspace_desc, self.autotune_pref_ptr, stream_ptr) end = stream.record() end.synchronize() elapsed = cp.cuda.get_elapsed_time(start, end) @@ -562,7 +609,8 @@ def reset_operands(self, *operands): This method will perform various checks on the new operands to make sure: - The shapes, strides, datatypes match those of the old ones. - - If input tensors are on GPU, the device and alignments must match. + - The packages that the operands belong to match those of the old ones. + - If input tensors are on GPU, the library package, device, and alignments must match. Args: operands: See :class:`Network`'s documentation. @@ -573,7 +621,7 @@ def reset_operands(self, *operands): raise ValueError(message) self.logger.info("Resetting operands...") - # First wrap operands + # First wrap operands. operands = tensor_wrapper.wrap_operands(operands) utils.check_operands_match(self.operands, operands, 'dtype', "data type") @@ -585,6 +633,11 @@ def reset_operands(self, *operands): # Copy to existing device pointers because the new operands are on the CPU. tensor_wrapper.copy_(operands, self.operands) else: + package = utils.get_operands_package(operands) + if self.package != package: + message = f"Library package mismatch: '{self.package}' => '{package}'" + raise TypeError(message) + if self.device_id != device_id: raise ValueError(f"The new operands must be on the same device ({device_id}) as the original operands " f"({self.device_id}).") @@ -604,25 +657,27 @@ def contract(self, *, stream=None): Args: stream: Provide the CUDA stream to use for the autotuning operation. Acceptable inputs include ``cudaStream_t`` - (as Python `int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. 
If a stream is not provided, + (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, the current stream will be used. Returns: The result is of the same type and on the same device as the operands. """ - self.logger.info("Starting network contraction...") - stream, stream_ptr = utils.get_or_create_stream(self.device, stream) + # Allocate device memory (in stream context) if needed. + stream, stream_ctx, stream_ptr = utils.get_or_create_stream(self.device, stream, self.package) + self._allocate_workspace_memory_perhaps(stream_ctx) # Check if we still hold an output tensor; if not, create a new one. if self.contraction is None: - self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id) + self.contraction = utils.create_empty_tensor(self.output_class, self.extents_out, self.data_type, self.device_id, stream_ctx) + self.logger.info("Starting network contraction...") with self.device: start = stream.record() for s in range(self.num_slices): - cutn.contraction(self.handle, self.plan, self.operands_data, self.contraction.data_ptr, self.workspace, - self.workspace_size, s, stream_ptr) + cutn.contraction(self.handle, self.plan, self.operands_data, self.contraction.data_ptr, + self.workspace_desc, s, stream_ptr) end = stream.record() end.synchronize() elapsed = cp.cuda.get_elapsed_time(start, end) @@ -633,7 +688,7 @@ def contract(self, *, stream=None): out = self.contraction.to('cpu') else: out = self.contraction.tensor - self.contraction = None # we cannot overwrite what we already hand to users + self.contraction = None # We cannot overwrite what we've already handed to users. return out def free(self): @@ -653,6 +708,10 @@ def free(self): cutn.destroy_contraction_autotune_preference(self.autotune_pref_ptr) self.autotune_pref_ptr = None + if self.workspace_desc is not None: + cutn.destroy_workspace_descriptor(self.workspace_desc) + self.workspace_desc = None + if self.network is not None: cutn.destroy_network_descriptor(self.network) self.network = None @@ -661,10 +720,10 @@ def free(self): cutn.destroy(self.handle) self.handle = None self.own_handle = False - except BaseException as e: + except Exception as e: self.logger.critical("Internal error: only part of the network resources have been released.") - self.logger.critical(e) - raise e + self.logger.critical(str(e)) + raise e finally: self.valid_state = False @@ -678,31 +737,30 @@ def contract(*operands, options=None, optimize=None, stream=None, return_info=Fa Evaluate the Einstein summation convention on the operands. Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format, - the "interleaved" format is also supported as a means of specifying the operands and their modes. See :class:`Network` for more - detail on the types of operands as well as for examples. - - Args: - subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are - allowed in the expression thereby expanding the size of the tensor network that can be specified using the - Einstein summation convention. - operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + the interleaved format is also supported as a means of specifying the operands and their mode labels. See :class:`Network` + for more detail on the types of operands as well as for examples. 
+ + Args: + subscripts : The mode labels (subscripts) defining the Einstein summation expression as a comma-separated sequence of + characters. Unicode characters are allowed in the expression thereby expanding the size of the tensor network that + can be specified using the Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`. options : Specify options for the tensor network as a :class:`~cuquantum.NetworkOptions` object. Alternatively, a `dict` containing the parameters for the ``NetworkOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed ``NetworkOptions`` object. - optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a - dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not + optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a + dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed ``OptimizerOptions`` object. stream: Provide the CUDA stream to use for the autotuning operation. Acceptable inputs include ``cudaStream_t`` - (as Python `int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, + (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, the current stream will be used. return_info : If true, information about the best contraction order will also be returned. Returns: If ``return_info`` is `False`, the output tensor (ndarray-like object) of the same type and on the same device - as the operands containing - the result of the contraction; otherwise, a 2-tuple consisting of the output tensor and an :class:`OptimizerInfo` - object that contains information about the best contraction order etc. + as the operands containing the result of the contraction; otherwise, a 2-tuple consisting of the output tensor and an + :class:`OptimizerInfo` object that contains information about the best contraction order etc. .. note:: It is encouraged for users to maintain the library handle themselves so as to reduce the context initialization time: @@ -719,15 +777,15 @@ def contract(*operands, options=None, optimize=None, stream=None, return_info=Fa cutensornet.destroy(handle) Examples: - + Use NumPy operands: >>> from cuquantum import contract >>> import numpy as np - >>> a = np.ones((3,2)) - >>> b = np.ones((2,3)) + >>> a = np.arange(6.).reshape(3, 2) + >>> b = np.arange(6.).reshape(2, 3) - Perform matrix multiplication in the explicit form. The result ``r`` is a NumPy ndarray (with the computation + Perform matrix multiplication using the explicit form. 
The result ``r`` is a NumPy ndarray (with the computation performed on the GPU): >>> r = contract('ij,jk->ik', a, b) @@ -736,15 +794,15 @@ def contract(*operands, options=None, optimize=None, stream=None, return_info=Fa >>> r = contract('ij,jk', a, b) - Interleaved format using characters for modes: + Interleaved format using characters for mode labels: >>> r = contract(a, ['i', 'j'], b, ['j', 'k'], ['i', 'k'], return_info=True) - Interleaved format using string labels for modes, using implicit form: + Interleaved format using string labels for mode labels and implicit form: >>> r = contract(a, ['first', 'second'], b, ['second', 'third']) - Interleaved format using integer modes, using explicit form: + Interleaved format using integer mode labels and explicit form: >>> r = contract(a, [1, 2], b, [2, 3], [1, 3]) @@ -778,22 +836,34 @@ def contract(*operands, options=None, optimize=None, stream=None, return_info=Fa >>> o = OptimizerOptions(path = [(0,2), (0,1)]) >>> r = contract('ij,jk,kl', a, b, a, optimize=o) + Perform elementwise multiplication :math:`a \odot b^T` using the ellipsis shorthand notation: + + >>> r = contract('...,...', a, b.T) + + Obtain the double inner product :math:`a : b^T` (Frobenius inner product for real-valued tensors) using the + ellipsis shorthand notation: + + >>> r = contract('...,...->', a, b.T) + Use CuPy operands. The result ``r`` is a CuPy ndarray on the same device as the operands, and ``dev`` is any valid device ID on your system that you wish to use to store the tensors and compute the contraction: >>> import cupy >>> dev = 0 >>> with cupy.cuda.Device(dev): - ... a = cupy.ones((3,2)) - ... b = cupy.ones((2,3)) + ... a = cupy.arange(6.).reshape(3, 2) + ... b = cupy.arange(6.).reshape(2, 3) >>> r = contract('ij,jk', a, b) Use PyTorch operands. The result ``r`` is a PyTorch tensor on the same device (``dev``) as the operands: + .. doctest:: + :skipif: torch is None + >>> import torch >>> dev = 0 - >>> a = torch.ones((3,2), device=f'cuda:{dev}') - >>> b = torch.ones((2,3), device=f'cuda:{dev}') + >>> a = torch.arange(6., device=f'cuda:{dev}').reshape(3, 2) + >>> b = torch.arange(6., device=f'cuda:{dev}').reshape(2, 3) >>> r = contract('ij,jk', a, b) """ @@ -801,15 +871,15 @@ def contract(*operands, options=None, optimize=None, stream=None, return_info=Fa optimize = utils.check_or_create_options(configuration.OptimizerOptions, optimize, "path optimizer options") - # Create network + # Create network. with Network(*operands, options=options) as network: - # Compute path + # Compute path. opt_info = network.contract_path(optimize=optimize) # Skip autotuning since the network is contracted only once. - # Contraction + # Contraction. output = network.contract(stream=stream) if return_info: @@ -825,26 +895,26 @@ def contract_path(*operands, options=None, optimize=None): Evaluate the "best" contraction order by allowing the creation of intermediate tensors. Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format, - the "interleaved" format is also supported as a means of specifying the operands and their modes. See :class:`Network` for more - detail on the types of operands as well as for examples. - - Args: - subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are - allowed in the expression thereby expanding the size of the tensor network that can be specified using the - Einstein summation convention. 
- operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + the interleaved format is also supported as a means of specifying the operands and their mode labels. See :class:`Network` + for more detail on the types of operands as well as for examples. + + Args: + subscripts : The mode labels (subscripts) defining the Einstein summation expression as a comma-separated sequence of + characters. Unicode characters are allowed in the expression thereby expanding the size of the tensor network that + can be specified using the Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`. options : Specify options for the tensor network as a :class:`~cuquantum.NetworkOptions` object. Alternatively, a `dict` containing the parameters for the ``NetworkOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed ``NetworkOptions`` object. - optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a - dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not + optimize : This parameter specifies options for path optimization as an :class:`OptimizerOptions` object. Alternatively, a + dictionary containing the parameters for the ``OptimizerOptions`` constructor can also be provided. If not specified, the value will be set to the default-constructed ``OptimizerOptions`` object. - Returns: + Returns: tuple: A 2-tuple (``path``, ``opt_info``): - - ``path`` : A sequence of pairs of operand indices representing the best contraction order in the + - ``path`` : A sequence of pairs of operand ordinals representing the best contraction order in the :func:`numpy.einsum_path` format. - ``opt_info`` : An object of type :class:`OptimizerInfo` containing information about the best contraction order. @@ -868,10 +938,10 @@ def contract_path(*operands, options=None, optimize=None): optimize = utils.check_or_create_options(configuration.OptimizerOptions, optimize, "path optimizer options") - # Create network + # Create network. with Network(*operands, options=options) as network: - # Compute path + # Compute path. path, opt_info = network.contract_path(optimize=optimize) return path, opt_info @@ -914,14 +984,14 @@ def einsum(*operands, out=None, dtype=None, order='K', casting='safe', optimize= specific to cuTensorNet and is recommended over this function. Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format, - the "interleaved" format is also supported as a means of specifying the operands and their modes. See :class:`Network` for more - detail on the types of operands as well as for examples. - - Args: - subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are - allowed in the expression thereby expanding the size of the tensor network that can be specified using the - Einstein summation convention. - operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + the interleaved format is also supported as a means of specifying the operands and their mode labels. See :class:`Network` + for more detail on the types of operands as well as for examples. 
+ + Args: + subscripts : The mode labels (subscripts) defining the Einstein summation expression as a comma-separated sequence of + characters. Unicode characters are allowed in the expression thereby expanding the size of the tensor network that + can be specified using the Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`. out : Not supported in this release. dtype : Not supported in this release. @@ -938,27 +1008,27 @@ def einsum(*operands, out=None, dtype=None, order='K', casting='safe', optimize= _check_einsum_options(out, dtype, order, casting, optimize) - # Create network + # Create network. with Network(*operands) as network: if optimize is True: - # Compute path + # Compute path. network.contract_path() else: if optimize is False: # Use canonical path. path = [(0, 1)] * (network.num_inputs - 1) - else: + else: # Use specified path. path = optimize - + # Set path (path validation is done when setting OptimizerOptions). optimize = configuration.OptimizerOptions(path=path) network.contract_path(optimize=optimize) # Skip autotuning since the network is contracted only once. - # Contraction + # Contraction. output = network.contract() return output @@ -974,23 +1044,24 @@ def einsum_path(*operands, optimize=True): extensive set of options specific to cuTensorNet and is recommended over this function. Explicit as well as implicit form is supported for the Einstein summation expression. In addition to the subscript format, - the "interleaved" format is also supported as a means of specifying the operands and their modes. See `Network` for more - detail on the types of operands as well as for examples. - - Args: - subscripts : The modes (subscripts) for summation as a comma-separated list of characters. Unicode characters are - allowed in the expression thereby expanding the size of the tensor network that can be specified using the - Einstein summation convention. - operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, + the interleaved format is also supported as a means of specifying the operands and their mode labels. See :class:`Network` + for more detail on the types of operands as well as for examples. + + Args: + subscripts : The mode labels (subscripts) defining the Einstein summation expression as a comma-separated sequence of + characters. Unicode characters are allowed in the expression thereby expanding the size of the tensor network that + can be specified using the Einstein summation convention. + operands : A sequence of tensors (ndarray-like objects). The currently supported types are :class:`numpy.ndarray`, :class:`cupy.ndarray`, and :class:`torch.Tensor`. optimize : This parameter specifies options for path optimization. The only value allowed with this interface is `True`. Returns: tuple: A 2-tuple (``path``, ``opt_info``): - - ``path`` : A sequence of pairs of operand indices representing the best contraction order in the - :func:`numpy.einsum_path` format. - - ``opt_info`` : An object of type :class:`OptimizerInfo` containing information about the best contraction order. + - ``path`` : A list starting with the string 'einsum_path' and followed by a sequence of pairs of operand ordinals + representing the best contraction order in the :func:`numpy.einsum_path` format. 
+ - ``opt_info`` : String representation of an object of type :class:`OptimizerInfo` containing information about + the best contraction order. """ if optimize is not True: @@ -998,10 +1069,10 @@ def einsum_path(*operands, optimize=True): The only allowed value for 'optimize' is True.""" raise NotImplementedError(message) - # Create network + # Create network. with Network(*operands) as network: - # Compute path + # Compute path. path, opt_info = network.contract_path() - return path, opt_info + return ['einsum_path', *path], str(opt_info) diff --git a/python/cuquantum/utils.pxd b/python/cuquantum/utils.pxd index e5a4b7b..f201123 100644 --- a/python/cuquantum/utils.pxd +++ b/python/cuquantum/utils.pxd @@ -1,6 +1,12 @@ +from libc.stdint cimport intptr_t cimport cpython +cdef extern from * nogil: + # from CUDA + ctypedef int Stream 'cudaStream_t' + + cdef inline bint is_nested_sequence(data): if not cpython.PySequence_Check(data): return False @@ -10,3 +16,49 @@ cdef inline bint is_nested_sequence(data): return False else: return True + + +cdef inline int cuqnt_alloc_wrapper(void* ctx, void** ptr, size_t size, Stream stream) with gil: + """Assuming the user provides an alloc routine: ptr = alloc(size, stream). + + Note: this function holds the Python GIL. + """ + cdef tuple pairs + + try: + pairs = (ctx) + user_alloc = pairs[0] + ptr[0] = (user_alloc(size, stream)) + except: + # TODO: logging? + return 1 + else: + return 0 + + +cdef inline int cuqnt_free_wrapper(void* ctx, void* ptr, size_t size, Stream stream) with gil: + """Assuming the user provides a free routine: free(ptr, size, stream). + + Note: this function holds the Python GIL. + """ + cdef tuple pairs + + try: + pairs = (ctx) + user_free = pairs[1] + user_free(ptr, size, stream) + except: + # TODO: logging? + return 1 + else: + return 0 + + +cdef inline void logger_callback_with_data( + int log_level, const char* func_name, const char* message, + void* func_arg) with gil: + func, args, kwargs = func_arg + cdef bytes function_name = func_name + cdef bytes function_message = message + func(log_level, function_name.decode(), function_message.decode(), + *args, **kwargs) diff --git a/python/samples/accessor_get.py b/python/samples/accessor_get.py index 229f508..6116f22 100644 --- a/python/samples/accessor_get.py +++ b/python/samples/accessor_get.py @@ -29,7 +29,7 @@ handle = cusv.create() # create accessor and check the size of external workspace -accessor, workspace_size = cusv.accessor_create_readonly( +accessor, workspace_size = cusv.accessor_create_view( handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, bitOrdering, len(bitOrdering), maskBitString, maskOrdering, maskLen) @@ -47,6 +47,9 @@ cusv.accessor_get( handle, accessor, h_buf.ctypes.data, accessBegin, accessEnd) +# destroy accessor +cusv.accessor_destroy(accessor) + # destroy handle cusv.destroy(handle) diff --git a/python/samples/accessor_set.py b/python/samples/accessor_set.py index 95ba1ff..10a3cb7 100644 --- a/python/samples/accessor_set.py +++ b/python/samples/accessor_set.py @@ -41,6 +41,9 @@ cusv.accessor_set( handle, accessor, h_buf.ctypes.data, 0, nSvSize) +# destroy accessor +cusv.accessor_destroy(accessor) + # destroy handle cusv.destroy(handle) diff --git a/python/samples/coarse/example12.py b/python/samples/coarse/example12.py new file mode 100644 index 0000000..8cca3f2 --- /dev/null +++ b/python/samples/coarse/example12.py @@ -0,0 +1,34 @@ +""" +Verify FLOPS and largest intermediate size against NumPy for a given path. 
+""" +import re + +from cuquantum import contract_path, OptimizerOptions +import numpy as np + +expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba' +shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)] + +operands = [np.random.rand(*shape) for shape in shapes] + +# NumPy path and metrics. +path_np, i = np.einsum_path(expr, *operands) + +flops_np = float(re.search("Optimized FLOP count:(.*)\n", i).group(1)) +largest_np = float(re.search("Largest intermediate:(.*) elements\n", i).group(1)) +flops_np -= 1 # NumPy adds 1 to the FLOP count. + +# Set path and obtain metrics. +o = OptimizerOptions(path=path_np[1:]) +path, i = contract_path(expr, *operands, optimize=o) +assert list(path) == path_np[1:], "Error: path doesn't match what was set." + +flops = i.opt_cost +largest = i.largest_intermediate + +if flops != flops_np or largest != largest_np: + message = f""" Results don't match. +flops: NumPy = {flops_np}, cuTensorNet = {flops}, +largest intermediate: NumPy = {largest_np}, cuTensorNet = {largest} +""" + raise ValueError(message) diff --git a/python/samples/coarse/example13.py b/python/samples/coarse/example13.py new file mode 100644 index 0000000..7b6c0b9 --- /dev/null +++ b/python/samples/coarse/example13.py @@ -0,0 +1,19 @@ +""" +Set sliced modes. +""" +import re + +from cuquantum import contract, OptimizerOptions +import numpy as np + +expr = 'ehl,gj,edhg,bif,d,c,k,iklj,cf,a->ba' +shapes = [(8, 2, 5), (5, 7), (8, 8, 2, 5), (8, 6, 3), (8,), (6,), (5,), (6, 5, 5, 7), (6, 3), (3,)] + +operands = [np.random.rand(*shape) for shape in shapes] + +# Set sliced modes. +o = OptimizerOptions(slicing=(('e', 2), ('h',1))) + +r = contract(expr, *operands, optimize=o) +s = np.einsum(expr, *operands) +assert np.allclose(r, s), "Incorrect results." diff --git a/python/samples/coarse/example14.py b/python/samples/coarse/example14.py new file mode 100644 index 0000000..6e3aa07 --- /dev/null +++ b/python/samples/coarse/example14.py @@ -0,0 +1,17 @@ +""" +Example illustrating a generalized Einstein summation expression. +""" +import numpy as np + +from cuquantum import contract + + +a = np.arange(16.).reshape(4,4) +b = np.arange(64.).reshape(4,4,4) + +# Elementwise multiplication of tensor diagonals. +expr = "ii,iii->i" + +r = contract(expr, a, b) +s = np.einsum(expr, a, b) +assert np.allclose(r, s), "Incorrect results." diff --git a/python/samples/coarse/example15.py b/python/samples/coarse/example15.py new file mode 100644 index 0000000..4368099 --- /dev/null +++ b/python/samples/coarse/example15.py @@ -0,0 +1,19 @@ +""" +Example illustrating a generalized Einstein summation expression. +""" +import numpy as np + +from cuquantum import contract + + +a = np.random.rand(3,2) +b = np.random.rand(3,3) +c = np.random.rand(3,2) +d = np.random.rand(3,4) + +# A hyperedge example. +expr = "ij,ik,ij,kl->l" + +r = contract(expr, a, b, c, d) +s = np.einsum(expr, a, b, c, d) +assert np.allclose(r, s), "Incorrect results." diff --git a/python/samples/coarse/example16.py b/python/samples/coarse/example16.py new file mode 100644 index 0000000..b233112 --- /dev/null +++ b/python/samples/coarse/example16.py @@ -0,0 +1,17 @@ +""" +Example illustrating a batched operation. +""" +import numpy as np + +from cuquantum import contract + + +a = np.random.rand(2,4) +b = np.random.rand(2,4) + +# Batched inner product. +expr = "ij,ij->i" + +r = contract(expr, a, b) +s = np.einsum(expr, a, b) +assert np.allclose(r, s), "Incorrect results." 
diff --git a/python/samples/coarse/example17.py b/python/samples/coarse/example17.py
new file mode 100644
index 0000000..26725ed
--- /dev/null
+++ b/python/samples/coarse/example17.py
@@ -0,0 +1,16 @@
+"""
+Example illustrating mode broadcasting.
+"""
+import numpy as np
+
+from cuquantum import contract
+
+
+a = np.random.rand(3,1)
+b = np.random.rand(3,3)
+
+expr = "ij,jk"
+
+r = contract(expr, a, b)
+s = np.einsum(expr, a, b)
+assert np.allclose(r, s), "Incorrect results."
diff --git a/python/samples/coarse/example18.py b/python/samples/coarse/example18.py
new file mode 100644
index 0000000..4043e2f
--- /dev/null
+++ b/python/samples/coarse/example18.py
@@ -0,0 +1,17 @@
+"""
+Example illustrating ellipsis broadcasting.
+"""
+import numpy as np
+
+from cuquantum import contract
+
+
+a = np.random.rand(3,1)
+b = np.random.rand(3,3)
+
+# Elementwise product of two matrices.
+expr = "...,..."
+
+r = contract(expr, a, b)
+s = np.einsum(expr, a, b)
+assert np.allclose(r, s), "Incorrect results."
diff --git a/python/samples/coarse/example19.py b/python/samples/coarse/example19.py
new file mode 100644
index 0000000..7ea11bd
--- /dev/null
+++ b/python/samples/coarse/example19.py
@@ -0,0 +1,17 @@
+"""
+Example illustrating ellipsis broadcasting.
+"""
+import numpy as np
+
+from cuquantum import contract
+
+
+a = np.arange(3.).reshape(3,1)
+b = np.arange(9.).reshape(3,3)
+
+# Double inner product (Frobenius inner product) of two matrices.
+expr = "...,...->"
+
+r = contract(expr, a, b)
+print(r)
+assert np.allclose(r, 54.), "Incorrect results."
diff --git a/python/samples/coarse/example20.py b/python/samples/coarse/example20.py
new file mode 100644
index 0000000..9f6e4aa
--- /dev/null
+++ b/python/samples/coarse/example20.py
@@ -0,0 +1,34 @@
+"""
+Example illustrating a simple memory manager plugin using a PyTorch tensor as a memory buffer.
+"""
+import logging
+import torch
+
+from cuquantum import contract, MemoryPointer
+
+
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M:%S')
+
+class TorchMemMgr:
+    def __init__(self, device):
+        self.device = device
+        self.logger = logging.getLogger()
+
+    def memalloc(self, size):
+        buffer = torch.empty((size, ), device=self.device, dtype=torch.int8, requires_grad=False)
+        device_pointer = buffer.data_ptr()
+        self.logger.info(f"The user memory allocator has allocated {size} bytes at pointer {device_pointer}.")
+
+        def create_finalizer():
+            def finalizer():
+                buffer    # Keep buffer alive for as long as it is needed.
+ self.logger.info("The memory allocation has been released.") + return finalizer + + return MemoryPointer(device_pointer, size, finalizer=create_finalizer()) + +device_id = 0 +a = torch.rand((3,2), device=device_id) +b = torch.rand((2,3), device=device_id) + +r = contract("ij,jk", a, b, options={'allocator' : TorchMemMgr(device_id)}) diff --git a/python/samples/diagonal_matrix.py b/python/samples/diagonal_matrix.py index 46132ce..6c373b7 100644 --- a/python/samples/diagonal_matrix.py +++ b/python/samples/diagonal_matrix.py @@ -6,12 +6,12 @@ nIndexBits = 3 -nSvSize = (1 << nIndexBits) -nBasisBits = 1 -maskLen = 0 -adjoint = 0 +nSvSize = (1 << nIndexBits) +adjoint = 0 -basisBits = [2] +targets = [2] +n_targets = 1 +n_controls = 0 d_sv = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j, 0.2+0.2j, 0.3+0.3j, 0.3+0.4j, 0.4+0.5j], dtype=np.complex64) @@ -25,9 +25,9 @@ handle = cusv.create() # check the size of external workspace -workspaceSize = cusv.apply_generalized_permutation_matrix_buffer_size( +workspaceSize = cusv.apply_generalized_permutation_matrix_get_workspace_size( handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, 0, diagonals.ctypes.data, cuquantum.cudaDataType.CUDA_C_32F, - basisBits, nBasisBits, maskLen) + targets, n_targets, n_controls) if workspaceSize > 0: workspace = cp.cuda.memory.alloc(workspaceSize) workspace_ptr = workspace.ptr @@ -38,7 +38,7 @@ cusv.apply_generalized_permutation_matrix( handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, 0, diagonals.ctypes.data, cuquantum.cudaDataType.CUDA_C_32F, adjoint, - basisBits, nBasisBits, 0, 0, maskLen, + targets, n_targets, 0, 0, n_controls, workspace_ptr, workspaceSize) # destroy handle diff --git a/python/samples/expectation.py b/python/samples/expectation.py index 1d51939..48c850f 100644 --- a/python/samples/expectation.py +++ b/python/samples/expectation.py @@ -34,7 +34,7 @@ handle = cusv.create() # check the size of external workspace -workspaceSize = cusv.expectation_buffer_size( +workspaceSize = cusv.compute_expectation_get_workspace_size( handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F, cusv.MatrixLayout.ROW, nBasisBits, cuquantum.ComputeType.COMPUTE_32F) if workspaceSize > 0: @@ -44,7 +44,7 @@ workspace_ptr = 0 # apply gate -cusv.expectation( +cusv.compute_expectation( handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, expect.ctypes.data, cuquantum.cudaDataType.CUDA_C_64F, matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F, cusv.MatrixLayout.ROW, diff --git a/python/samples/expectation_pauli.py b/python/samples/expectation_pauli.py index bdd4c1a..4c64fef 100644 --- a/python/samples/expectation_pauli.py +++ b/python/samples/expectation_pauli.py @@ -23,9 +23,9 @@ handle = cusv.create() # apply Pauli operator -cusv.expectations_on_pauli_basis( +cusv.compute_expectations_on_pauli_basis( handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, exp_values.ctypes.data, - paulis, basisBits, nBasisBits, len(paulis)) + paulis, len(paulis), basisBits, nBasisBits) # destroy handle cusv.destroy(handle) diff --git a/python/samples/exponential_pauli.py b/python/samples/exponential_pauli.py index 47bbf3b..2c8a040 100644 --- a/python/samples/exponential_pauli.py +++ b/python/samples/exponential_pauli.py @@ -27,7 +27,7 @@ handle = cusv.create() # apply Pauli operator -cusv.apply_exp( +cusv.apply_pauli_rotation( handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, np.pi/2, paulis.ctypes.data, targets.ctypes.data, 
nTargets, controls.ctypes.data, controlBitValues.ctypes.data, nControls)
diff --git a/python/samples/gate_application.py b/python/samples/gate_application.py
index 08e65b9..fa5a03a 100644
--- a/python/samples/gate_application.py
+++ b/python/samples/gate_application.py
@@ -34,7 +34,7 @@
 # cuStateVec handle initialization
 handle = cusv.create()

-workspaceSize = cusv.apply_matrix_buffer_size(
+workspaceSize = cusv.apply_matrix_get_workspace_size(
     handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F,
     cusv.MatrixLayout.ROW, adjoint, nTargets, nControls, cuquantum.ComputeType.COMPUTE_32F)
@@ -48,8 +48,8 @@
 # apply gate
 cusv.apply_matrix(
     handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, matrix_ptr, cuquantum.cudaDataType.CUDA_C_32F,
-    cusv.MatrixLayout.ROW, adjoint, targets.ctypes.data, nTargets, controls.ctypes.data, nControls,
-    0, cuquantum.ComputeType.COMPUTE_32F, workspace_ptr, workspaceSize)
+    cusv.MatrixLayout.ROW, adjoint, targets.ctypes.data, nTargets, controls.ctypes.data, 0, nControls,
+    cuquantum.ComputeType.COMPUTE_32F, workspace_ptr, workspaceSize)

 # destroy handle
 cusv.destroy(handle)
diff --git a/python/samples/memory_handler.py b/python/samples/memory_handler.py
new file mode 100644
index 0000000..ccf6ff8
--- /dev/null
+++ b/python/samples/memory_handler.py
@@ -0,0 +1,117 @@
+import cupy as cp
+import numpy as np
+
+from cuquantum import custatevec as cusv
+from cuquantum import cudaDataType, ComputeType
+
+
+if cp.cuda.runtime.runtimeGetVersion() < 11020:
+    raise RuntimeError("memory_handler example WAIVED: This example uses CUDA's "
+                       "built-in stream-ordered memory allocator, which requires "
+                       "CUDA 11.2+.")
+
+nIndexBits = 3
+nSvSize = (1 << nIndexBits)
+
+sv = cp.asarray([0.48+1j*0.0, 0.36+1j*0.0, 0.64+1j*0.0, 0.48+1j*0.0,
+                 0.0+1j*0.0, 0.0+1j*0.0, 0.0+1j*0.0, 0.0+1j*0.0],
+                dtype=cp.complex128)
+
+# gates
+adjoint = 0
+layout = cusv.MatrixLayout.ROW
+
+# Hadamard gate
+hTargets = (2,)
+hNTargets = 1
+hGate = np.asarray([1/np.sqrt(2)+1j*0.0, 1/np.sqrt(2)+1j*0.0,
+                    1/np.sqrt(2)+1j*0.0, -1/np.sqrt(2)+1j*0.0],
+                   dtype=np.complex128)
+
+# control-SWAP gate
+swapTargets = (0, 1)
+swapNTargets = 2
+swapControls = (2,)
+swapNControls = 1
+swapGate = np.asarray([1.0+1j*0.0, 0.0+1j*0.0, 0.0+1j*0.0, 0.0+1j*0.0,
+                       0.0+1j*0.0, 0.0+1j*0.0, 1.0+1j*0.0, 0.0+1j*0.0,
+                       0.0+1j*0.0, 1.0+1j*0.0, 0.0+1j*0.0, 0.0+1j*0.0,
+                       0.0+1j*0.0, 0.0+1j*0.0, 0.0+1j*0.0, 1.0+1j*0.0],
+                      dtype=np.complex128)
+
+# observable
+basisBits = (2,)
+nBasisBits = 1
+observable = np.asarray([1.0+1j*0.0, 0.0+1j*0.0,
+                         0.0+1j*0.0, 0.0+1j*0.0], dtype=np.complex128)
+
+# check device config
+dev = cp.cuda.Device()
+if not dev.attributes['MemoryPoolsSupported']:
+    raise RuntimeError("memory_handler example WAIVED: device does not support CUDA memory pools")
+
+# avoid shrinking the pool
+mempool = cp.cuda.runtime.deviceGetDefaultMemPool(dev.id)
+if int(cp.__version__.split('.')[0]) >= 10:
+    # this API is exposed since CuPy v10
+    cp.cuda.runtime.memPoolSetAttribute(
+        mempool, cp.cuda.runtime.cudaMemPoolAttrReleaseThreshold, 0xffffffffffffffff)  # = UINT64_MAX
+
+# custatevec handle initialization
+handle = cusv.create()
+stream = cp.cuda.Stream()
+cusv.set_stream(handle, stream.ptr)
+
+# device memory handler
+# In Python we support 3 kinds of calling conventions as of v22.03; this example
+# involves using Python callables. Please refer to the documentation of
+# set_device_mem_handler() for further detail.
+def malloc(size, stream):
+    return cp.cuda.runtime.mallocAsync(size, stream)
+
+def free(ptr, size, stream):
+    cp.cuda.runtime.freeAsync(ptr, stream)
+
+handler = (malloc, free, "memory_handler python example")
+cusv.set_device_mem_handler(handle, handler)
+
+# apply Hadamard gate
+cusv.apply_matrix(
+    handle, sv.data.ptr, cudaDataType.CUDA_C_64F, nIndexBits,
+    hGate.ctypes.data, cudaDataType.CUDA_C_64F, layout, adjoint,
+    hTargets, hNTargets, 0, 0, 0, ComputeType.COMPUTE_DEFAULT,
+    0, 0)  # last two 0s indicate we're using our own mempool
+
+# apply control-SWAP gate
+cusv.apply_matrix(
+    handle, sv.data.ptr, cudaDataType.CUDA_C_64F, nIndexBits,
+    swapGate.ctypes.data, cudaDataType.CUDA_C_64F, layout, adjoint,
+    swapTargets, swapNTargets, swapControls, 0, swapNControls, ComputeType.COMPUTE_DEFAULT,
+    0, 0)  # last two 0s indicate we're using our own mempool
+
+# apply Hadamard gate
+cusv.apply_matrix(
+    handle, sv.data.ptr, cudaDataType.CUDA_C_64F, nIndexBits,
+    hGate.ctypes.data, cudaDataType.CUDA_C_64F, layout, adjoint,
+    hTargets, hNTargets, 0, 0, 0, ComputeType.COMPUTE_DEFAULT,
+    0, 0)  # last two 0s indicate we're using our own mempool
+
+# compute expectation
+expect = np.empty((1,), dtype=np.float64)
+cusv.compute_expectation(
+    handle, sv.data.ptr, cudaDataType.CUDA_C_64F, nIndexBits,
+    expect.ctypes.data, cudaDataType.CUDA_R_64F,
+    observable.ctypes.data, cudaDataType.CUDA_C_64F, layout,
+    basisBits, nBasisBits, ComputeType.COMPUTE_DEFAULT,
+    0, 0)  # last two 0s indicate we're using our own mempool
+
+stream.synchronize()
+
+# destroy handle
+cusv.destroy(handle)
+
+expectationValueResult = 0.9608
+if np.isclose(expect, expectationValueResult):
+    print("memory_handler example PASSED")
+else:
+    raise RuntimeError("memory_handler example FAILED: wrong result")
diff --git a/python/samples/mgpu_batch_measure.py b/python/samples/mgpu_batch_measure.py
new file mode 100644
index 0000000..d4a15e3
--- /dev/null
+++ b/python/samples/mgpu_batch_measure.py
@@ -0,0 +1,118 @@
+import sys
+
+import cupy as cp
+import numpy as np
+
+from cuquantum import custatevec as cusv
+from cuquantum import cudaDataType, ComputeType
+
+
+nGlobalBits = 2
+nLocalBits = 2
+nSubSvs = (1 << nGlobalBits)
+subSvSize = (1 << nLocalBits)
+bitStringLen = 2
+bitOrdering = (1, 0)
+
+bitString = np.empty(bitStringLen, dtype=np.int32)
+bitString_result = np.asarray((0, 0), dtype=np.int32)
+
+# In a real application, a random number in the range [0, 1) will be used.
+randnum = 0.72
+
+h_sv = np.asarray([[ 0.000+0.000j,  0.000+0.125j,  0.000+0.250j,  0.000+0.375j],
+                   [ 0.000+0.000j,  0.000-0.125j,  0.000-0.250j,  0.000-0.375j],
+                   [ 0.125+0.000j,  0.125-0.125j,  0.125-0.250j,  0.125-0.375j],
+                   [-0.125+0.000j, -0.125-0.125j, -0.125-0.250j, -0.125-0.375j]],
+                  dtype=np.complex128)
+h_sv_result = np.asarray([[ 0.0     +0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j],
+                          [ 0.0     +0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j],
+                          [ 0.707107+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j],
+                          [-0.707107+0.0j, 0.0+0.0j, 0.0+0.0j, 0.0+0.0j]],
+                         dtype=np.complex128)
+
+# device allocation
+if len(sys.argv) == 1:
+    numDevices = cp.cuda.runtime.getDeviceCount()
+    devices = [i % numDevices for i in range(nSubSvs)]
+else:
+    numDevices = min(len(sys.argv) - 1, nSubSvs)
+    devices = [int(sys.argv[i+1]) for i in range(numDevices)]
+    for i in range(numDevices, nSubSvs):
+        devices.append(devices[i % numDevices])
+
+print("The following devices will be used in this sample:")
+for iSv in range(nSubSvs):
+    print(f"  sub-SV {iSv} : device id {devices[iSv]}")
+
+d_sv = []
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]):
+        d_sv.append(cp.asarray(h_sv[iSv]))
+
+# custatevec handle initialization
+handle = []
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]):
+        handle.append(cusv.create())
+
+# get abs2sum for each sub state vector
+abs2SumArray = np.empty((nSubSvs,), dtype=np.float64)
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]) as dev:
+        cusv.abs2sum_array(
+            handle[iSv], d_sv[iSv].data.ptr, cudaDataType.CUDA_C_64F, nLocalBits,
+            # when sliced into a 0D array, NumPy returns a scalar, so we can't do
+            # abs2SumArray[iSv].ctypes.data and need this workaround
+            abs2SumArray.ctypes.data + iSv * abs2SumArray.dtype.itemsize,
+            0, 0, 0, 0, 0)
+        dev.synchronize()
+
+# get cumulative array
+cumulativeArray = np.zeros((nSubSvs + 1,), dtype=np.float64)
+cumulativeArray[1:] = np.cumsum(abs2SumArray)
+
+# measurement
+for iSv in range(nSubSvs):
+    if cumulativeArray[iSv] <= randnum and randnum < cumulativeArray[iSv + 1]:
+        norm = cumulativeArray[nSubSvs]
+        offset = cumulativeArray[iSv]
+        with cp.cuda.Device(devices[iSv]) as dev:
+            cusv.batch_measure_with_offset(
+                handle[iSv], d_sv[iSv].data.ptr, cudaDataType.CUDA_C_64F, nLocalBits,
+                bitString.ctypes.data, bitOrdering, bitStringLen, randnum,
+                cusv.Collapse.NONE, offset, norm)
+            dev.synchronize()
+
+# get abs2sum after collapse
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]) as dev:
+        cusv.abs2sum_array(
+            handle[iSv], d_sv[iSv].data.ptr, cudaDataType.CUDA_C_64F, nLocalBits,
+            abs2SumArray.ctypes.data + iSv * abs2SumArray.dtype.itemsize, 0, 0,
+            bitString.ctypes.data, bitOrdering, bitStringLen)
+        dev.synchronize()
+
+# get norm after collapse
+norm = np.sum(abs2SumArray, dtype=np.float64)
+
+# collapse sub state vectors
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]) as dev:
+        cusv.collapse_by_bitstring(
+            handle[iSv], d_sv[iSv].data.ptr, cudaDataType.CUDA_C_64F, nLocalBits,
+            bitString.ctypes.data, bitOrdering, bitStringLen, norm)
+        dev.synchronize()
+
+    # destroy handle when done
+    cusv.destroy(handle[iSv])
+
+    h_sv[iSv] = cp.asnumpy(d_sv[iSv])
+
+correct = np.allclose(h_sv, h_sv_result)
+correct &= np.allclose(bitString, bitString_result)
+
+if correct:
+    print("mgpu_batch_measure example PASSED")
+else:
+    raise RuntimeError("mgpu_batch_measure example FAILED: wrong result")
diff --git a/python/samples/mgpu_sampler.py b/python/samples/mgpu_sampler.py
new file mode 100644
index 0000000..d701e7a
--- /dev/null
+++ b/python/samples/mgpu_sampler.py
@@ -0,0 +1,133 @@
+import sys
+
+import numpy as np
+import cupy as cp
+
+import cuquantum
+from cuquantum import custatevec as cusv
+from cuquantum import cudaDataType
+
+
+nGlobalBits = 2
+nLocalBits = 2
+nSubSvs = (1 << nGlobalBits)
+subSvSize = (1 << nLocalBits)
+
+nMaxShots = 5
+nShots = 5
+
+bitStringLen = 4
+bitOrdering = [0, 1, 2, 3]
+
+bitStrings = np.empty(nShots, dtype=np.int64)
+bitStrings_result = np.asarray([0b0011, 0b0011, 0b0111, 0b1011, 0b1110], dtype=np.int64)
+
+# In a real application, random numbers in the range [0, 1) will be used.
+randnums = np.asarray([0.1, 0.2, 0.4, 0.6, 0.8], dtype=np.float64)
+
+h_sv = np.asarray([[ 0.000+0.000j,  0.000+0.125j,  0.000+0.250j,  0.000+0.375j],
+                   [ 0.000+0.000j,  0.000-0.125j,  0.000-0.250j,  0.000-0.375j],
+                   [ 0.125+0.000j,  0.125-0.125j,  0.125-0.250j,  0.125-0.375j],
+                   [-0.125+0.000j, -0.125-0.125j, -0.125-0.250j, -0.125-0.375j]],
+                  dtype=np.complex128)
+
+# device allocation
+if len(sys.argv) == 1:
+    numDevices = cp.cuda.runtime.getDeviceCount()
+    devices = [i % numDevices for i in range(nSubSvs)]
+else:
+    numDevices = min(len(sys.argv) - 1, nSubSvs)
+    devices = [int(sys.argv[i+1]) for i in range(numDevices)]
+    for i in range(numDevices, nSubSvs):
+        devices.append(devices[i % numDevices])
+
+print("The following devices will be used in this sample:")
+for iSv in range(nSubSvs):
+    print(f"  sub-SV {iSv} : device id {devices[iSv]}")
+
+d_sv = []
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]):
+        d_sv.append(cp.asarray(h_sv[iSv]))
+
+# custatevec handle initialization
+handle = []
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]):
+        handle.append(cusv.create())
+
+# create sampler and check the size of external workspace
+sampler = []
+extraWorkspaceSizeInBytes = []
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]) as dev:
+        s, size = cusv.sampler_create(
+            handle[iSv], d_sv[iSv].data.ptr, cudaDataType.CUDA_C_64F, nLocalBits,
+            nMaxShots)
+        sampler.append(s)
+        extraWorkspaceSizeInBytes.append(size)
+
+# allocate external workspace if necessary
+extraWorkspace = []
+for iSv in range(nSubSvs):
+    if extraWorkspaceSizeInBytes[iSv] > 0:
+        with cp.cuda.Device(devices[iSv]) as dev:
+            extraWorkspace.append(cp.cuda.alloc(extraWorkspaceSizeInBytes[iSv]))
+
+# sample preprocess
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]) as dev:
+        cusv.sampler_preprocess(
+            handle[iSv], sampler[iSv], extraWorkspace[iSv].ptr,
+            extraWorkspaceSizeInBytes[iSv])
+
+# get norm of the sub state vectors
+subNorms = []
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]) as dev:
+        subNorms.append(cusv.sampler_get_squared_norm(handle[iSv], sampler[iSv]))
+        dev.synchronize()
+
+# get cumulative array & norm
+cumulativeArray = np.zeros(nSubSvs + 1, dtype=np.float64)
+cumulativeArray[1:] = np.cumsum(subNorms)
+norm = cumulativeArray[nSubSvs]
+
+# apply offset and norm
+for iSv in range(nSubSvs):
+    with cp.cuda.Device(devices[iSv]) as dev:
+        cusv.sampler_apply_sub_sv_offset(
+            handle[iSv], sampler[iSv], iSv, nSubSvs, cumulativeArray[iSv], norm)
+
+# divide randnum array
+shotOffsets = np.zeros(nSubSvs+1, dtype=np.int32)
+pos = np.searchsorted(randnums, cumulativeArray[1:]/norm)
+pos[nSubSvs-1] = nShots
+shotOffsets[1:] = pos
+
+# sample bit strings
+for iSv in range(nSubSvs):
+    shotOffset = int(shotOffsets[iSv])
+    nSubShots = shotOffsets[iSv + 1] - shotOffsets[iSv]
+    if nSubShots > 0:
+        with cp.cuda.Device(devices[iSv]) as dev:
+            cusv.sampler_sample(
+                handle[iSv], sampler[iSv],
+                # when sliced into a 0D array, NumPy returns a scalar, so we can't do
+                # bitStrings[shotOffset].ctypes.data and need this workaround
+                bitStrings.ctypes.data + shotOffset * bitStrings.dtype.itemsize,
+                bitOrdering, bitStringLen,
+                randnums.ctypes.data + shotOffset * randnums.dtype.itemsize,
+                nSubShots, cusv.SamplerOutput.RANDNUM_ORDER)
+
+# destroy sampler descriptor and custatevec handle
+for iSv in range(nSubSvs):
+    cp.cuda.Device(devices[iSv]).synchronize()
+    cusv.sampler_destroy(sampler[iSv])
+    cusv.destroy(handle[iSv])
+
+correct = np.allclose(bitStrings, bitStrings_result)
+if correct:
+    print("mgpu_sampler example PASSED")
+else:
+    raise RuntimeError("mgpu_sampler example FAILED: wrong result")
diff --git a/python/samples/permutation_matrix.py b/python/samples/permutation_matrix.py
index 0486fb1..536561c 100644
--- a/python/samples/permutation_matrix.py
+++ b/python/samples/permutation_matrix.py
@@ -7,13 +7,13 @@
 nIndexBits = 3
 nSvSize = (1 << nIndexBits)
-nBasisBits = 2
-maskLen = 1
+n_targets = 2
+n_controls = 1
 adjoint = 0

-basisBits = [0, 1]
-maskOrdering = [2]
-maskBitString = [1]
+targets = [0, 1]
+controls = [2]
+control_bit_values = [1]
 permutation = np.asarray([0, 2, 1, 3], dtype=np.int64)

 d_sv = cp.asarray([0.0+0.0j, 0.0+0.1j, 0.1+0.1j, 0.1+0.2j,
@@ -28,9 +28,9 @@
 handle = cusv.create()

 # check the size of external workspace
-workspaceSize = cusv.apply_generalized_permutation_matrix_buffer_size(
+workspaceSize = cusv.apply_generalized_permutation_matrix_get_workspace_size(
     handle, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits, permutation.ctypes.data, diagonals.ctypes.data,
-    cuquantum.cudaDataType.CUDA_C_32F, basisBits, nBasisBits, maskLen)
+    cuquantum.cudaDataType.CUDA_C_32F, targets, n_targets, n_controls)
 if workspaceSize > 0:
     workspace = cp.cuda.memory.alloc(workspaceSize)
     workspace_ptr = workspace.ptr
@@ -41,7 +41,7 @@
 cusv.apply_generalized_permutation_matrix(
     handle, d_sv.data.ptr, cuquantum.cudaDataType.CUDA_C_32F, nIndexBits,
     permutation.ctypes.data, diagonals.ctypes.data, cuquantum.cudaDataType.CUDA_C_32F, adjoint,
-    basisBits, nBasisBits, maskBitString, maskOrdering, maskLen,
+    targets, n_targets, controls, control_bit_values, n_controls,
     workspace_ptr, workspaceSize)

 # destroy handle
diff --git a/python/samples/sampler.py b/python/samples/sampler.py
index 851121e..bd425f6 100644
--- a/python/samples/sampler.py
+++ b/python/samples/sampler.py
@@ -45,7 +45,7 @@
     handle, sampler, bitStrings.ctypes.data, bitOrdering.ctypes.data, bitStringLen,
     randnums.ctypes.data, nShots, cusv.SamplerOutput.ASCENDING_ORDER)

-# destroy sampler (only required in Python)
+# destroy sampler
 cusv.sampler_destroy(sampler)

 # destroy handle
diff --git a/python/samples/swap_index_bits.py b/python/samples/swap_index_bits.py
new file mode 100644
index 0000000..931e51b
--- /dev/null
+++ b/python/samples/swap_index_bits.py
@@ -0,0 +1,46 @@
+import cupy as cp
+import numpy as np
+
+from cuquantum import custatevec as cusv
+from cuquantum import cudaDataType, ComputeType
+
+
+nIndexBits = 3
+nSvSize = (1 << nIndexBits)
+
+# swap 0th and 2nd qubits
+nBitSwaps = 1
+bitSwaps = [(0, 2)]
+
+# swap the state vector elements only if the 1st qubit is 1
+maskLen = 1
+maskBitString = [1]
+maskOrdering = [1]
+
+# 0.2|001> + 0.4|011> - 0.4|101> - 0.8|111>
+sv = cp.asarray([0.0+0.0j, 0.2+0.0j, 0.0+0.0j, 0.4+0.0j,
+                 0.0+0.0j, -0.4+0.0j, 0.0+0.0j, -0.8+0.0j],
+                dtype=cp.complex128)
+
+# 0.2|001> + 0.4|110> - 0.4|101> - 0.8|111>
+sv_result = cp.asarray([0.0+0.0j, 0.2+0.0j, 0.0+0.0j, 0.0+0.0j,
+
0.0+0.0j, -0.4+0.0j, 0.4+0.0j, -0.8+0.0j], + dtype=cp.complex128) + +# custatevec handle initialization +handle = cusv.create() + +# bit swap +cusv.swap_index_bits( + handle, sv.data.ptr, cudaDataType.CUDA_C_64F, nIndexBits, + bitSwaps, nBitSwaps, + maskBitString, maskOrdering, maskLen) + +# destroy handle +cusv.destroy(handle) + +correct = cp.allclose(sv, sv_result) +if correct: + print("swap_index_bits example PASSED") +else: + raise RuntimeError("swap_index_bits example FAILED: wrong result") diff --git a/python/samples/tensornet_example.py b/python/samples/tensornet_example.py index df928dc..d59ef15 100644 --- a/python/samples/tensornet_example.py +++ b/python/samples/tensornet_example.py @@ -136,8 +136,20 @@ # Initialize all pair-wise contraction plans (for cuTENSOR) ########################################################### +workDesc = cutn.create_workspace_descriptor(handle) +cutn.workspace_compute_sizes(handle, descNet, optimizerInfo, workDesc) +requiredWorkspaceSize = cutn.workspace_get_size( + handle, workDesc, + cutn.WorksizePref.MIN, + cutn.Memspace.DEVICE) +if worksize < requiredWorkspaceSize: + raise MemoryError("Not enough workspace memory is available.") +cutn.workspace_set( + handle, workDesc, + cutn.Memspace.DEVICE, + work.ptr, worksize) plan = cutn.create_contraction_plan( - handle, descNet, optimizerInfo, worksize) + handle, descNet, optimizerInfo, workDesc) ################################################################################### # Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel @@ -145,10 +157,10 @@ pref = cutn.create_contraction_autotune_preference(handle) -# may be 0 +numAutotuningIterations = 5 # may be 0 n_iter_dtype = cutn.contraction_autotune_preference_get_attribute_dtype( cutn.ContractionAutotunePreferenceAttribute.MAX_ITERATIONS) -numAutotuningIterations = np.asarray([5], dtype=n_iter_dtype) +numAutotuningIterations = np.asarray([numAutotuningIterations], dtype=n_iter_dtype) cutn.contraction_autotune_preference_set_attribute( handle, pref, cutn.ContractionAutotunePreferenceAttribute.MAX_ITERATIONS, @@ -157,7 +169,7 @@ # modify the plan again to find the best pair-wise contractions cutn.contraction_autotune( handle, plan, rawDataIn_d, D_d.data.ptr, - work.ptr, worksize, pref, stream.ptr) + workDesc, pref, stream.ptr) cutn.destroy_contraction_autotune_preference(pref) @@ -182,7 +194,7 @@ e1.record() cutn.contraction( handle, plan, rawDataIn_d, D_d.data.ptr, - work.ptr, worksize, sliceId, stream.ptr) + workDesc, sliceId, stream.ptr) e2.record() # Synchronize and measure timing @@ -210,6 +222,7 @@ cutn.destroy_contraction_optimizer_info(optimizerInfo) cutn.destroy_contraction_optimizer_config(optimizerConfig) cutn.destroy_network_descriptor(descNet) +cutn.destroy_workspace_descriptor(workDesc) cutn.destroy(handle) print("Free resource and exit.") diff --git a/python/samples/test_matrix_type.py b/python/samples/test_matrix_type.py new file mode 100644 index 0000000..19eeed1 --- /dev/null +++ b/python/samples/test_matrix_type.py @@ -0,0 +1,63 @@ +import cupy as cp +import numpy as np + +from cuquantum import custatevec as cusv +from cuquantum import cudaDataType, ComputeType + + +def run_test_matrix_type( + handle, matrixType, matrix, matrixDataType, layout, nTargets, + adjoint, computeType): + # check the size of external workspace + extraWorkspaceSizeInBytes = cusv.test_matrix_type_get_workspace_size( + handle, matrixType, matrix.ctypes.data, matrixDataType, layout, + nTargets, adjoint, computeType) + + # allocate 
external workspace if necessary + if extraWorkspaceSizeInBytes > 0: + extraWorkspace = cp.cuda.alloc(extraWorkspaceSizeInBytes) + extraWorkspacePtr = extraWorkspace.ptr + else: + extraWorkspacePtr = 0 + + # execute testing + residualNorm = cusv.test_matrix_type( + handle, matrixType, matrix.ctypes.data, matrixDataType, layout, + nTargets, adjoint, computeType, extraWorkspacePtr, extraWorkspaceSizeInBytes) + + cp.cuda.Device().synchronize() + + return residualNorm + + +if __name__ == '__main__': + nTargets = 1 + adjoint = 0 + + # unitary and Hermitian matrix + matrix = np.asarray([0.5+0.0j, 1/np.sqrt(2)-0.5j, + 1/np.sqrt(2)+0.5j, -0.5+0.0j], dtype=np.complex128) + + # custatevec handle initialization + handle = cusv.create() + + matrixDataType = cudaDataType.CUDA_C_64F + layout = cusv.MatrixLayout.ROW + computeType = ComputeType.COMPUTE_DEFAULT + + unitaryResidualNorm = run_test_matrix_type(handle, cusv.MatrixType.UNITARY, matrix, + matrixDataType, layout, nTargets, adjoint, + computeType) + hermiteResidualNorm = run_test_matrix_type(handle, cusv.MatrixType.HERMITIAN, matrix, + matrixDataType, layout, nTargets, adjoint, + computeType) + + # destroy handle + cusv.destroy(handle) + + correct = np.allclose(unitaryResidualNorm, 0.) + correct &= np.allclose(hermiteResidualNorm, 0.) + if correct: + print("test_matrix_type example PASSED") + else: + raise RuntimeError("test_matrix_type example FAILED: wrong result") diff --git a/python/setup.py b/python/setup.py index 2b59615..580fe6b 100644 --- a/python/setup.py +++ b/python/setup.py @@ -3,6 +3,7 @@ import subprocess import sys +from packaging.version import Version from setuptools import setup, Extension, find_packages from Cython.Build import cythonize @@ -13,6 +14,13 @@ exec(f.read()) +# set up version constraints: note that CalVer like 22.03 is normalized to +# 22.3 by setuptools, so we must follow the same practice in the constraints; +# also, we don't need the Python patch number here +cuqnt_py_ver = Version(__version__) +cuqnt_ver_major_minor = f"{cuqnt_py_ver.major}.{cuqnt_py_ver.minor}" + + # search order: # 1. installed "cuquantum" package # 2. 
env var @@ -75,8 +83,10 @@ raise RuntimeError('CUDA is not found, please set $CUDA_PATH') from e +# TODO: use setup.cfg and/or pyproject.toml setup_requires = [ 'Cython>=0.29.22,<3', + 'packaging', ] install_requires = [ 'numpy', @@ -88,9 +98,8 @@ assert using_cuquantum_wheel # if this raises, the env is corrupted # cuTENSOR version is constrained in the cuquantum package, so we don't # need to list it - ver = '.'.join(__version__.split('.')[:3]) # remove the Python patch number - setup_requires.append('cuquantum=='+ver+'.*') - install_requires.append('cuquantum=='+ver+'.*') + setup_requires.append(f'cuquantum=={cuqnt_ver_major_minor}.*') + install_requires.append(f'cuquantum=={cuqnt_ver_major_minor}.*') def check_cuda_version(): @@ -135,8 +144,8 @@ def prepare_libs_and_rpaths(): global cusv_lib, cutn_lib, extra_linker_flags if using_cuquantum_wheel: - cusv_lib = [':libcustatevec.so.0'] - cutn_lib = [':libcutensornet.so.0', ':libcutensor.so.1'] + cusv_lib = [':libcustatevec.so.1'] + cutn_lib = [':libcutensornet.so.1', ':libcutensor.so.1'] # The rpaths must be adjusted given the following full-wheel installation: # cuquantum-python: site-packages/cuquantum/{custatevec, cutensornet}/ [=$ORIGIN] # cusv & cutn: site-packages/cuquantum/lib/ @@ -193,7 +202,7 @@ def prepare_libs_and_rpaths(): license="BSD-3-Clause", license_files = ('LICENSE',), classifiers=[ - "Development Status :: 4 - Beta", + "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: BSD License", "Operating System :: POSIX :: Linux", "Topic :: Education", @@ -202,12 +211,16 @@ def prepare_libs_and_rpaths(): "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", "Programming Language :: Python :: Implementation :: CPython", "Environment :: GPU :: NVIDIA CUDA", + "Environment :: GPU :: NVIDIA CUDA :: 11.0", + "Environment :: GPU :: NVIDIA CUDA :: 11.1", "Environment :: GPU :: NVIDIA CUDA :: 11.2", "Environment :: GPU :: NVIDIA CUDA :: 11.3", "Environment :: GPU :: NVIDIA CUDA :: 11.4", "Environment :: GPU :: NVIDIA CUDA :: 11.5", + #"Environment :: GPU :: NVIDIA CUDA :: 11.6", # PyPI has not added it yet ], ext_modules=cythonize([ custatevec, @@ -217,10 +230,12 @@ def prepare_libs_and_rpaths(): packages=find_packages(include=['cuquantum', 'cuquantum.*']), package_data={"": ["*.pxd", "*.pyx", "*.py"],}, zip_safe=False, + python_requires='>=3.7', setup_requires=setup_requires, install_requires=install_requires, tests_require=install_requires + [ # pytest < 6.2 is slow in collecting tests 'pytest>=6.2', + #'cffi>=1.0.0', # optional ] ) diff --git a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py index a86c4ea..859005d 100644 --- a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py +++ b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py @@ -1,3 +1,11 @@ +import copy +import os +import tempfile + +try: + import cffi +except ImportError: + cffi = None import cupy from cupy import testing import numpy @@ -5,7 +13,7 @@ import cuquantum from cuquantum import ComputeType, cudaDataType -from cuquantum import custatevec +from cuquantum import custatevec as cusv ################################################################### @@ -31,9 +39,9 @@ @pytest.fixture() def handle(): - h = custatevec.create() + h = cusv.create() yield h - custatevec.destroy(h) + cusv.destroy(h) 
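+# (A note on the setup.py hunk earlier in this patch: packaging normalizes
+# CalVer release segments per PEP 440, so "22.03" and "22.3" denote the same
+# version, with integer parts major=22 and minor=3. A quick self-contained
+# check, assuming only that the packaging package is installed:
+#
+#     from packaging.version import Version
+#     v = Version("22.03")
+#     assert v == Version("22.3")           # leading zeros are normalized away
+#     assert (v.major, v.minor) == (22, 3)  # hence the f"{major}.{minor}.*" pin
+# )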
@testing.parameterize(*testing.product({ @@ -48,6 +56,76 @@ def get_sv(self): arr[0] = 1 # initialize in |000...00> return arr + # TODO: make this a static method + def _return_data(self, data, name, dtype, return_value): + if return_value == 'int': + if len(data) == 0: + # empty, give it a NULL + return 0, 0 + else: + # return int as void* + data = numpy.asarray(data, dtype=dtype) + setattr(self, name, data) # keep data alive + return data.ctypes.data, data.size + elif return_value == 'seq': + # data itself is already a flat sequence + return data, len(data) + else: + assert False + + +@pytest.fixture() +def multi_gpu_handles(): + # TODO: consider making this class more flexible + # (ex: arbitrary number of qubits and/or devices, etc) + n_devices = 2 # should be power of 2 + handles = [] + + for dev in range(n_devices): + with cupy.cuda.Device(dev): + h = cusv.create() + handles.append(h) + yield handles + for dev in range(n_devices): + with cupy.cuda.Device(dev): + h = handles.pop(0) + cusv.destroy(h) + + +def get_exponent(n): + assert (n % 2) == 0 + exponent = 1 + while True: + out = n >> exponent + if out != 1: + exponent += 1 + else: + break + return exponent + + +@testing.parameterize(*testing.product({ + 'n_qubits': (4,), + 'dtype': (numpy.complex64, numpy.complex128), +})) +class TestMultiGpuSV: + # TODO: consider making this class more flexible + # (ex: arbitrary number of qubits and/or devices, etc) + n_devices = 2 # should be power of 2 + + def get_sv(self): + self.n_global_bits = get_exponent(self.n_devices) + self.n_local_bits = self.n_qubits - self.n_global_bits + + self.sub_sv = [] + for dev in range(self.n_devices): + with cupy.cuda.Device(dev): + self.sub_sv.append(cupy.zeros( + 2**self.n_local_bits, dtype=self.dtype)) + self.sub_sv[0][0] = 1 # initialize in |000...00> + return self.sub_sv + + # TODO: make this a static method def _return_data(self, data, name, dtype, return_value): if return_value == 'int': if len(data) == 0: @@ -68,21 +146,170 @@ def _return_data(self, data, name, dtype, return_value): class TestLibHelper: def test_get_version(self): - ver = custatevec.get_version() - assert ver == (custatevec.MAJOR_VER * 1000 - + custatevec.MINOR_VER * 100 - + custatevec.PATCH_VER) - assert ver == custatevec.VERSION + ver = cusv.get_version() + assert ver == (cusv.MAJOR_VER * 1000 + + cusv.MINOR_VER * 100 + + cusv.PATCH_VER) + assert ver == cusv.VERSION def test_get_property(self): - assert custatevec.MAJOR_VER == custatevec.get_property( + assert cusv.MAJOR_VER == cusv.get_property( cuquantum.libraryPropertyType.MAJOR_VERSION) - assert custatevec.MINOR_VER == custatevec.get_property( + assert cusv.MINOR_VER == cusv.get_property( cuquantum.libraryPropertyType.MINOR_VERSION) - assert custatevec.PATCH_VER == custatevec.get_property( + assert cusv.PATCH_VER == cusv.get_property( cuquantum.libraryPropertyType.PATCH_LEVEL) +# we don't wanna recompile for every test case... 
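+# (One more note on the TestMultiGpuSV fixtures above, before the cffi helpers
+# below: with 2**k devices, the top k "global" index bits select the
+# sub-state-vector and the remaining bits address amplitudes inside it. For a
+# power-of-two n, get_exponent(n) is simply n.bit_length() - 1; with the
+# values used above:
+#
+#     n_devices, n_qubits = 2, 4
+#     n_global_bits = n_devices.bit_length() - 1  # == get_exponent(2) == 1
+#     n_local_bits = n_qubits - n_global_bits     # == 3
+#     assert 2**n_global_bits * 2**n_local_bits == 2**n_qubits  # 2 sub-SVs x 8 amplitudes
+# )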
+_cffi_mod1 = None
+_cffi_mod2 = None
+
+def _can_use_cffi():
+    if cffi is None or os.environ.get('CUDA_PATH') is None:
+        return False
+    else:
+        return True
+
+
+class MemoryResourceFactory:
+
+    def __init__(self, source, name=None):
+        self.source = source
+        self.name = source if name is None else name
+
+    def get_dev_mem_handler(self):
+        if self.source == "py-callable":
+            return (*self._get_cuda_callable(), self.name)
+        elif self.source == "cffi":
+            # ctx is not needed, so set to NULL
+            return (0, *self._get_functor_address(), self.name)
+        elif self.source == "cffi_struct":
+            return self._get_handler_address()
+        # TODO: add more different memory sources
+        else:
+            raise NotImplementedError
+
+    def _get_cuda_callable(self):
+        def alloc(size, stream):
+            return cupy.cuda.runtime.mallocAsync(size, stream)
+
+        def free(ptr, size, stream):
+            cupy.cuda.runtime.freeAsync(ptr, stream)
+
+        return alloc, free
+
+    def _get_functor_address(self):
+        if not _can_use_cffi():
+            raise RuntimeError
+
+        global _cffi_mod1
+        if _cffi_mod1 is None:
+            import importlib
+            mod_name = f"cusv_test_{self.source}"
+            ffi = cffi.FFI()
+            ffi.set_source(mod_name, """
+                #include <cuda_runtime.h>
+
+                // cffi limitation: we can't use the actual type cudaStream_t because
+                // it's considered an "incomplete" type and we can't get the functor
+                // address by doing so...
+
+                int my_alloc(void* ctx, void** ptr, size_t size, void* stream) {
+                    return (int)cudaMallocAsync(ptr, size, stream);
+                }
+
+                int my_free(void* ctx, void* ptr, size_t size, void* stream) {
+                    return (int)cudaFreeAsync(ptr, stream);
+                }
+                """,
+                include_dirs=[os.environ['CUDA_PATH']+'/include'],
+                library_dirs=[os.environ['CUDA_PATH']+'/lib64'],
+                libraries=['cudart'],
+            )
+            ffi.cdef("""
+                int my_alloc(void* ctx, void** ptr, size_t size, void* stream);
+                int my_free(void* ctx, void* ptr, size_t size, void* stream);
+            """)
+            ffi.compile(verbose=True)
+            self.ffi = ffi
+            _cffi_mod1 = importlib.import_module(mod_name)
+        self.ffi_mod = _cffi_mod1
+
+        alloc_addr = self._get_address("my_alloc")
+        free_addr = self._get_address("my_free")
+        return alloc_addr, free_addr
+
+    def _get_handler_address(self):
+        if not _can_use_cffi():
+            raise RuntimeError
+
+        global _cffi_mod2
+        if _cffi_mod2 is None:
+            import importlib
+            mod_name = f"cusv_test_{self.source}"
+            ffi = cffi.FFI()
+            ffi.set_source(mod_name, """
+                #include <cuda_runtime.h>
+
+                // cffi limitation: we can't use the actual type cudaStream_t because
+                // it's considered an "incomplete" type and we can't get the functor
+                // address by doing so...
+
+                int my_alloc(void* ctx, void** ptr, size_t size, void* stream) {
+                    return (int)cudaMallocAsync(ptr, size, stream);
+                }
+
+                int my_free(void* ctx, void* ptr, size_t size, void* stream) {
+                    return (int)cudaFreeAsync(ptr, stream);
+                }
+
+                typedef struct {
+                    void* ctx;
+                    int (*device_alloc)(void* ctx, void** ptr, size_t size, void* stream);
+                    int (*device_free)(void* ctx, void* ptr, size_t size, void* stream);
+                    char name[64];
+                } myHandler;
+
+                myHandler* init_myHandler(myHandler* h, const char* name) {
+                    h->ctx = NULL;
+                    h->device_alloc = my_alloc;
+                    h->device_free = my_free;
+                    memcpy(h->name, name, 64);
+                    return h;
+                }
+                """,
+                include_dirs=[os.environ['CUDA_PATH']+'/include'],
+                library_dirs=[os.environ['CUDA_PATH']+'/lib64'],
+                libraries=['cudart'],
+            )
+            ffi.cdef("""
+                typedef struct {
+                    ...;
+                } myHandler;
+
+                myHandler* init_myHandler(myHandler* h, const char* name);
+            """)
+            ffi.compile(verbose=True)
+            self.ffi = ffi
+            _cffi_mod2 = importlib.import_module(mod_name)
+        self.ffi_mod = _cffi_mod2
+
+        h = self.handler = self.ffi_mod.ffi.new("myHandler*")
+        self.ffi_mod.lib.init_myHandler(h, self.name.encode())
+        return self._get_address(h)
+
+    def _get_address(self, func_name_or_ptr):
+        if isinstance(func_name_or_ptr, str):
+            func_name = func_name_or_ptr
+            data = str(self.ffi_mod.ffi.addressof(self.ffi_mod.lib, func_name))
+        else:
+            ptr = func_name_or_ptr  # ptr to struct
+            data = str(self.ffi_mod.ffi.addressof(ptr[0]))
+        # data has this format: "<cdata 'int(*)(...)' 0x...>"
+        return int(data.split()[-1][:-1], base=16)
+
+
 class TestHandle:
 
     def test_handle_create_destroy(self, handle):
@@ -90,23 +317,23 @@ def test_handle_create_destroy(self, handle):
         pass
 
     def test_workspace(self, handle):
-        default_workspace_size = custatevec.get_default_workspace_size(handle)
+        default_workspace_size = cusv.get_default_workspace_size(handle)
        # this is about 18MB as of cuQuantum beta 1
        assert default_workspace_size > 0

        # cuStateVec does not like a smaller workspace...
size = 24*1024**2 assert size > default_workspace_size memptr = cupy.cuda.alloc(size) - custatevec.set_workspace(handle, memptr.ptr, size) # should not fail + cusv.set_workspace(handle, memptr.ptr, size) # should not fail def test_stream(self, handle): # default is on the null stream - assert 0 == custatevec.get_stream(handle) + assert 0 == cusv.get_stream(handle) # simple set/get round-trip stream = cupy.cuda.Stream() - custatevec.set_stream(handle, stream.ptr) - assert stream.ptr == custatevec.get_stream(handle) + cusv.set_stream(handle, stream.ptr) + assert stream.ptr == cusv.get_stream(handle) class TestAbs2Sum(TestSV): @@ -125,21 +352,21 @@ def test_abs2sum_on_z_basis(self, handle, input_form): data_type = dtype_to_data_type[sv.dtype] # case 1: both are computed - sum0, sum1 = custatevec.abs2sum_on_z_basis( + sum0, sum1 = cusv.abs2sum_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, True, True, basis_bits, basis_bits_len) assert numpy.allclose(sum0+sum1, 1) assert (sum0 is not None) and (sum1 is not None) # case 2: only sum0 is computed - sum0, sum1 = custatevec.abs2sum_on_z_basis( + sum0, sum1 = cusv.abs2sum_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, True, False, basis_bits, basis_bits_len) assert numpy.allclose(sum0, 1) assert (sum0 is not None) and (sum1 is None) # case 3: only sum1 is computed - sum0, sum1 = custatevec.abs2sum_on_z_basis( + sum0, sum1 = cusv.abs2sum_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, False, True, basis_bits, basis_bits_len) assert numpy.allclose(sum1, 0) @@ -147,7 +374,7 @@ def test_abs2sum_on_z_basis(self, handle, input_form): # case 4: none is computed with pytest.raises(ValueError): - sum0, sum1 = custatevec.abs2sum_on_z_basis( + sum0, sum1 = cusv.abs2sum_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, False, False, basis_bits, basis_bits_len) @@ -172,9 +399,9 @@ def test_abs2sum_array_no_mask(self, handle, xp, input_form): bit_ordering, bit_ordering_len = self._return_data( bit_ordering, 'bit_ordering', *input_form['bit_ordering']) # test abs2sum on both host and device - abs2sum = xp.empty((2**bit_ordering_len,), dtype=xp.float64) + abs2sum = xp.zeros((2**bit_ordering_len,), dtype=xp.float64) abs2sum_ptr = abs2sum.data.ptr if xp is cupy else abs2sum.ctypes.data - custatevec.abs2sum_array( + cusv.abs2sum_array( handle, sv.data.ptr, data_type, self.n_qubits, abs2sum_ptr, bit_ordering, bit_ordering_len, 0, 0, 0) assert xp.allclose(abs2sum.sum(), 1) @@ -202,7 +429,7 @@ def test_collapse_on_z_basis(self, handle, parity, input_form): basis_bits, 'basis_bits', *input_form['basis_bits']) data_type = dtype_to_data_type[sv.dtype] - custatevec.collapse_on_z_basis( + cusv.collapse_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, parity, basis_bits, basis_bits_len, 1) @@ -235,7 +462,7 @@ def test_collapse_by_bitstring(self, handle, input_form): norm = 0.5 # the sv after collapse is normalized as sv -> sv / \sqrt{norm} - custatevec.collapse_by_bitstring( + cusv.collapse_by_bitstring( handle, sv.data.ptr, data_type, self.n_qubits, bitstring, bit_ordering, bitstring_len, norm) @@ -250,7 +477,7 @@ def test_collapse_by_bitstring(self, handle, input_form): ) @pytest.mark.parametrize( 'collapse', - (custatevec.Collapse.NORMALIZE_AND_ZERO, custatevec.Collapse.NONE) + (cusv.Collapse.NORMALIZE_AND_ZERO, cusv.Collapse.NONE) ) class TestMeasure(TestSV): @@ -273,11 +500,11 @@ def test_measure_on_z_basis(self, handle, rand, collapse, input_form): data_type = dtype_to_data_type[sv.dtype] orig_sv = sv.copy() - parity 
= custatevec.measure_on_z_basis( + parity = cusv.measure_on_z_basis( handle, sv.data.ptr, data_type, self.n_qubits, basis_bits, basis_bits_len, rand, collapse) - if collapse == custatevec.Collapse.NORMALIZE_AND_ZERO: + if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: if parity == 0: # collapse to |000> assert cupy.allclose(sv[0], 1) @@ -309,12 +536,12 @@ def test_batch_measure(self, handle, rand, collapse, input_form): bit_ordering, _ = self._return_data( bit_ordering, 'bit_ordering', *input_form['bit_ordering']) - custatevec.batch_measure( + cusv.batch_measure( handle, sv.data.ptr, data_type, self.n_qubits, bitstring.ctypes.data, bit_ordering, bitstring.size, rand, collapse) - if collapse == custatevec.Collapse.NORMALIZE_AND_ZERO: + if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: if bitstring.sum() == 0: # collapse to |000> assert cupy.allclose(sv[0], 1) @@ -342,7 +569,7 @@ class TestApply(TestSV): 'paulis': (numpy.int32, 'seq'),}, ) ) - def test_apply_exp(self, handle, input_form): + def test_apply_pauli_rotation(self, handle, input_form): # change sv to |100> sv = self.get_sv() sv[0] = 0 @@ -356,11 +583,11 @@ def test_apply_exp(self, handle, input_form): controls, controls_len = self._return_data( controls, 'controls', *input_form['controls']) control_values = 0 # set all control bits to 1 - paulis = [custatevec.Pauli.X, custatevec.Pauli.X] + paulis = [cusv.Pauli.X, cusv.Pauli.X] paulis, _ = self._return_data( paulis, 'paulis', *input_form['paulis']) - custatevec.apply_exp( + cusv.apply_pauli_rotation( handle, sv.data.ptr, data_type, self.n_qubits, 0.5*numpy.pi, paulis, targets, targets_len, @@ -370,19 +597,23 @@ def test_apply_exp(self, handle, input_form): # result is |111> assert cupy.allclose(sv[-1], 1) + @pytest.mark.parametrize( + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') + ) @pytest.mark.parametrize( 'input_form', ( - {'targets': (numpy.int32, 'int'), 'controls': (numpy.int32, 'int'), - # sizeof(enum) == sizeof(int) - 'paulis': (numpy.int32, 'int'),}, - {'targets': (numpy.int32, 'seq'), 'controls': (numpy.int32, 'seq'), - 'paulis': (numpy.int32, 'seq'),}, + {'targets': (numpy.int32, 'int'), 'controls': (numpy.int32, 'int')}, + {'targets': (numpy.int32, 'seq'), 'controls': (numpy.int32, 'seq')}, ) ) @pytest.mark.parametrize( 'xp', (numpy, cupy) ) - def test_apply_matrix(self, handle, xp, input_form): + def test_apply_matrix(self, handle, xp, input_form, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + sv = self.get_sv() data_type = dtype_to_data_type[sv.dtype] compute_type = dtype_to_compute_type[sv.dtype] @@ -398,26 +629,37 @@ def test_apply_matrix(self, handle, xp, input_form): matrix[-1][0] = 1 matrix_ptr = matrix.ctypes.data if xp is numpy else matrix.data.ptr - workspace_size = custatevec.apply_matrix_buffer_size( - handle, data_type, self.n_qubits, - matrix_ptr, data_type, custatevec.MatrixLayout.ROW, 0, - targets_len, controls_len, compute_type) - if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) - workspace_ptr = workspace.ptr + if mempool is None: + workspace_size = cusv.apply_matrix_get_workspace_size( + handle, data_type, self.n_qubits, + matrix_ptr, data_type, cusv.MatrixLayout.ROW, 0, + targets_len, controls_len, compute_type) + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + 
cusv.set_device_mem_handler(handle, handler) + workspace_ptr = 0 + workspace_size = 0 - custatevec.apply_matrix( + cusv.apply_matrix( handle, sv.data.ptr, data_type, self.n_qubits, - matrix_ptr, data_type, custatevec.MatrixLayout.ROW, 0, + matrix_ptr, data_type, cusv.MatrixLayout.ROW, 0, targets, targets_len, - controls, controls_len, 0, + controls, 0, controls_len, compute_type, workspace_ptr, workspace_size) assert sv[-1] == 1 # output state is |111> + @pytest.mark.parametrize( + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') + ) @pytest.mark.parametrize( 'input_form', ( {'permutation': (numpy.int64, 'int'), 'basis_bits': (numpy.int32, 'int'), @@ -429,7 +671,12 @@ def test_apply_matrix(self, handle, xp, input_form): @pytest.mark.parametrize( 'xp', (numpy, cupy) ) - def test_apply_generalized_permutation_matrix(self, handle, xp, input_form): + def test_apply_generalized_permutation_matrix( + self, handle, xp, input_form, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + sv = self.get_sv() sv[:] = 1 # invalid sv just to make math checking easier data_type = dtype_to_data_type[sv.dtype] @@ -454,18 +701,26 @@ def test_apply_generalized_permutation_matrix(self, handle, xp, input_form): mask_ordering = 0 mask_len = 0 - workspace_size = custatevec.apply_generalized_permutation_matrix_buffer_size( - handle, data_type, self.n_qubits, - permutation, diagonal_ptr, data_type, - basis_bits, basis_bits_len, mask_len) + if mempool is None: + workspace_size = cusv.apply_generalized_permutation_matrix_get_workspace_size( + handle, data_type, self.n_qubits, + permutation, diagonal_ptr, data_type, + basis_bits, basis_bits_len, mask_len) - if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) - workspace_ptr = workspace.ptr + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + workspace_ptr = 0 + workspace_size = 0 - custatevec.apply_generalized_permutation_matrix( + cusv.apply_generalized_permutation_matrix( handle, sv.data.ptr, data_type, self.n_qubits, permutation, diagonal_ptr, data_type, 0, basis_bits, basis_bits_len, @@ -477,6 +732,9 @@ def test_apply_generalized_permutation_matrix(self, handle, xp, input_form): class TestExpect(TestSV): + @pytest.mark.parametrize( + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') + ) @pytest.mark.parametrize( 'input_form', ( {'basis_bits': (numpy.int32, 'int'),}, @@ -489,7 +747,11 @@ class TestExpect(TestSV): @pytest.mark.parametrize( 'xp', (numpy, cupy) ) - def test_expectation(self, handle, xp, expect_dtype, input_form): + def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + # create a uniform sv sv = self.get_sv() sv[:] = numpy.sqrt(1/(2**self.n_qubits)) @@ -504,15 +766,23 @@ def test_expectation(self, handle, xp, expect_dtype, input_form): matrix = xp.ones((2**self.n_qubits, 2**self.n_qubits), dtype=sv.dtype) matrix_ptr = matrix.ctypes.data if xp is numpy else matrix.data.ptr - workspace_size = custatevec.expectation_buffer_size( - handle, data_type, self.n_qubits, - matrix_ptr, data_type, custatevec.MatrixLayout.ROW, - basis_bits_len, compute_type) - if workspace_size: - workspace = 
cupy.cuda.alloc(workspace_size) - workspace_ptr = workspace.ptr + if mempool is None: + workspace_size = cusv.compute_expectation_get_workspace_size( + handle, data_type, self.n_qubits, + matrix_ptr, data_type, cusv.MatrixLayout.ROW, + basis_bits_len, compute_type) + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + workspace_ptr = 0 + workspace_size = 0 expect = numpy.empty((1,), dtype=expect_dtype) # TODO(leofang): check if this is relaxed in beta 2 @@ -520,25 +790,58 @@ def test_expectation(self, handle, xp, expect_dtype, input_form): cudaDataType.CUDA_R_64F if expect_dtype == numpy.float64 else cudaDataType.CUDA_C_64F) - custatevec.expectation( + cusv.compute_expectation( handle, sv.data.ptr, data_type, self.n_qubits, expect.ctypes.data, expect_data_type, - matrix_ptr, data_type, custatevec.MatrixLayout.ROW, + matrix_ptr, data_type, cusv.MatrixLayout.ROW, basis_bits, basis_bits_len, compute_type, workspace_ptr, workspace_size) assert xp.allclose(expect, 2**self.n_qubits) + # TODO: test other input forms? + def test_compute_expectations_on_pauli_basis(self, handle): + # create a uniform sv + sv = self.get_sv() + sv[:] = numpy.sqrt(1/(2**self.n_qubits)) + data_type = dtype_to_data_type[sv.dtype] + compute_type = dtype_to_compute_type[sv.dtype] + + # measure XX...X, YY..Y, ZZ...Z + paulis = [[cusv.Pauli.X for i in range(self.n_qubits)], + [cusv.Pauli.Y for i in range(self.n_qubits)], + [cusv.Pauli.Z for i in range(self.n_qubits)],] + + basis_bits = [[*range(self.n_qubits)] for i in range(len(paulis))] + n_basis_bits = [len(basis_bits[i]) for i in range(len(paulis))] + expect = numpy.empty((len(paulis),), dtype=numpy.float64) + + cusv.compute_expectations_on_pauli_basis( + handle, sv.data.ptr, data_type, self.n_qubits, + expect.ctypes.data, paulis, len(paulis), + basis_bits, n_basis_bits) + + result = numpy.zeros_like(expect) + result[0] = 1 # for XX...X + assert numpy.allclose(expect, result) + class TestSampler(TestSV): + @pytest.mark.parametrize( + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') + ) @pytest.mark.parametrize( 'input_form', ( {'bit_ordering': (numpy.int32, 'int'),}, {'bit_ordering': (numpy.int32, 'seq'),}, ) ) - def test_sampling(self, handle, input_form): + def test_sampling(self, handle, input_form, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + # create a uniform sv sv = self.get_sv() sv[:] = numpy.sqrt(1/(2**self.n_qubits)) @@ -554,25 +857,39 @@ def test_sampling(self, handle, input_form): bit_ordering, _ = self._return_data( bit_ordering, 'bit_ordering', *input_form['bit_ordering']) - sampler, workspace_size = custatevec.sampler_create( + sampler, workspace_size = cusv.sampler_create( handle, sv.data.ptr, data_type, self.n_qubits, shots) - if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) - workspace_ptr = workspace.ptr + if mempool is None: + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + workspace_ptr = 0 + workspace_size = 0 try: - custatevec.sampler_preprocess( + cusv.sampler_preprocess( handle, sampler, workspace_ptr, workspace_size) - 
custatevec.sampler_sample( + cusv.sampler_sample( handle, sampler, bitstrings.ctypes.data, bit_ordering, self.n_qubits, rand_nums.ctypes.data, shots, - custatevec.SamplerOutput.RANDNUM_ORDER) + cusv.SamplerOutput.RANDNUM_ORDER) + norm = cusv.sampler_get_squared_norm(handle, sampler) + + # TODO: add a multi-GPU test for this API + # We're being sloppy here by checking a trivial case, which is + # effectively a no-op. This is just a call check. + cusv.sampler_apply_sub_sv_offset( + handle, sampler, 0, 1, 0, norm) finally: - # This is Python-only API. Need finally to ensure it's freed. - custatevec.sampler_destroy(sampler) + cusv.sampler_destroy(sampler) keys, counts = numpy.unique(bitstrings, return_counts=True) # keys are the returned bitstrings 000, 001, ..., 111 @@ -580,9 +897,14 @@ def test_sampling(self, handle, input_form): # so the following should hold: assert (keys == numpy.arange(2**self.n_qubits)).all() + assert numpy.allclose(norm, 1) + # TODO: test counts, which should follow a uniform distribution +@pytest.mark.parametrize( + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') +) # TODO(leofang): test mask_bitstring & mask_ordering @pytest.mark.parametrize( 'input_form', ( @@ -595,7 +917,11 @@ def test_sampling(self, handle, input_form): ) class TestAccessor(TestSV): - def test_accessor_get(self, handle, input_form, readonly): + def test_accessor_get(self, handle, readonly, input_form, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + # create a monotonically increasing sv sv = self.get_sv() data = cupy.arange(2**self.n_qubits, dtype=sv.dtype) @@ -615,9 +941,9 @@ def test_accessor_get(self, handle, input_form, readonly): mask_len = 0 if readonly: - accessor_create = custatevec.accessor_create_readonly + accessor_create = cusv.accessor_create_view else: - accessor_create = custatevec.accessor_create + accessor_create = cusv.accessor_create accessor, workspace_size = accessor_create( handle, sv.data.ptr, data_type, self.n_qubits, @@ -625,24 +951,39 @@ def test_accessor_get(self, handle, input_form, readonly): mask_bitstring, mask_ordering, mask_len) try: - if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) - custatevec.accessor_set_extra_workspace( - handle, accessor, workspace.ptr, workspace_size) + if mempool is None: + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + + workspace_ptr = 0 + workspace_size = 0 + + cusv.accessor_set_extra_workspace( + handle, accessor, workspace_ptr, workspace_size) buf_len = 2**2 buf = cupy.empty(buf_len, dtype=sv.dtype) # copy the last buf_len elements - custatevec.accessor_get( + cusv.accessor_get( handle, accessor, buf.data.ptr, sv.size-1-buf_len, sv.size-1) finally: - # This is Python-only API. Need finally to ensure it's freed. 
- custatevec.accessor_destroy(accessor) + cusv.accessor_destroy(accessor) assert (sv[sv.size-1-buf_len: sv.size-1] == buf).all() - def test_accessor_set(self, handle, input_form, readonly): + def test_accessor_set(self, handle, readonly, input_form, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + # create a monotonically increasing sv sv = self.get_sv() data = cupy.arange(2**self.n_qubits, dtype=sv.dtype) @@ -662,9 +1003,9 @@ def test_accessor_set(self, handle, input_form, readonly): mask_len = 0 if readonly: - accessor_create = custatevec.accessor_create_readonly + accessor_create = cusv.accessor_create_view else: - accessor_create = custatevec.accessor_create + accessor_create = cusv.accessor_create accessor, workspace_size = accessor_create( handle, sv.data.ptr, data_type, self.n_qubits, @@ -672,29 +1013,321 @@ def test_accessor_set(self, handle, input_form, readonly): mask_bitstring, mask_ordering, mask_len) try: - if workspace_size: - workspace = cupy.cuda.alloc(workspace_size) - custatevec.accessor_set_extra_workspace( - handle, accessor, workspace.ptr, workspace_size) + if mempool is None: + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + + workspace_ptr = 0 + workspace_size = 0 + + cusv.accessor_set_extra_workspace( + handle, accessor, workspace_ptr, workspace_size) buf_len = 2**2 buf = cupy.zeros(buf_len, dtype=sv.dtype) if readonly: # copy the last buf_len elements would fail - with pytest.raises(custatevec.cuStateVecError) as e_info: - custatevec.accessor_set( + with pytest.raises(cusv.cuStateVecError) as e_info: + cusv.accessor_set( handle, accessor, buf.data.ptr, sv.size-1-buf_len, sv.size-1) else: # copy the last buf_len elements - custatevec.accessor_set( + cusv.accessor_set( handle, accessor, buf.data.ptr, sv.size-1-buf_len, sv.size-1) finally: - # This is Python-only API. Need finally to ensure it's freed. 
- custatevec.accessor_destroy(accessor) + cusv.accessor_destroy(accessor) if readonly: # sv unchanged assert (sv[sv.size-1-buf_len: sv.size-1] == data[sv.size-1-buf_len: sv.size-1]).all() else: assert (sv[sv.size-1-buf_len: sv.size-1] == 0).all() + + +class TestTestMatrixType: + + @pytest.mark.parametrize( + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') + ) + @pytest.mark.parametrize( + 'matrix_type', (cusv.MatrixType.UNITARY, cusv.MatrixType.HERMITIAN) + ) + @pytest.mark.parametrize( + 'input_form', ( + {'targets': (numpy.int32, 'int'), }, + {'targets': (numpy.int32, 'seq'), }, + ) + ) + @pytest.mark.parametrize( + 'dtype', (numpy.complex64, numpy.complex128) + ) + @pytest.mark.parametrize( + 'xp', (numpy, cupy) + ) + def test_apply_matrix_type( + self, handle, xp, dtype, input_form, matrix_type, mempool): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + + data_type = dtype_to_data_type[xp.dtype(dtype)] + compute_type = dtype_to_compute_type[xp.dtype(dtype)] + n_targets = 4 + + # matrix can live on host or device + # choose a trivial matrix + data = xp.ones(2**n_targets, dtype=dtype) + matrix = xp.diag(data) + matrix_ptr = matrix.ctypes.data if xp is numpy else matrix.data.ptr + + if mempool is None: + workspace_size = cusv.test_matrix_type_get_workspace_size( + handle, matrix_type, + matrix_ptr, data_type, cusv.MatrixLayout.ROW, n_targets, + 0, compute_type) + if workspace_size: + workspace = cupy.cuda.alloc(workspace_size) + workspace_ptr = workspace.ptr + else: + workspace_ptr = 0 + else: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + + workspace_ptr = 0 + workspace_size = 0 + + residual = cusv.test_matrix_type( + handle, matrix_type, + matrix_ptr, data_type, cusv.MatrixLayout.ROW, n_targets, + 0, compute_type, workspace_ptr, workspace_size) + assert numpy.isclose(residual, 0) + + +@pytest.mark.parametrize( + 'rand', + # the choices here ensure we get either parity + (0, numpy.nextafter(1, 0)) +) +@pytest.mark.parametrize( + 'collapse', + (cusv.Collapse.NORMALIZE_AND_ZERO, cusv.Collapse.NONE) +) +@pytest.mark.skipif( + cupy.cuda.runtime.getDeviceCount() < 2, reason='not enough GPUs') +class TestBatchMeasureWithSubSV(TestMultiGpuSV): + + @pytest.mark.parametrize( + 'input_form', ( + {'bit_ordering': (numpy.int32, 'int'),}, + {'bit_ordering': (numpy.int32, 'seq'),}, + ) + ) + def test_batch_measure_with_offset( + self, multi_gpu_handles, rand, collapse, input_form): + handles = multi_gpu_handles + sub_sv = self.get_sv() + data_type = dtype_to_data_type[sub_sv[0].dtype] + bit_ordering = list(range(self.n_local_bits)) + bit_ordering, bit_ordering_len = self._return_data( + bit_ordering, 'bit_ordering', *input_form['bit_ordering']) + + # change sv to 1/\sqrt{2} (|0000> + |1111>), and compute abs2sum; + # calling abs2sum_array is also OK, but we focus on testing the target API + cumulative_array = numpy.zeros(self.n_devices+1, dtype=numpy.float64) + for i_sv in range(self.n_devices): + with cupy.cuda.Device(i_sv): + if i_sv == 0: + # |0 000> is on GPU 0 + sub_sv[i_sv][0] = numpy.sqrt(0.5) + elif i_sv == 1: + # |1 111> is on GPU 1 + sub_sv[i_sv][-1] = numpy.sqrt(0.5) + abs2sum = cupy.asnumpy(cupy.sum(cupy.abs(sub_sv[i_sv])**2)) + cumulative_array[i_sv+1] = cumulative_array[i_sv] + abs2sum + + orig_sub_sv = copy.deepcopy(sub_sv) + + bitstring = numpy.empty(self.n_local_bits, dtype=numpy.int32) + for i_sv in 
range(self.n_devices): + if (cumulative_array[i_sv] <= rand + and rand < cumulative_array[i_sv+1]): + global_bits = i_sv + norm = cumulative_array[-1] + offset = cumulative_array[i_sv] + with cupy.cuda.Device(i_sv) as dev: + cusv.batch_measure_with_offset( + handles[i_sv], sub_sv[i_sv].data.ptr, data_type, + self.n_local_bits, bitstring.ctypes.data, + bit_ordering, bit_ordering_len, rand, + collapse, offset, norm) + dev.synchronize() + break + else: + assert False + + if global_bits == 0: + # get |0 000> + assert (bitstring == 0).all() + elif global_bits == 1: + # get |1 111> + assert (bitstring == 1).all() + else: + assert False + + if collapse == cusv.Collapse.NORMALIZE_AND_ZERO: + # the measured sub sv is collapsed (those not measured are intact!) + if global_bits == 0: + # collapse to |0 000> + with cupy.cuda.Device(0): + assert cupy.allclose(sub_sv[0][0], 1) + assert not (sub_sv[0] == orig_sub_sv[0]).all() + with cupy.cuda.Device(1): + assert (sub_sv[1] == orig_sub_sv[1]).all() + elif global_bits == 1: + # collapse to |1 111> + with cupy.cuda.Device(0): + assert (sub_sv[0] == orig_sub_sv[0]).all() + with cupy.cuda.Device(1): + assert cupy.allclose(sub_sv[1][-1], 1) + assert not (sub_sv[1] == orig_sub_sv[1]).all() + else: + assert False, f"unexpected bitstring: {bitstring}" + else: + # sv is intact + with cupy.cuda.Device(0): + assert (sub_sv[0] == orig_sub_sv[0]).all() + with cupy.cuda.Device(1): + assert (sub_sv[1] == orig_sub_sv[1]).all() + + +class TestSwap: + + @pytest.mark.parametrize( + 'input_form', ( + {'swapped_bits': (numpy.int32, 'int'), + 'mask_bitstring': (numpy.int32, 'int'), 'mask_ordering': (numpy.int32, 'int')}, + {'swapped_bits': (numpy.int32, 'seq'), + 'mask_bitstring': (numpy.int32, 'seq'), 'mask_ordering': (numpy.int32, 'seq')}, + ) + ) + @pytest.mark.parametrize( + 'dtype', (numpy.complex64, numpy.complex128) + ) + def test_swap_index_bits(self, handle, dtype, input_form): + n_qubits = 4 + sv = cupy.zeros(2**n_qubits, dtype=dtype) + data_type = dtype_to_data_type[sv.dtype] + + # set sv to |0110> + sv[6] = 1 + orig_sv = sv.copy() + + swapped_bits = [(0, 2), (1, 3)] + n_swapped_bits = len(swapped_bits) + if input_form['swapped_bits'][1] == 'int': + swapped_bits_data = numpy.asarray( + swapped_bits, dtype=input_form['swapped_bits'][0]) + swapped_bits = swapped_bits_data.ctypes.data + + # TODO: test mask + mask_bitstring = 0 + mask_ordering = 0 + mask_len = 0 + + cusv.swap_index_bits( + handle, sv.data.ptr, data_type, n_qubits, + swapped_bits, n_swapped_bits, + mask_bitstring, mask_ordering, mask_len) + + # now we should get |1001> + assert (sv != orig_sv).any() + assert sv[6] == 0 + assert sv[9] == 1 + + +class TestMemHandler: + + # TODO: add more different memory sources + @pytest.mark.parametrize( + 'source', (None, "py-callable", 'cffi', 'cffi_struct') + ) + def test_set_get_device_mem_handler(self, handle, source): + if (isinstance(source, str) and source.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + + if source is not None: + mr = MemoryResourceFactory(source) + handler = mr.get_dev_mem_handler() + cusv.set_device_mem_handler(handle, handler) + # round-trip test + queried_handler = cusv.get_device_mem_handler(handle) + if source == 'cffi_struct': + # I'm lazy, otherwise I'd also fetch the functor addresses here... 
+ assert queried_handler[0] == 0 # ctx is NULL + assert queried_handler[-1] == source + else: + assert queried_handler == handler + else: + with pytest.raises(cusv.cuStateVecError) as e: + queried_handler = cusv.get_device_mem_handler(handle) + assert 'CUSTATEVEC_STATUS_NO_DEVICE_ALLOCATOR' in str(e.value) + + +class TestLogger: + + def test_logger_set_level(self): + cusv.logger_set_level(6) # on + cusv.logger_set_level(0) # off + + def test_logger_set_mask(self): + cusv.logger_set_mask(16) # should not raise + + def test_logger_set_callback_data(self): + # we also test logger_open_file() here to avoid polluting stdout + + def callback(level, name, message, my_data, is_ok=False): + log = f"{level}, {name}, {message} (is_ok={is_ok}) -> logged\n" + my_data.append(log) + + handle = None + my_data = [] + is_ok = True + + with tempfile.TemporaryDirectory() as temp: + file_name = os.path.join(temp, "cusv_test") + cusv.logger_open_file(file_name) + cusv.logger_set_callback_data(callback, my_data, is_ok=is_ok) + cusv.logger_set_level(6) + + try: + handle = cusv.create() + cusv.destroy(handle) + except: + if handle: + cusv.destroy(handle) + raise + finally: + cusv.logger_force_disable() # to not affect the rest of tests + + with open(file_name) as f: + log_from_f = f.read() + + # check the log file + assert '[custatevecCreate]' in log_from_f + assert '[custatevecDestroy]' in log_from_f + + # check the captured data (note we log 2 APIs) + log = ''.join(my_data) + assert log.count("-> logged") >= 2 + assert log.count("is_ok=True") >= 2 diff --git a/python/tests/cuquantum_tests/cutensornet_tests/data.py b/python/tests/cuquantum_tests/cutensornet_tests/data.py index 517f3ef..4890870 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/data.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/data.py @@ -1,4 +1,8 @@ import itertools +try: + import torch +except ImportError: + torch = None # TODO: investigate test parallelism across cartesian product @@ -6,8 +10,9 @@ sources = [ "numpy", "cupy", - "torch" ] +if torch: + sources.append("torch") devices = [ "cpu", @@ -59,8 +64,8 @@ handles = [None] loggers = [None] memory_limits = [ - int(1e6), - "1 MiB", + int(1e8), + "100 MiB", "80%" ] diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py index 1226c8e..a9b56a7 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py @@ -18,6 +18,7 @@ def _test_contract( use_numpy_einsum_path ): for stream_name in stream_names: + if stream_name is not None and stream_name != self.tensor_package: continue optimize = deepcopy(self.optimize) if use_numpy_einsum_path: diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py index 3a707dc..1cc6915 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py @@ -1,7 +1,13 @@ import contextlib from collections import abc import functools +import os +import tempfile +try: + import cffi +except ImportError: + cffi = None import cupy from cupy import testing import numpy @@ -9,7 +15,7 @@ import cuquantum from cuquantum import ComputeType, cudaDataType -from cuquantum import cutensornet +from cuquantum import cutensornet as cutn ################################################################### @@ -45,7 +51,7 @@ 
def decorator(impl): def test_func(self, *args, **kwargs): try: if name == 'handle': - h = cutensornet.create() + h = cutn.create() elif name == 'dscr': tn, dtype, input_form, output_form = self.tn, self.dtype, self.input_form, self.output_form einsum, shapes = tn # unpack @@ -54,7 +60,7 @@ def test_func(self, *args, **kwargs): tn.get_input_metadata(**input_form) o_n_modes, o_extents, o_strides, o_modes, o_alignments = \ tn.get_output_metadata(**output_form) - h = cutensornet.create_network_descriptor( + h = cutn.create_network_descriptor( self.handle, i_n_inputs, i_n_modes, i_extents, i_strides, i_modes, i_alignments, o_n_modes, o_extents, o_strides, o_modes, o_alignments, @@ -62,12 +68,14 @@ def test_func(self, *args, **kwargs): # we also need to keep the tn data alive self.tn = tn elif name == 'config': - h = cutensornet.create_contraction_optimizer_config(self.handle) + h = cutn.create_contraction_optimizer_config(self.handle) elif name == 'info': - h = cutensornet.create_contraction_optimizer_info( + h = cutn.create_contraction_optimizer_info( self.handle, self.dscr) elif name == 'autotune': - h = cutensornet.create_contraction_autotune_preference(self.handle) + h = cutn.create_contraction_autotune_preference(self.handle) + elif name == 'workspace': + h = cutn.create_workspace_descriptor(self.handle) else: assert False, f'name "{name}" not recognized' setattr(self, name, h) @@ -77,38 +85,188 @@ def test_func(self, *args, **kwargs): raise finally: if name == 'handle' and hasattr(self, name): - cutensornet.destroy(self.handle) + cutn.destroy(self.handle) del self.handle elif name == 'dscr' and hasattr(self, name): - cutensornet.destroy_network_descriptor(self.dscr) + cutn.destroy_network_descriptor(self.dscr) del self.dscr elif name == 'config' and hasattr(self, name): - cutensornet.destroy_contraction_optimizer_config(self.config) + cutn.destroy_contraction_optimizer_config(self.config) del self.config elif name == 'info' and hasattr(self, name): - cutensornet.destroy_contraction_optimizer_info(self.info) + cutn.destroy_contraction_optimizer_info(self.info) del self.info elif name == 'autotune' and hasattr(self, name): - cutensornet.destroy_contraction_autotune_preference(self.autotune) + cutn.destroy_contraction_autotune_preference(self.autotune) del self.autotune + elif name == 'workspace' and hasattr(self, name): + h = cutn.destroy_workspace_descriptor(self.workspace) + del self.workspace return test_func return decorator +# we don't wanna recompile for every test case... 
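+# (Aside, before the cffi machinery the comment above refers to: the simplest
+# handler form these tests exercise is "py-callable", a (device_alloc,
+# device_free, name) triple of Python callables. A minimal usage sketch,
+# assuming a CUDA 11.2+ runtime for the stream-ordered mallocAsync/freeAsync
+# APIs:
+#
+#     def my_alloc(size, stream):
+#         return cupy.cuda.runtime.mallocAsync(size, stream)
+#
+#     def my_free(ptr, size, stream):
+#         cupy.cuda.runtime.freeAsync(ptr, stream)
+#
+#     handle = cutn.create()
+#     cutn.set_device_mem_handler(handle, (my_alloc, my_free, "my-mempool"))
+#     # ...APIs needing scratch space can now be given a zero-sized workspace
+#     # and will allocate through the registered handler instead...
+#     cutn.destroy(handle)
+# )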
+_cffi_mod1 = None
+_cffi_mod2 = None
+
+def _can_use_cffi():
+    if cffi is None or os.environ.get('CUDA_PATH') is None:
+        return False
+    else:
+        return True
+
+
+class MemoryResourceFactory:
+
+    def __init__(self, source, name=None):
+        self.source = source
+        self.name = source if name is None else name
+
+    def get_dev_mem_handler(self):
+        if self.source == "py-callable":
+            return (*self._get_cuda_callable(), self.name)
+        elif self.source == "cffi":
+            # ctx is not needed, so set to NULL
+            return (0, *self._get_functor_address(), self.name)
+        elif self.source == "cffi_struct":
+            return self._get_handler_address()
+        # TODO: add more different memory sources
+        else:
+            raise NotImplementedError
+
+    def _get_cuda_callable(self):
+        def alloc(size, stream):
+            return cupy.cuda.runtime.mallocAsync(size, stream)
+
+        def free(ptr, size, stream):
+            cupy.cuda.runtime.freeAsync(ptr, stream)
+
+        return alloc, free
+
+    def _get_functor_address(self):
+        if not _can_use_cffi():
+            raise RuntimeError
+
+        global _cffi_mod1
+        if _cffi_mod1 is None:
+            import importlib
+            mod_name = f"cutn_test_{self.source}"
+            ffi = cffi.FFI()
+            ffi.set_source(mod_name, """
+                #include <cuda_runtime.h>
+
+                // cffi limitation: we can't use the actual type cudaStream_t because
+                // it's considered an "incomplete" type and we can't get the functor
+                // address by doing so...
+
+                int my_alloc(void* ctx, void** ptr, size_t size, void* stream) {
+                    return (int)cudaMallocAsync(ptr, size, stream);
+                }
+
+                int my_free(void* ctx, void* ptr, size_t size, void* stream) {
+                    return (int)cudaFreeAsync(ptr, stream);
+                }
+                """,
+                include_dirs=[os.environ['CUDA_PATH']+'/include'],
+                library_dirs=[os.environ['CUDA_PATH']+'/lib64'],
+                libraries=['cudart'],
+            )
+            ffi.cdef("""
+                int my_alloc(void* ctx, void** ptr, size_t size, void* stream);
+                int my_free(void* ctx, void* ptr, size_t size, void* stream);
+            """)
+            ffi.compile(verbose=True)
+            self.ffi = ffi
+            _cffi_mod1 = importlib.import_module(mod_name)
+        self.ffi_mod = _cffi_mod1
+
+        alloc_addr = self._get_address("my_alloc")
+        free_addr = self._get_address("my_free")
+        return alloc_addr, free_addr
+
+    def _get_handler_address(self):
+        if not _can_use_cffi():
+            raise RuntimeError
+
+        global _cffi_mod2
+        if _cffi_mod2 is None:
+            import importlib
+            mod_name = f"cutn_test_{self.source}"
+            ffi = cffi.FFI()
+            ffi.set_source(mod_name, """
+                #include <cuda_runtime.h>
+
+                // cffi limitation: we can't use the actual type cudaStream_t because
+                // it's considered an "incomplete" type and we can't get the functor
+                // address by doing so...
+
+                int my_alloc(void* ctx, void** ptr, size_t size, void* stream) {
+                    return (int)cudaMallocAsync(ptr, size, stream);
+                }
+
+                int my_free(void* ctx, void* ptr, size_t size, void* stream) {
+                    return (int)cudaFreeAsync(ptr, stream);
+                }
+
+                typedef struct {
+                    void* ctx;
+                    int (*device_alloc)(void* ctx, void** ptr, size_t size, void* stream);
+                    int (*device_free)(void* ctx, void* ptr, size_t size, void* stream);
+                    char name[64];
+                } myHandler;
+
+                myHandler* init_myHandler(myHandler* h, const char* name) {
+                    h->ctx = NULL;
+                    h->device_alloc = my_alloc;
+                    h->device_free = my_free;
+                    memcpy(h->name, name, 64);
+                    return h;
+                }
+                """,
+                include_dirs=[os.environ['CUDA_PATH']+'/include'],
+                library_dirs=[os.environ['CUDA_PATH']+'/lib64'],
+                libraries=['cudart'],
+            )
+            ffi.cdef("""
+                typedef struct {
+                    ...;
+                } myHandler;
+
+                myHandler* init_myHandler(myHandler* h, const char* name);
+            """)
+            ffi.compile(verbose=True)
+            self.ffi = ffi
+            _cffi_mod2 = importlib.import_module(mod_name)
+        self.ffi_mod = _cffi_mod2
+
+        h = self.handler = self.ffi_mod.ffi.new("myHandler*")
+        self.ffi_mod.lib.init_myHandler(h, self.name.encode())
+        return self._get_address(h)
+
+    def _get_address(self, func_name_or_ptr):
+        if isinstance(func_name_or_ptr, str):
+            func_name = func_name_or_ptr
+            data = str(self.ffi_mod.ffi.addressof(self.ffi_mod.lib, func_name))
+        else:
+            ptr = func_name_or_ptr  # ptr to struct
+            data = str(self.ffi_mod.ffi.addressof(ptr[0]))
+        # data has this format: "<cdata 'int(*)(...)' 0x...>"
+        return int(data.split()[-1][:-1], base=16)
+
+
 class TestLibHelper:
 
     def test_get_version(self):
-        ver = cutensornet.get_version()
-        assert ver == (cutensornet.MAJOR_VER * 10000
-                       + cutensornet.MINOR_VER * 100
-                       + cutensornet.PATCH_VER)
-        assert ver == cutensornet.VERSION
+        ver = cutn.get_version()
+        assert ver == (cutn.MAJOR_VER * 10000
+                       + cutn.MINOR_VER * 100
+                       + cutn.PATCH_VER)
+        assert ver == cutn.VERSION
 
     def test_get_cudart_version(self):
-        # CUDA runtime is statically linked, so we can't compare
-        # with the "runtime" version
-        ver = cutensornet.get_cudart_version()
-        assert isinstance(ver, int)
+        ver = cutn.get_cudart_version()
+        assert ver == cupy.cuda.runtime.runtimeGetVersion()
 
 
 class TestHandle:
@@ -274,8 +432,14 @@ class TestTensorNetworkDescriptor(TestTensorNetworkBase):
     @manage_resource('handle')
     @manage_resource('dscr')
     def test_descriptor_create_destroy(self):
-        # simple round-trip test
-        pass
+        # we could just do a simple round-trip test, but let's also get
+        # this helper API tested
+        handle, dscr = self.handle, self.dscr
+        num_modes, modes, extents, strides = cutn.get_output_tensor_details(handle, dscr)
+        assert num_modes == self.tn.output_n_modes
+        assert (modes == numpy.asarray(self.tn.output_mode, dtype=numpy.int32)).all()
+        assert (extents == numpy.asarray(self.tn.output_extent, dtype=numpy.int64)).all()
+        assert (strides == numpy.asarray(self.tn.output_stride, dtype=numpy.int64)).all()
 
 
 class TestOptimizerInfo(TestTensorNetworkBase):
@@ -288,36 +452,36 @@ def test_optimizer_info_create_destroy(self):
         pass
 
     @pytest.mark.parametrize(
-        'attr', [val for val in cutensornet.ContractionOptimizerInfoAttribute]
+        'attr', [val for val in cutn.ContractionOptimizerInfoAttribute]
     )
     @manage_resource('handle')
     @manage_resource('dscr')
     @manage_resource('info')
     def test_optimizer_info_get_set_attribute(self, attr):
         if attr in (
-            cutensornet.ContractionOptimizerInfoAttribute.NUM_SLICES,
-            cutensornet.ContractionOptimizerInfoAttribute.PHASE1_FLOP_COUNT,
-            cutensornet.ContractionOptimizerInfoAttribute.FLOP_COUNT,
-
cutensornet.ContractionOptimizerInfoAttribute.LARGEST_TENSOR, - cutensornet.ContractionOptimizerInfoAttribute.SLICING_OVERHEAD, + cutn.ContractionOptimizerInfoAttribute.NUM_SLICES, + cutn.ContractionOptimizerInfoAttribute.PHASE1_FLOP_COUNT, + cutn.ContractionOptimizerInfoAttribute.FLOP_COUNT, + cutn.ContractionOptimizerInfoAttribute.LARGEST_TENSOR, + cutn.ContractionOptimizerInfoAttribute.SLICING_OVERHEAD, ): pytest.skip("setter not supported") elif attr in ( - cutensornet.ContractionOptimizerInfoAttribute.PATH, - cutensornet.ContractionOptimizerInfoAttribute.SLICED_MODE, - cutensornet.ContractionOptimizerInfoAttribute.SLICED_EXTENT, + cutn.ContractionOptimizerInfoAttribute.PATH, + cutn.ContractionOptimizerInfoAttribute.SLICED_MODE, + cutn.ContractionOptimizerInfoAttribute.SLICED_EXTENT, ): pytest.skip("TODO") handle, info = self.handle, self.info - dtype = cutensornet.contraction_optimizer_info_get_attribute_dtype(attr) + dtype = cutn.contraction_optimizer_info_get_attribute_dtype(attr) # Hack: assume this is a valid value for all attrs factor = numpy.asarray([30], dtype=dtype) - cutensornet.contraction_optimizer_info_set_attribute( + cutn.contraction_optimizer_info_set_attribute( handle, info, attr, factor.ctypes.data, factor.dtype.itemsize) # do a round-trip test as a sanity check factor2 = numpy.zeros_like(factor) - cutensornet.contraction_optimizer_info_get_attribute( + cutn.contraction_optimizer_info_get_attribute( handle, info, attr, factor2.ctypes.data, factor2.dtype.itemsize) assert factor == factor2 @@ -333,29 +497,29 @@ def test_optimizer_config_create_destroy(self): @pytest.mark.parametrize( # TODO(leofang): enable this when the getter bug is fixed - 'attr', [val for val in cutensornet.ContractionOptimizerConfigAttribute] - #'attr', [cutensornet.ContractionOptimizerConfigAttribute.GRAPH_IMBALANCE_FACTOR] + 'attr', [val for val in cutn.ContractionOptimizerConfigAttribute] + #'attr', [cutn.ContractionOptimizerConfigAttribute.GRAPH_IMBALANCE_FACTOR] ) @manage_resource('handle') @manage_resource('config') def test_optimizer_config_get_set_attribute(self, attr): - if attr == cutensornet.ContractionOptimizerConfigAttribute.SIMPLIFICATION_DISABLE_DR: + if attr == cutn.ContractionOptimizerConfigAttribute.SIMPLIFICATION_DISABLE_DR: pytest.skip("pending on MR 275") handle, config = self.handle, self.config - dtype = cutensornet.contraction_optimizer_config_get_attribute_dtype(attr) + dtype = cutn.contraction_optimizer_config_get_attribute_dtype(attr) # Hack: assume this is a valid value for all attrs - if attr in (cutensornet.ContractionOptimizerConfigAttribute.GRAPH_ALGORITHM, - cutensornet.ContractionOptimizerConfigAttribute.SLICER_MEMORY_MODEL, - cutensornet.ContractionOptimizerConfigAttribute.SLICER_DISABLE_SLICING): + if attr in (cutn.ContractionOptimizerConfigAttribute.GRAPH_ALGORITHM, + cutn.ContractionOptimizerConfigAttribute.SLICER_MEMORY_MODEL, + cutn.ContractionOptimizerConfigAttribute.SLICER_DISABLE_SLICING): factor = numpy.asarray([1], dtype=dtype) else: factor = numpy.asarray([30], dtype=dtype) - cutensornet.contraction_optimizer_config_set_attribute( + cutn.contraction_optimizer_config_set_attribute( handle, config, attr, factor.ctypes.data, factor.dtype.itemsize) # do a round-trip test as a sanity check factor2 = numpy.zeros_like(factor) - cutensornet.contraction_optimizer_config_get_attribute( + cutn.contraction_optimizer_config_get_attribute( handle, config, attr, factor2.ctypes.data, factor2.dtype.itemsize) assert factor == factor2 @@ -370,28 +534,31 @@ def 
test_autotune_preference_create_destroy(self): pass @pytest.mark.parametrize( - 'attr', [val for val in cutensornet.ContractionAutotunePreferenceAttribute] + 'attr', [val for val in cutn.ContractionAutotunePreferenceAttribute] ) @manage_resource('handle') @manage_resource('autotune') def test_autotune_preference_get_set_attribute(self, attr): handle, pref = self.handle, self.autotune - dtype = cutensornet.contraction_autotune_preference_get_attribute_dtype(attr) + dtype = cutn.contraction_autotune_preference_get_attribute_dtype(attr) # Hack: assume this is a valid value for all attrs factor = numpy.asarray([10], dtype=dtype) - cutensornet.contraction_autotune_preference_set_attribute( + cutn.contraction_autotune_preference_set_attribute( handle, pref, attr, factor.ctypes.data, factor.dtype.itemsize) # do a round-trip test as a sanity check factor2 = numpy.zeros_like(factor) - cutensornet.contraction_autotune_preference_get_attribute( + cutn.contraction_autotune_preference_get_attribute( handle, pref, attr, factor2.ctypes.data, factor2.dtype.itemsize) assert factor == factor2 @pytest.mark.parametrize( - 'get_workspace_size', (True, False) + 'mempool', (None, 'py-callable', 'cffi', 'cffi_struct') +) +@pytest.mark.parametrize( + 'workspace_pref', ("min", "recommended", "max") ) @pytest.mark.parametrize( 'autotune', (True, False) @@ -411,40 +578,153 @@ class TestContraction(TestTensorNetworkBase): @manage_resource('info') @manage_resource('config') @manage_resource('autotune') + @manage_resource('workspace') def test_contraction_workflow( - self, get_workspace_size, autotune, contract, stream): + self, mempool, workspace_pref, autotune, contract, stream): + if (isinstance(mempool, str) and mempool.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + # unpack handle, dscr, info, config, pref = self.handle, self.dscr, self.info, self.config, self.autotune + workspace = self.workspace tn, input_form, output_form = self.tn, self.input_form, self.output_form - workspace_size = 4*1024**2 # large enough for our test cases + if mempool: + mr = MemoryResourceFactory(mempool) + handler = mr.get_dev_mem_handler() + cutn.set_device_mem_handler(handle, handler) + + workspace_size = 32*1024**2 # large enough for our test cases # we have to run this API in any case in order to create a path - cutensornet.contraction_optimize( + cutn.contraction_optimize( handle, dscr, config, workspace_size, info) - if get_workspace_size: - workspace_size = cutensornet.contraction_get_workspace_size( - handle, dscr, info) - workspace = cupy.cuda.alloc(workspace_size) + + # manage workspace + if mempool is None: + cutn.workspace_compute_sizes(handle, dscr, info, workspace) + required_size = cutn.workspace_get_size( + handle, workspace, + getattr(cutn.WorksizePref, f"{workspace_pref.upper()}"), + cutn.Memspace.DEVICE) # TODO: parametrize memspace? + if workspace_size < required_size: + assert False, \ + f"wrong assumption on the workspace size " \ + f"(given: {workspace_size}, needed: {required_size})" + workspace_ptr = cupy.cuda.alloc(workspace_size) + cutn.workspace_set( + handle, workspace, + cutn.Memspace.DEVICE, + workspace_ptr.ptr, workspace_size) + # round-trip check + assert (workspace_ptr.ptr, workspace_size) == cutn.workspace_get( + handle, workspace, + cutn.Memspace.DEVICE) + else: + cutn.workspace_set( + handle, workspace, + cutn.Memspace.DEVICE, + 0, 0) # TODO: check custom workspace size? 
plan = None try: - plan = cutensornet.create_contraction_plan( - handle, dscr, info, workspace_size) + plan = cutn.create_contraction_plan( + handle, dscr, info, workspace) if autotune: - cutensornet.contraction_autotune( + cutn.contraction_autotune( handle, plan, tn.get_input_tensors(**input_form), tn.get_output_tensor(), - workspace.ptr, workspace_size, pref, stream.ptr) + workspace, pref, stream.ptr) if contract: # assume no slicing for simple test cases! - cutensornet.contraction( + cutn.contraction( handle, plan, tn.get_input_tensors(**input_form), tn.get_output_tensor(), - workspace.ptr, workspace_size, 0, stream.ptr) + workspace, 0, stream.ptr) # TODO(leofang): check correctness? stream.synchronize() finally: if plan is not None: - cutensornet.destroy_contraction_plan(plan) + cutn.destroy_contraction_plan(plan) + + +# TODO: add more different memory sources +@pytest.mark.parametrize( + 'source', (None, "py-callable", 'cffi', 'cffi_struct') +) +class TestMemHandler: + + @manage_resource('handle') + def test_set_get_device_mem_handler(self, source): + if (isinstance(source, str) and source.startswith('cffi') + and not _can_use_cffi()): + pytest.skip("cannot run cffi tests") + + handle = self.handle + if source is not None: + mr = MemoryResourceFactory(source) + handler = mr.get_dev_mem_handler() + cutn.set_device_mem_handler(handle, handler) + # round-trip test + queried_handler = cutn.get_device_mem_handler(handle) + if source == 'cffi_struct': + # I'm lazy, otherwise I'd also fetch the functor addresses here... + assert queried_handler[0] == 0 # ctx is NULL + assert queried_handler[-1] == source + else: + assert queried_handler == handler + else: + with pytest.raises(cutn.cuTensorNetError) as e: + queried_handler = cutn.get_device_mem_handler(handle) + assert 'CUTENSORNET_STATUS_NO_DEVICE_ALLOCATOR' in str(e.value) + + +class TestLogger: + + def test_logger_set_level(self): + cutn.logger_set_level(6) # on + cutn.logger_set_level(0) # off + + def test_logger_set_mask(self): + cutn.logger_set_mask(16) # should not raise + + def test_logger_set_callback_data(self): + # we also test logger_open_file() here to avoid polluting stdout + + def callback(level, name, message, my_data, is_ok=False): + log = f"{level}, {name}, {message} (is_ok={is_ok}) -> logged\n" + my_data.append(log) + + handle = None + my_data = [] + is_ok = True + + with tempfile.TemporaryDirectory() as temp: + file_name = os.path.join(temp, "cutn_test") + cutn.logger_open_file(file_name) + cutn.logger_set_callback_data(callback, my_data, is_ok=is_ok) + cutn.logger_set_level(6) + + try: + handle = cutn.create() + cutn.destroy(handle) + except: + if handle: + cutn.destroy(handle) + raise + finally: + cutn.logger_force_disable() # to not affect the rest of tests + + with open(file_name) as f: + log_from_f = f.read() + + # check the log file + assert '[cutensornetCreate]' in log_from_f + assert '[cutensornetDestroy]' in log_from_f + + # check the captured data (note we log 2 APIs) + log = ''.join(my_data) + assert log.count("-> logged") >= 2 + assert log.count("is_ok=True") >= 2 diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_network.py b/python/tests/cuquantum_tests/cutensornet_tests/test_network.py index bf824af..ee12955 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_network.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_network.py @@ -89,6 +89,7 @@ def _test_contract( ) for stream_name in stream_names: + if stream_name is not None and stream_name != self.tensor_package: 
continue network_einsum.autotune(iterations=self.iterations, stream=streams[stream_name]) # if iterations=0, autotune is skipped stream_name_sync_dispatcher(stream_name, skip=skip_sync) cutensornet_contract = network_einsum.contract(stream=streams[stream_name]) @@ -98,6 +99,7 @@ def _test_contract( network_einsum.free() for stream_name in stream_names: + if stream_name is not None and stream_name != self.tensor_package: continue network_interleaved.autotune(iterations=self.iterations, stream=streams[stream_name]) # if iterations=0, autotune is skipped stream_name_sync_dispatcher(stream_name, skip=skip_sync) cutensornet_contract = network_interleaved.contract(stream=streams[stream_name]) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/testutils.py b/python/tests/cuquantum_tests/cutensornet_tests/testutils.py index 57ddc6d..959d21b 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/testutils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/testutils.py @@ -1,32 +1,43 @@ +import functools + import cupy -import torch import numpy -import functools +try: + import torch +except ImportError: + torch = None from cuquantum import Network from cuquantum import NetworkOptions, OptimizerOptions from .data import * -torch.backends.cuda.matmul.allow_tf32 = False -torch.backends.cudnn.allow_tf32 = False +def infer_object_package(name): + return name.split('.')[0] def dtype_name_dispatcher(source, dtype_name): import sys return getattr(sys.modules[source], dtype_name) + stream_names = [ "default", "cupy", - "torch" ] streams = dict(zip( stream_names, - [None, cupy.cuda.Stream(), torch.cuda.Stream()] + [None, cupy.cuda.Stream()] )) +if torch: + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + stream_names.append("torch") + streams["torch"] = torch.cuda.Stream() + + def stream_name_sync_dispatcher(stream_name, skip=False): stream = streams[stream_name] if not skip: @@ -53,7 +64,7 @@ def generate_data_dispatcher(source, device, shape, dtype_name, array_order): 1.j * cupy.random.random(shape)).astype(dtype, order=array_order) else: data = cupy.random.random(shape).astype(dtype, order=array_order) - elif source == "torch": + elif torch and source == "torch": if "int" in dtype_name: data = torch.randint(-1, 2, shape, dtype=dtype, device=device) else: @@ -73,7 +84,7 @@ def data_to_numpy(source, data): return data elif source == "cupy": return cupy.asnumpy(data) - elif source == "torch": + elif torch and source == "torch": return data.cpu().numpy() def data_operands_to_numpy(source, data_operands): @@ -93,7 +104,7 @@ def einsum_dispatcher(source, einsum_expr, data_operands): return numpy.einsum(einsum_expr, *data_operands, optimize="optimal") elif source == "cupy": return cupy.einsum(einsum_expr, *data_operands) - elif source == "torch": + elif torch and source == "torch": return torch.einsum(einsum_expr, *data_operands) def network_options_dispatcher(network_options, mode=None): @@ -152,7 +163,7 @@ def allclose_dispatcher(source, dtype_name): cupy.allclose, rtol=rtol_mapper[dtype_name], atol=atol_mapper[dtype_name] ) - elif source == "torch": + elif torch and source == "torch": return functools.partial( torch.allclose, rtol=rtol_mapper[dtype_name], atol=atol_mapper[dtype_name] @@ -212,6 +223,7 @@ def __init__(self, network_options_pack): self.data_operands ) self.tensor_class = tensor_class_dispatcher(self.data_operands) + self.tensor_package = infer_object_package(self.tensor_class.__module__) self.interleaved_inputs = 
interleaved_format_from_einsum(self.einsum_expr, self.data_operands) self.numpy_einsum_path = numpy.einsum_path(self.einsum_expr, *self.numpy_data_operands) self.einsum = einsum_dispatcher(self.source, self.einsum_expr, self.data_operands) diff --git a/samples/custatevec/CMakeLists.txt b/samples/custatevec/CMakeLists.txt index ba01afe..d61ddac 100644 --- a/samples/custatevec/CMakeLists.txt +++ b/samples/custatevec/CMakeLists.txt @@ -93,6 +93,12 @@ set(CMAKE_CUDA_STANDARD 11) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_EXTENSIONS OFF) +set(CMAKE_CUDA_FLAGS_ARCH_SM70 "-gencode arch=compute_70,code=sm_70") +set(CMAKE_CUDA_FLAGS_ARCH_SM75 "-gencode arch=compute_75,code=sm_75") +set(CMAKE_CUDA_FLAGS_ARCH_SM80 "-gencode arch=compute_80,code=sm_80 -gencode arch=compute_80,code=compute_80") +set(CMAKE_CUDA_FLAGS_ARCH "${CMAKE_CUDA_FLAGS_ARCH_SM70} ${CMAKE_CUDA_FLAGS_ARCH_SM75} ${CMAKE_CUDA_FLAGS_ARCH_SM80}") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_ARCH}") + # ########################################## # custatevec_example utility function # ########################################## @@ -154,3 +160,8 @@ add_custatevec_example(custatevec_examples "cuStateVec.example.measure_zbasis" add_custatevec_example(custatevec_examples "cuStateVec.example.batch_measure" batch_measure.cu) add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_get" accessor_get.cu) add_custatevec_example(custatevec_examples "cuStateVec.example.accessor_set" accessor_set.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.test_matrix_type" test_matrix_type.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.memory_handler" memory_handler.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.swap_index_bits" swap_index_bits.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_sampler" mgpu_sampler.cu) +add_custatevec_example(custatevec_examples "cuStateVec.example.mgpu_batch_measure" mgpu_batch_measure.cu) diff --git a/samples/custatevec/Makefile b/samples/custatevec/Makefile index 2c1684b..67226eb 100644 --- a/samples/custatevec/Makefile +++ b/samples/custatevec/Makefile @@ -53,6 +53,11 @@ all: check-env nvcc batch_measure.cu -o batch_measure ${CXX_FLAGS} nvcc accessor_get.cu -o accessor_get ${CXX_FLAGS} nvcc accessor_set.cu -o accessor_set ${CXX_FLAGS} + nvcc test_matrix_type.cu -o test_matrix_type ${CXX_FLAGS} + nvcc memory_handler.cu -o memory_handler ${CXX_FLAGS} + nvcc swap_index_bits.cu -o swap_index_bits ${CXX_FLAGS} + nvcc mgpu_batch_measure.cu -o mgpu_batch_measure ${CXX_FLAGS} + nvcc mgpu_sampler.cu -o mgpu_sampler ${CXX_FLAGS} check-env: @ echo "" && \ @@ -77,4 +82,9 @@ clean: measure_zbasis \ batch_measure \ accessor_get \ - accessor_set + accessor_set \ + test_matrix_type \ + memory_handler \ + swap_index_bits \ + mgpu_batch_measure \ + mgpu_sampler diff --git a/samples/custatevec/accessor_get.cu b/samples/custatevec/accessor_get.cu index d816ff2..4ce3198 100644 --- a/samples/custatevec/accessor_get.cu +++ b/samples/custatevec/accessor_get.cu @@ -74,7 +74,7 @@ int main(void) { size_t extraWorkspaceSizeInBytes = 0; // create accessor and check the size of external workspace - HANDLE_ERROR( custatevecAccessor_createReadOnly( + HANDLE_ERROR( custatevecAccessorCreateView( handle, d_sv, CUDA_C_64F, nIndexBits, &accessor, bitOrdering, bitOrderingLen, maskBitString, maskOrdering, maskLen, &extraWorkspaceSizeInBytes) ); @@ -83,14 +83,15 @@ int main(void) { HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, 
extraWorkspaceSizeInBytes) ); // set external workspace - HANDLE_ERROR( custatevecAccessor_setExtraWorkspace( - handle, &accessor, extraWorkspace, extraWorkspaceSizeInBytes) ); + HANDLE_ERROR( custatevecAccessorSetExtraWorkspace( + handle, accessor, extraWorkspace, extraWorkspaceSizeInBytes) ); // get state vector components - HANDLE_ERROR( custatevecAccessor_get( - handle, &accessor, buffer, accessBegin, accessEnd) ); + HANDLE_ERROR( custatevecAccessorGet( + handle, accessor, buffer, accessBegin, accessEnd) ); - // destroy handle + // destroy descriptor and handle + HANDLE_ERROR( custatevecAccessorDestroy(accessor) ); HANDLE_ERROR( custatevecDestroy(handle) ); //---------------------------------------------------------------------------------------------- diff --git a/samples/custatevec/accessor_set.cu b/samples/custatevec/accessor_set.cu index 19da9d9..1223d50 100644 --- a/samples/custatevec/accessor_set.cu +++ b/samples/custatevec/accessor_set.cu @@ -70,7 +70,7 @@ int main(void) { size_t extraWorkspaceSizeInBytes = 0; // create accessor and check the size of external workspace - HANDLE_ERROR( custatevecAccessor_create( + HANDLE_ERROR( custatevecAccessorCreate( handle, d_sv, CUDA_C_64F, nIndexBits, &accessor, bitOrdering, bitOrderingLen, nullptr, nullptr, maskLen, &extraWorkspaceSizeInBytes) ); @@ -79,14 +79,15 @@ int main(void) { HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); // set external workspace - HANDLE_ERROR( custatevecAccessor_setExtraWorkspace( - handle, &accessor, extraWorkspace, extraWorkspaceSizeInBytes) ); + HANDLE_ERROR( custatevecAccessorSetExtraWorkspace( + handle, accessor, extraWorkspace, extraWorkspaceSizeInBytes) ); // set state vector components - HANDLE_ERROR( custatevecAccessor_set( - handle, &accessor, buffer, 0, nSvSize) ); + HANDLE_ERROR( custatevecAccessorSet( + handle, accessor, buffer, 0, nSvSize) ); - // destroy handle + // destroy descriptor and handle + HANDLE_ERROR( custatevecAccessorDestroy(accessor) ); HANDLE_ERROR( custatevecDestroy(handle) ); //---------------------------------------------------------------------------------------------- diff --git a/samples/custatevec/diagonal_matrix.cu b/samples/custatevec/diagonal_matrix.cu index af9e798..1eb263a 100644 --- a/samples/custatevec/diagonal_matrix.cu +++ b/samples/custatevec/diagonal_matrix.cu @@ -67,7 +67,7 @@ int main(void) { size_t extraWorkspaceSizeInBytes = 0; // check the size of external workspace - HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrix_bufferSize( + HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrixGetWorkspaceSize( handle, CUDA_C_64F, nIndexBits, nullptr, diagonals, CUDA_C_64F, basisBits, nBasisBits, maskLen, &extraWorkspaceSizeInBytes) ); diff --git a/samples/custatevec/expectation.cu b/samples/custatevec/expectation.cu index bea4815..82ee20f 100644 --- a/samples/custatevec/expectation.cu +++ b/samples/custatevec/expectation.cu @@ -67,7 +67,7 @@ int main(void) { size_t extraWorkspaceSizeInBytes = 0; // check the size of external workspace - HANDLE_ERROR( custatevecExpectation_bufferSize( + HANDLE_ERROR( custatevecComputeExpectationGetWorkspaceSize( handle, CUDA_C_64F, nIndexBits, matrix, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, nBasisBits, CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ); @@ -76,7 +76,7 @@ int main(void) { HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) ); // compute expectation - HANDLE_ERROR( custatevecExpectation( + HANDLE_ERROR( custatevecComputeExpectation( handle, d_sv, 
CUDA_C_64F, nIndexBits, &expectationValue, CUDA_C_64F, nullptr, matrix, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, basisBits, nBasisBits, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes) ); diff --git a/samples/custatevec/expectation_pauli.cu b/samples/custatevec/expectation_pauli.cu index e25f99c..d452a5f 100644 --- a/samples/custatevec/expectation_pauli.cu +++ b/samples/custatevec/expectation_pauli.cu @@ -69,9 +69,9 @@ int main(void) { HANDLE_ERROR( custatevecCreate(&handle) ); // apply Pauli operator - HANDLE_ERROR( custatevecExpectationsOnPauliBasis( - handle, d_sv, CUDA_C_64F, nIndexBits, expectationValues, pauliOperatorsArray, - basisBitsArray, nBasisBitsArray, nPauliOperatorArrays) ); + HANDLE_ERROR( custatevecComputeExpectationsOnPauliBasis( + handle, d_sv, CUDA_C_64F, nIndexBits, expectationValues, + pauliOperatorsArray, nPauliOperatorArrays, basisBitsArray, nBasisBitsArray) ); // destroy handle HANDLE_ERROR( custatevecDestroy(handle) ); diff --git a/samples/custatevec/exponential_pauli.cu b/samples/custatevec/exponential_pauli.cu index 5b097ed..f16d733 100644 --- a/samples/custatevec/exponential_pauli.cu +++ b/samples/custatevec/exponential_pauli.cu @@ -69,7 +69,7 @@ int main(void) { HANDLE_ERROR( custatevecCreate(&handle) ); // apply Pauli operator - HANDLE_ERROR( custatevecApplyExp( + HANDLE_ERROR( custatevecApplyPauliRotation( handle, d_sv, CUDA_C_64F, nIndexBits, pi / 2.0, paulis, targets, nTargets, controls, controlBitValues, nControls) ); diff --git a/samples/custatevec/gate_application.cu b/samples/custatevec/gate_application.cu index b679802..35a047e 100644 --- a/samples/custatevec/gate_application.cu +++ b/samples/custatevec/gate_application.cu @@ -69,7 +69,7 @@ int main(void) { size_t extraWorkspaceSizeInBytes = 0; // check the size of external workspace - HANDLE_ERROR( custatevecApplyMatrix_bufferSize( + HANDLE_ERROR( custatevecApplyMatrixGetWorkspaceSize( handle, CUDA_C_64F, nIndexBits, matrix, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, nTargets, nControls, CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ); @@ -80,8 +80,8 @@ int main(void) { // apply gate HANDLE_ERROR( custatevecApplyMatrix( handle, d_sv, CUDA_C_64F, nIndexBits, matrix, CUDA_C_64F, - CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets, nTargets, controls, nControls, - nullptr, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes) ); + CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets, nTargets, controls, nullptr, + nControls, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes) ); // destroy handle HANDLE_ERROR( custatevecDestroy(handle) ); diff --git a/samples/custatevec/memory_handler.cu b/samples/custatevec/memory_handler.cu new file mode 100644 index 0000000..5dc5efe --- /dev/null +++ b/samples/custatevec/memory_handler.cu @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * - Neither the name(s) of the copyright holder(s) nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cuda_runtime_api.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h>         // cuDoubleComplex
+#include <custatevec.h>        // custatevecApplyMatrix
+#include <string.h>            // strcpy
+#include <stdio.h>             // printf
+#include <stdlib.h>            // EXIT_FAILURE
+
+#include "helper.hpp"          // HANDLE_ERROR, HANDLE_CUDA_ERROR
+
+#define SUPPORTS_MEMORY_POOL ( __CUDACC_VER_MAJOR__ > 11 || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 2))
+#if SUPPORTS_MEMORY_POOL
+
+// upon success, this function should return 0, otherwise a nonzero value
+int myMemPoolAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream) {
+    cudaMemPool_t& pool = *static_cast<cudaMemPool_t*>(ctx);
+    cudaError_t status = cudaMallocFromPoolAsync(ptr, size, pool, stream);
+    return (int)status;
+}
+
+// upon success, this function should return 0, otherwise a nonzero value
+int myMemPoolFree(void*, void* ptr, size_t, cudaStream_t stream) {
+    cudaError_t status = cudaFreeAsync(ptr, stream);
+    return (int)status;
+}
+
+int main(void) {
+    // state vector
+    const int nIndexBits = 3;
+    const int nSvSize    = (1 << nIndexBits);
+
+    cuDoubleComplex h_sv[] = {{ 0.48, 0.0}, { 0.36, 0.0}, { 0.64, 0.0}, { 0.48, 0.0},
+                              { 0.0,  0.0}, { 0.0,  0.0}, { 0.0,  0.0}, { 0.0,  0.0}};
+
+    //----------------------------------------------------------------------------------------------
+    // gates
+    const int adjoint = 0;
+    const custatevecMatrixLayout_t layout = CUSTATEVEC_MATRIX_LAYOUT_ROW;
+
+    // Hadamard gate
+    const int hTargets[] = {2};
+    const uint32_t hNTargets = 1;
+    const double Rsqrt2 = 1. / std::sqrt(2.);
+    cuDoubleComplex hGate[] = {{Rsqrt2, 0.0}, {Rsqrt2, 0.0},
+                               {Rsqrt2, 0.0}, {-Rsqrt2, 0.0}};
+
+    // control-SWAP gate
+    const int swapTargets[] = {0, 1};
+    const uint32_t swapNTargets = 2;
+    const int swapControls[] = {2};
+    const uint32_t swapNControls = 1;
+    cuDoubleComplex swapGate[] = {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
+                                  {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
+                                  {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
+                                  {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}};
+
+    // observable
+    const int basisBits[] = {2};
+    const uint32_t nBasisBits = 1;
+    cuDoubleComplex observable[] = {{1.0, 0.0}, {0.0, 0.0},
+                                    {0.0, 0.0}, {0.0, 0.0}};
+
+    //----------------------------------------------------------------------------------------------
+    // device configuration
+    int deviceId;
+    HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) );
+
+    cudaError_t status;
+    int isMemPoolSupported;
+    status = cudaDeviceGetAttribute(&isMemPoolSupported, cudaDevAttrMemoryPoolsSupported, deviceId);
+    if (status != cudaSuccess || !isMemPoolSupported) {
+        printf("memory handler example WAIVED: CUDA memory pools are not supported.\n");
+        return EXIT_SUCCESS;
+    }
+
+    cudaMemPool_t memPool;
+    HANDLE_CUDA_ERROR( cudaDeviceGetDefaultMemPool(&memPool, deviceId) );
+
+    // avoid shrinking the pool
+    uint64_t threshold = UINT64_MAX;
+    cudaMemPoolSetAttribute(memPool, cudaMemPoolAttrReleaseThreshold, &threshold);
+
+    cudaStream_t stream;
+    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
+
+    //----------------------------------------------------------------------------------------------
+    // data transfer of state vector
+    cuDoubleComplex *d_sv;
+    HANDLE_CUDA_ERROR( cudaMallocAsync((void**)&d_sv, nSvSize * sizeof(cuDoubleComplex), stream) );
+
+    HANDLE_CUDA_ERROR( cudaMemcpyAsync(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex),
+                       cudaMemcpyHostToDevice, stream) );
+
+    //----------------------------------------------------------------------------------------------
+    // custatevec handle initialization
+    custatevecHandle_t handle;
+    HANDLE_ERROR( custatevecCreate(&handle) );
+    HANDLE_ERROR( custatevecSetStream(handle, stream) );
+
+    // device memory handler
+    custatevecDeviceMemHandler_t handler;
+    handler.ctx = &memPool;
+    handler.device_alloc = myMemPoolAlloc;
+    handler.device_free = myMemPoolFree;
+    strcpy(handler.name, "mempool");
+    HANDLE_ERROR( custatevecSetDeviceMemHandler(handle, &handler) );
+
+    // apply Hadamard gate
+    HANDLE_ERROR( custatevecApplyMatrix(
+                  handle, d_sv, CUDA_C_64F, nIndexBits, hGate, CUDA_C_64F,
+                  layout, adjoint, hTargets, hNTargets, nullptr, nullptr, 0,
+                  CUSTATEVEC_COMPUTE_DEFAULT, nullptr, 0) );
+
+    // apply control-SWAP gate
+    HANDLE_ERROR( custatevecApplyMatrix(
+                  handle, d_sv, CUDA_C_64F, nIndexBits, swapGate, CUDA_C_64F,
+                  layout, adjoint, swapTargets, swapNTargets, swapControls, nullptr, swapNControls,
+                  CUSTATEVEC_COMPUTE_DEFAULT, nullptr, 0) );
+
+    // apply Hadamard gate
+    HANDLE_ERROR( custatevecApplyMatrix(
+                  handle, d_sv, CUDA_C_64F, nIndexBits, hGate, CUDA_C_64F,
+                  layout, adjoint, hTargets, hNTargets, nullptr, nullptr, 0,
+                  CUSTATEVEC_COMPUTE_DEFAULT, nullptr, 0) );
+
+    // compute expectation
+    double expectationValue;
+    HANDLE_ERROR( custatevecComputeExpectation(
+                  handle, d_sv, CUDA_C_64F, nIndexBits, &expectationValue, CUDA_R_64F, nullptr,
+                  observable, CUDA_C_64F, layout, basisBits, nBasisBits,
+                  CUSTATEVEC_COMPUTE_DEFAULT, nullptr, 0) );
+
+    HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
+
+    // destroy handle
+    HANDLE_ERROR( custatevecDestroy(handle) );
+
+    //----------------------------------------------------------------------------------------------
+
+    // release device memory and stream
+    HANDLE_CUDA_ERROR( cudaFreeAsync(d_sv, stream) );
+    HANDLE_CUDA_ERROR( cudaStreamDestroy(stream) );
+
+    double expectationValueResult = 0.9608;
+    bool correct = almost_equal(expectationValue, expectationValueResult);
+    if (correct) {
+        printf("memory_handler example PASSED\n");
+        return EXIT_SUCCESS;
+    }
+    else {
+        printf("memory_handler example FAILED: wrong result\n");
+        return EXIT_FAILURE;
+    }
+}
+
+#else
+int main(void) {
+    printf("memory_handler example WAIVED: This example uses CUDA's built-in stream-ordered memory allocator, which requires CUDA 11.2+.\n");
+    return EXIT_SUCCESS;
+}
+#endif
\ No newline at end of file
diff --git a/samples/custatevec/mgpu_batch_measure.cu b/samples/custatevec/mgpu_batch_measure.cu
new file mode 100644
index 0000000..40d1dd8
--- /dev/null
+++ b/samples/custatevec/mgpu_batch_measure.cu
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - Neither the name(s) of the copyright holder(s) nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cuda_runtime_api.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h>         // cuDoubleComplex
+#include <custatevec.h>        // custatevecBatchMeasureWithOffset
+#include <stdio.h>             // printf
+#include <stdlib.h>            // EXIT_FAILURE
+
+#include "helper.hpp"          // HANDLE_ERROR, HANDLE_CUDA_ERROR
+
+int main(int argc, char** argv) {
+
+    const int nGlobalBits = 2;
+    const int nLocalBits  = 2;
+    const int nSubSvs     = (1 << nGlobalBits);
+    const int subSvSize   = (1 << nLocalBits);
+    const int bitStringLen = 2;
+
+    const int bitOrdering[] = {1, 0};
+
+    int bitString[bitStringLen];
+    const int bitString_result[] = {0, 0};
+
+    // In a real application, a random number in the range [0, 1) would be used.
+ const double randnum = 0.72; + + cuDoubleComplex h_sv[][subSvSize] = {{{ 0.000, 0.000}, { 0.000, 0.125}, { 0.000, 0.250}, { 0.000, 0.375}}, + {{ 0.000, 0.000}, { 0.000,-0.125}, { 0.000,-0.250}, { 0.000,-0.375}}, + {{ 0.125, 0.000}, { 0.125,-0.125}, { 0.125,-0.250}, { 0.125,-0.375}}, + {{-0.125, 0.000}, {-0.125,-0.125}, {-0.125,-0.250}, {-0.125,-0.375}}}; + cuDoubleComplex h_sv_result[][subSvSize] = {{{ 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}}, + {{ 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}}, + {{ 0.707107, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}}, + {{-0.707107, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}, { 0.0, 0.0}}}; + + cuDoubleComplex *d_sv[nSubSvs]; + + // device allocation + int numDevices; + int devices[nSubSvs]; + if (argc == 1) + { + HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) ); + for (int i = 0; i < nSubSvs; i++) { + devices[i] = i % numDevices; + } + } + else { + numDevices = min(argc - 1, nSubSvs); + for (int i = 0; i < numDevices; i++) { + const int deviceId = atoi(argv[i + 1]); + devices[i] = deviceId; + } + for (int i = numDevices; i < nSubSvs; i++) { + devices[i] = devices[i % numDevices]; + } + } + + printf("The following devices will be used in this sample: \n"); + for (int iSv = 0; iSv < nSubSvs; iSv++) { + printf(" sub-SV #%d : device id %d\n", iSv, devices[iSv]); + } + + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv[iSv], subSvSize * sizeof(cuDoubleComplex)) ); + HANDLE_CUDA_ERROR( cudaMemcpy(d_sv[iSv], h_sv[iSv], subSvSize * sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice) ); + } + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle[nSubSvs]; + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_ERROR( custatevecCreate(&handle[iSv]) ); + } + + // get abs2sum for each sub state vector + double abs2SumArray[nSubSvs]; + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_ERROR( custatevecAbs2SumArray( + handle[iSv], d_sv[iSv], CUDA_C_64F, nLocalBits, &abs2SumArray[iSv], nullptr, + 0, nullptr, nullptr, 0) ); + } + + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_CUDA_ERROR( cudaDeviceSynchronize() ); + } + + // get cumulative array + double cumulativeArray[nSubSvs + 1]; + cumulativeArray[0] = 0.0; + for (int iSv = 0; iSv < nSubSvs; iSv++) { + cumulativeArray[iSv + 1] = cumulativeArray[iSv] + abs2SumArray[iSv]; + } + + // measurement + for (int iSv = 0; iSv < nSubSvs; iSv++) { + if (cumulativeArray[iSv] <= randnum && randnum < cumulativeArray[iSv + 1]) { + double norm = cumulativeArray[nSubSvs]; + double offset = cumulativeArray[iSv]; + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_ERROR( custatevecBatchMeasureWithOffset( + handle[iSv], d_sv[iSv], CUDA_C_64F, nLocalBits, bitString, bitOrdering, + bitStringLen, randnum, CUSTATEVEC_COLLAPSE_NONE, offset, norm) ); + } + } + + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_CUDA_ERROR( cudaDeviceSynchronize() ); + } + + // get abs2Sum after collapse + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_ERROR( custatevecAbs2SumArray( + handle[iSv], d_sv[iSv], CUDA_C_64F, nLocalBits, &abs2SumArray[iSv], nullptr, + 0, bitString, 
bitOrdering, bitStringLen) );
+    }
+
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
+        HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
+    }
+
+    // get norm after collapse
+    double norm = 0.0;
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        norm += abs2SumArray[iSv];
+    }
+
+    // collapse sub state vectors
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
+        HANDLE_ERROR( custatevecCollapseByBitString(
+                      handle[iSv], d_sv[iSv], CUDA_C_64F, nLocalBits, bitString, bitOrdering,
+                      bitStringLen, norm) );
+    }
+
+    // destroy handle
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
+        HANDLE_ERROR( custatevecDestroy(handle[iSv]) );
+    }
+
+    //----------------------------------------------------------------------------------------------
+
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaMemcpy(h_sv[iSv], d_sv[iSv], subSvSize * sizeof(cuDoubleComplex),
+                           cudaMemcpyDeviceToHost) );
+    }
+
+    bool correct = true;
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        for (int i = 0; i < subSvSize; i++) {
+            if (!almost_equal(h_sv[iSv][i], h_sv_result[iSv][i])) {
+                correct = false;
+                break;
+            }
+        }
+    }
+
+    for (int i = 0; i < bitStringLen; i++) {
+        if (bitString[i] != bitString_result[i]) {
+            correct = false;
+            break;
+        }
+    }
+
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaFree(d_sv[iSv]) );
+    }
+
+    if (correct) {
+        printf("mgpu_batch_measure example PASSED\n");
+        return EXIT_SUCCESS;
+    }
+    else {
+        printf("mgpu_batch_measure example FAILED: wrong result\n");
+        return EXIT_FAILURE;
+    }
+
+}
\ No newline at end of file
diff --git a/samples/custatevec/mgpu_sampler.cu b/samples/custatevec/mgpu_sampler.cu
new file mode 100644
index 0000000..ebf1590
--- /dev/null
+++ b/samples/custatevec/mgpu_sampler.cu
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - Neither the name(s) of the copyright holder(s) nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cuda_runtime_api.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h>         // cuDoubleComplex
+#include <custatevec.h>        // custatevecSampler
+#include <stdio.h>             // printf
+#include <stdlib.h>            // EXIT_FAILURE
+
+#include "helper.hpp"          // HANDLE_ERROR, HANDLE_CUDA_ERROR
+
+double* binarySearch(double* first, double* last, const double value) {
+    double* it;
+    int count = last - first;
+    while (count > 0) {
+        it = first;
+        int step = count / 2;
+        it += step;
+        if (*it < value) {
+            first = ++it;
+            count -= step + 1;
+        }
+        else {
+            count = step;
+        }
+    }
+    return first;
+}
+
+int main(int argc, char** argv) {
+
+    const int nGlobalBits = 2;
+    const int nLocalBits  = 2;
+    const int nSubSvs     = (1 << nGlobalBits);
+    const int subSvSize   = (1 << nLocalBits);
+
+    const int nMaxShots = 5;
+    const int nShots = 5;
+
+    const int bitStringLen = 4;
+    const int bitOrdering[] = {0, 1, 2, 3};
+
+    custatevecIndex_t bitStrings[nShots];
+    custatevecIndex_t bitStrings_result[nShots] = {0b0011, 0b0011, 0b0111, 0b1011, 0b1110};
+
+    // In a real application, random numbers in the range [0, 1) would be used.
+    double randnums[] = {0.1, 0.2, 0.4, 0.6, 0.8};
+
+    cuDoubleComplex h_sv[][subSvSize] = {{{ 0.000, 0.000}, { 0.000, 0.125}, { 0.000, 0.250}, { 0.000, 0.375}},
+                                         {{ 0.000, 0.000}, { 0.000,-0.125}, { 0.000,-0.250}, { 0.000,-0.375}},
+                                         {{ 0.125, 0.000}, { 0.125,-0.125}, { 0.125,-0.250}, { 0.125,-0.375}},
+                                         {{-0.125, 0.000}, {-0.125,-0.125}, {-0.125,-0.250}, {-0.125,-0.375}}};
+
+    custatevecSamplerDescriptor_t sampler[nSubSvs];
+
+    cuDoubleComplex *d_sv[nSubSvs];
+
+    // device allocation
+    int numDevices;
+    int devices[nSubSvs];
+    if (argc == 1)
+    {
+        HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
+        for (int i = 0; i < nSubSvs; i++) {
+            devices[i] = i % numDevices;
+        }
+    }
+    else {
+        numDevices = min(argc - 1, nSubSvs);
+        for (int i = 0; i < numDevices; i++) {
+            const int deviceId = atoi(argv[i + 1]);
+            devices[i] = deviceId;
+        }
+        for (int i = numDevices; i < nSubSvs; i++) {
+            devices[i] = devices[i % numDevices];
+        }
+    }
+
+    printf("The following devices will be used in this sample: \n");
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        printf("  sub-SV #%d : device id %d\n", iSv, devices[iSv]);
+    }
+
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
+        HANDLE_CUDA_ERROR( cudaMalloc((void**)&d_sv[iSv], subSvSize * sizeof(cuDoubleComplex)) );
+        HANDLE_CUDA_ERROR( cudaMemcpy(d_sv[iSv], h_sv[iSv], subSvSize * sizeof(cuDoubleComplex),
+                           cudaMemcpyHostToDevice) );
+    }
+
+    //----------------------------------------------------------------------------------------------
+
+    // custatevec handle initialization
+    custatevecHandle_t handle[nSubSvs];
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
+        HANDLE_ERROR( custatevecCreate(&handle[iSv]) );
+    }
+
+    void* extraWorkspace[nSubSvs];
+    size_t extraWorkspaceSizeInBytes[nSubSvs];
+
+    // create sampler and check the size of external workspace
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
+        HANDLE_ERROR( custatevecSamplerCreate(
+                      handle[iSv], d_sv[iSv], CUDA_C_64F, nLocalBits, &sampler[iSv], nMaxShots,
+                      &extraWorkspaceSizeInBytes[iSv]) );
+    }
+
+    // allocate external workspace if necessary
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        if (extraWorkspaceSizeInBytes[iSv] > 0) {
+            HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
+            HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace[iSv], extraWorkspaceSizeInBytes[iSv]) );
+        }
+    }
+
+    // sample preprocess
+    for (int iSv = 0; iSv < nSubSvs; iSv++) {
+        HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) );
HANDLE_ERROR( custatevecSamplerPreprocess( + handle[iSv], sampler[iSv], extraWorkspace[iSv], + extraWorkspaceSizeInBytes[iSv]) ); + } + + // get norm of the sub state vectors + double subNorms[nSubSvs]; + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_ERROR( custatevecSamplerGetSquaredNorm( + handle[iSv], sampler[iSv], &subNorms[iSv]) ); + } + + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_CUDA_ERROR( cudaDeviceSynchronize() ); + } + + // get cumulative array + double cumulativeArray[nSubSvs + 1]; + cumulativeArray[0] = 0.0; + for (int iSv = 0; iSv < nSubSvs; iSv++) { + cumulativeArray[iSv + 1] = cumulativeArray[iSv] + subNorms[iSv]; + } + double norm = cumulativeArray[nSubSvs]; + + // apply offset and norm + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_ERROR( custatevecSamplerApplySubSVOffset( + handle[iSv], sampler[iSv], iSv, nSubSvs, cumulativeArray[iSv], norm) ); + } + + // divide randnum array + int shotOffsets[nSubSvs + 1]; + shotOffsets[0] = 0; + for (int iSv = 0; iSv < nSubSvs; iSv++) { + double* pos = binarySearch(randnums, randnums + nShots, cumulativeArray[iSv + 1] / norm); + if (iSv == nSubSvs - 1) { + pos = randnums + nShots; + } + shotOffsets[iSv + 1] = pos - randnums; + } + + // sample bit strings + for (int iSv = 0; iSv < nSubSvs; iSv++) { + int shotOffset = shotOffsets[iSv]; + int nSubShots = shotOffsets[iSv + 1] - shotOffsets[iSv]; + if (nSubShots > 0) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_ERROR( custatevecSamplerSample( + handle[iSv], sampler[iSv], &bitStrings[shotOffset], bitOrdering, + bitStringLen, &randnums[shotOffset], nSubShots, + CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER) ); + } + } + + // destroy sampler descriptor and custatevec handle + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaSetDevice(devices[iSv]) ); + HANDLE_CUDA_ERROR( cudaDeviceSynchronize() ); + HANDLE_ERROR( custatevecSamplerDestroy(sampler[iSv]) ); + HANDLE_ERROR( custatevecDestroy(handle[iSv]) ); + } + + //---------------------------------------------------------------------------------------------- + + bool correct = true; + for (int i = 0; i < nShots; i++) { + if (bitStrings[i] != bitStrings_result[i]) { + correct = false; + break; + } + } + + for (int iSv = 0; iSv < nSubSvs; iSv++) { + HANDLE_CUDA_ERROR( cudaFree(d_sv[iSv]) ); + if (extraWorkspaceSizeInBytes[iSv] > 0) { + HANDLE_CUDA_ERROR( cudaFree(extraWorkspace[iSv]) ); + } + } + + if (correct) { + printf("mgpu_sampler example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("mgpu_sampler example FAILED: wrong result\n"); + return EXIT_FAILURE; + } +} diff --git a/samples/custatevec/permutation_matrix.cu b/samples/custatevec/permutation_matrix.cu index 59c5b14..1c1a906 100644 --- a/samples/custatevec/permutation_matrix.cu +++ b/samples/custatevec/permutation_matrix.cu @@ -70,7 +70,7 @@ int main(void) { size_t extraWorkspaceSizeInBytes = 0; // check the size of external workspace - HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrix_bufferSize( + HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrixGetWorkspaceSize( handle, CUDA_C_64F, nIndexBits, permutation, diagonals, CUDA_C_64F, basisBits, nBasisBits, maskLen, &extraWorkspaceSizeInBytes) ); @@ -81,7 +81,7 @@ int main(void) { // apply matrix HANDLE_ERROR( custatevecApplyGeneralizedPermutationMatrix( handle, d_sv, CUDA_C_64F, nIndexBits, permutation, 
diagonals, CUDA_C_64F,
-        adjoint, basisBits, nBasisBits, maskBitString, maskOrdering, maskLen,
+        adjoint, basisBits, nBasisBits, maskOrdering, maskBitString, maskLen,
         extraWorkspace, extraWorkspaceSizeInBytes) );
 
     // destroy handle
diff --git a/samples/custatevec/sampler.cu b/samples/custatevec/sampler.cu
index f7206ee..238d6a6 100644
--- a/samples/custatevec/sampler.cu
+++ b/samples/custatevec/sampler.cu
@@ -72,7 +72,7 @@ int main(void) {
     size_t extraWorkspaceSizeInBytes = 0;
 
     // create sampler and check the size of external workspace
-    HANDLE_ERROR( custatevecSampler_create(
+    HANDLE_ERROR( custatevecSamplerCreate(
                   handle, d_sv, CUDA_C_64F, nIndexBits, &sampler, nMaxShots,
                   &extraWorkspaceSizeInBytes) );
 
@@ -81,15 +81,16 @@ int main(void) {
         HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) );
 
     // sample preprocess
-    HANDLE_ERROR( custatevecSampler_preprocess(
-                  handle, &sampler, extraWorkspace, extraWorkspaceSizeInBytes) );
+    HANDLE_ERROR( custatevecSamplerPreprocess(
+                  handle, sampler, extraWorkspace, extraWorkspaceSizeInBytes) );
 
     // sample bit strings
-    HANDLE_ERROR( custatevecSampler_sample(
-                  handle, &sampler, bitStrings, bitOrdering, bitStringLen, randnums, nShots,
+    HANDLE_ERROR( custatevecSamplerSample(
+                  handle, sampler, bitStrings, bitOrdering, bitStringLen, randnums, nShots,
                   CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER) );
 
-    // destroy handle
+    // destroy descriptor and handle
+    HANDLE_ERROR( custatevecSamplerDestroy(sampler) );
     HANDLE_ERROR( custatevecDestroy(handle) );
 
     //----------------------------------------------------------------------------------------------
diff --git a/samples/custatevec/swap_index_bits.cu b/samples/custatevec/swap_index_bits.cu
new file mode 100644
index 0000000..fc93155
--- /dev/null
+++ b/samples/custatevec/swap_index_bits.cu
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - Neither the name(s) of the copyright holder(s) nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cuda_runtime_api.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h>         // cuDoubleComplex
+#include <custatevec.h>        // custatevecSwapIndexBits
+#include <stdio.h>             // printf
+#include <stdlib.h>            // EXIT_FAILURE
+
+#include "helper.hpp"          // HANDLE_ERROR, HANDLE_CUDA_ERROR
+
+int main(void) {
+
+    const int nIndexBits = 3;
+    const int nSvSize    = (1 << nIndexBits);
+
+    // swap 0th and 2nd qubits
+    const int nBitSwaps = 1;
+    const int2 bitSwaps[] = {{0, 2}};
+
+    // swap the state vector elements only if 1st qubit is 1
+    const int maskLen = 1;
+    int maskBitString[] = {1};
+    int maskOrdering[] = {1};
+
+    // 0.2|001> + 0.4|011> - 0.4|101> - 0.8|111>
+    cuDoubleComplex h_sv[] = {{ 0.0, 0.0}, { 0.2, 0.0}, { 0.0, 0.0}, { 0.4, 0.0},
+                              { 0.0, 0.0}, {-0.4, 0.0}, { 0.0, 0.0}, {-0.8, 0.0}};
+
+    // 0.2|001> + 0.4|110> - 0.4|101> - 0.8|111>
+    cuDoubleComplex h_sv_result[] = {{ 0.0, 0.0}, { 0.2, 0.0}, { 0.0, 0.0}, { 0.0, 0.0},
+                                     { 0.0, 0.0}, {-0.4, 0.0}, { 0.4, 0.0}, {-0.8, 0.0}};
+
+    cuDoubleComplex *d_sv;
+    HANDLE_CUDA_ERROR( cudaMalloc(&d_sv, nSvSize * sizeof(cuDoubleComplex)) );
+
+    HANDLE_CUDA_ERROR( cudaMemcpy(d_sv, h_sv, nSvSize * sizeof(cuDoubleComplex),
+                       cudaMemcpyHostToDevice) );
+
+    //----------------------------------------------------------------------------------------------
+
+    // custatevec handle initialization
+    custatevecHandle_t handle;
+    HANDLE_ERROR( custatevecCreate(&handle) );
+
+    // bit swap
+    HANDLE_ERROR( custatevecSwapIndexBits(
+                  handle, d_sv, CUDA_C_64F, nIndexBits, bitSwaps, nBitSwaps,
+                  maskBitString, maskOrdering, maskLen) );
+
+    // destroy handle
+    HANDLE_ERROR( custatevecDestroy(handle) );
+
+    //----------------------------------------------------------------------------------------------
+
+    HANDLE_CUDA_ERROR( cudaMemcpy(h_sv, d_sv, nSvSize * sizeof(cuDoubleComplex),
+                       cudaMemcpyDeviceToHost) );
+
+    bool correct = true;
+    for (int i = 0; i < nSvSize; i++) {
+        if (!almost_equal(h_sv[i], h_sv_result[i])) {
+            correct = false;
+            break;
+        }
+    }
+
+    HANDLE_CUDA_ERROR( cudaFree(d_sv) );
+
+    if (correct) {
+        printf("swap_index_bits example PASSED\n");
+        return EXIT_SUCCESS;
+    }
+    else {
+        printf("swap_index_bits example FAILED: wrong result\n");
+        return EXIT_FAILURE;
+    }
+
+}
diff --git a/samples/custatevec/test_matrix_type.cu b/samples/custatevec/test_matrix_type.cu
new file mode 100644
index 0000000..c66b904
--- /dev/null
+++ b/samples/custatevec/test_matrix_type.cu
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * - Neither the name(s) of the copyright holder(s) nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cuda_runtime_api.h>  // cudaMalloc, cudaMemcpy, etc.
+#include <cuComplex.h>         // cuDoubleComplex
+#include <custatevec.h>        // custatevecTestMatrixType
+#include <stdio.h>             // printf
+#include <stdlib.h>            // EXIT_FAILURE
+
+#include "helper.hpp"          // HANDLE_ERROR, HANDLE_CUDA_ERROR
+
+double runTestMatrixType(custatevecHandle_t handle,
+                         custatevecMatrixType_t matrixType,
+                         const void* matrix,
+                         cudaDataType_t matrixDataType,
+                         custatevecMatrixLayout_t layout,
+                         const uint32_t nTargets,
+                         const int32_t adjoint,
+                         custatevecComputeType_t computeType) {
+
+    double residualNorm;
+
+    void* extraWorkspace = nullptr;
+    size_t extraWorkspaceSizeInBytes = 0;
+
+    // check the size of external workspace
+    HANDLE_ERROR( custatevecTestMatrixTypeGetWorkspaceSize(
+                  handle, matrixType, matrix, matrixDataType, layout,
+                  nTargets, adjoint, computeType, &extraWorkspaceSizeInBytes) );
+
+    // allocate external workspace if necessary
+    if (extraWorkspaceSizeInBytes > 0)
+        HANDLE_CUDA_ERROR( cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes) );
+
+    // execute testing
+    HANDLE_ERROR( custatevecTestMatrixType(
+                  handle, &residualNorm, matrixType, matrix, matrixDataType, layout,
+                  nTargets, adjoint, computeType, extraWorkspace, extraWorkspaceSizeInBytes) );
+
+    HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
+
+    if (extraWorkspaceSizeInBytes)
+        HANDLE_CUDA_ERROR( cudaFree(extraWorkspace) );
+
+    return residualNorm;
+}
+
+int main(void) {
+
+    const int nTargets = 1;
+    const int adjoint = 0;
+
+    // unitary and Hermitian matrix
+    const double Rsqrt2 = 1. / std::sqrt(2.);
/ std::sqrt(2.); + cuDoubleComplex matrix[] = {{0.5, 0.0}, {Rsqrt2, -0.5}, + {Rsqrt2, 0.5}, {-0.5, 0.0}}; + + //---------------------------------------------------------------------------------------------- + + // custatevec handle initialization + custatevecHandle_t handle; + HANDLE_ERROR( custatevecCreate(&handle) ); + + cudaDataType_t matrixDataType = CUDA_C_64F; + custatevecMatrixLayout_t layout = CUSTATEVEC_MATRIX_LAYOUT_ROW; + custatevecComputeType_t computeType = CUSTATEVEC_COMPUTE_DEFAULT; + + double unitaryResidualNorm = runTestMatrixType(handle, CUSTATEVEC_MATRIX_TYPE_UNITARY, matrix, + matrixDataType, layout, nTargets, adjoint, + computeType) ; + + double hermiteResidualNorm = runTestMatrixType(handle, CUSTATEVEC_MATRIX_TYPE_HERMITIAN, matrix, + matrixDataType, layout, nTargets, adjoint, + computeType) ; + + // destroy handle + HANDLE_ERROR( custatevecDestroy(handle) ); + + //---------------------------------------------------------------------------------------------- + + bool correct = true; + + correct &= almost_equal(unitaryResidualNorm, 0.); + correct &= almost_equal(hermiteResidualNorm, 0.); + + if (correct) { + printf("test_matrix_type example PASSED\n"); + return EXIT_SUCCESS; + } + else { + printf("test_matrix_type example FAILED: wrong result\n"); + return EXIT_FAILURE; + } + +} \ No newline at end of file diff --git a/samples/cutensornet/CMakeLists.txt b/samples/cutensornet/CMakeLists.txt index 86c6927..b28376d 100644 --- a/samples/cutensornet/CMakeLists.txt +++ b/samples/cutensornet/CMakeLists.txt @@ -52,7 +52,7 @@ else () endif () # ########################################## -# custatevec_example check-env utilities +# cutensornet_example check-env utilities # ########################################## function(set_with_fallback VARIABLE FALLBACK) diff --git a/samples/cutensornet/README.md b/samples/cutensornet/README.md index 5aa6cee..173d48b 100644 --- a/samples/cutensornet/README.md +++ b/samples/cutensornet/README.md @@ -2,36 +2,50 @@ * [Documentation](https://docs.nvidia.com/cuda/cutensornet/index.html) -# Install +## Install -## Linux +### Linux -You can use make to compile the cuTensorNet samples. The option CUTENSORNET_ROOT need to be defined if cuTensorNet is not the CUDA installation folder. - -With make +You can use make to compile the cuTensorNet samples. The environment variables `CUTENSOR_ROOT` and `CUTENSORNET_ROOT` need to be defined if cuTENSOR and cuTensorNet is not the CUDA installation folder. +Using `make`: ``` -export CUTENSORNET_ROOT= +export CUTENSOR_ROOT= +export CUTENSORNET_ROOT= make -j8 ``` +or `cmake`: +``` +export CUTENSOR_ROOT= +export CUTENSORNET_ROOT= +cmake . && make +``` + +## Run + +To execute the sample, simply run: +``` +./tensornet_example +``` -# Support +## Support * **Supported SM Architectures:** SM 7.0, SM 7.5, SM 8.0, SM 8.6 * **Supported OSes:** Linux -* **Supported CPU Architectures**: x86_64, arm64 -* **Language**: `C++11` +* **Supported CPU Architectures**: x86_64, aarch64-sbsa, ppc64le +* **Language**: C++11 or above + +## Prerequisites -# Prerequisites +* [CUDA Toolkit 11.x](https://developer.nvidia.com/cuda-downloads) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). -* [CUDA 1X.X toolkit](https://developer.nvidia.com/cuda-downloads) (or above) and compatible driver (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). 
+## Description
+This sample helps users get familiar with cuTensorNet. It provides an example of calling cuTensorNet to find a contraction path as well as to perform the contraction.
 
-# Description
-This sample helps users get familiar with cuTensorNet.
-It provides an example of calling cuTensorNet to find a contraction path and as well as performing the contraction.
 The sample consists of:
-* Defining a Tensor Network (Create Contraction Descriptor using "cutensornetCreateNetworkDescriptor").
-* Find a close-to-optimal order of contraction via "cutensornetContractionOptimize". Users can control some parameters of the cutensornetContractionOptimize (e.g., path finder) using the "cutensornetContractionOptimizerConfigSetAttribute" function. Users also can provide their own path and use the SetAttribute tool to set the Info structure to their own path.
-* Create a planning to performs the contraction using "cutensornetCreateContractionPlan". This step will prepare a planning for the execution of list of the pairwise contractions provided by the path.
-* Users can optionally call "cutensornetContractionAutotune" to perform autotuning and choose the best performing kernel for the corresponding path such that the winner kernels will be called for all subsequent calls to performs the contraction "cutensornetContraction". The autotuning could bring improvement in particular when "cutensornetContraction" is called multiple times.
-* Performs the computation of the contraction using "cutensornetContraction".
+* Define a tensor network using `cutensornetCreateNetworkDescriptor`.
+* Find a close-to-optimal order of contraction (i.e., a contraction path) via `cutensornetContractionOptimize`. Users can control the parameters of `cutensornetContractionOptimize` (e.g., the pathfinder) using the `cutensornetContractionOptimizerConfigSetAttribute` function. Users can also provide their own path and use the `cutensornetContractionOptimizerInfoSetAttribute` API to set the `cutensornetContractionOptimizerInfo_t` structure to that path.
+* Create a contraction plan for performing the contraction using `cutensornetCreateContractionPlan`. This step prepares a plan for executing the list of pairwise contractions given by the path.
+* Optionally, call `cutensornetContractionAutotune` to perform autotuning, which chooses the best-performing kernels for the corresponding path so that the winning kernels are used for all subsequent calls to `cutensornetContraction`. Autotuning is particularly beneficial when `cutensornetContraction` is called multiple times with the same plan/network.
+* Perform the computation of the contraction using `cutensornetContraction`.
+* Free the cuTensorNet resources.
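To make the flow above concrete, here is a condensed sketch of the call sequence the new README describes, using only API calls that appear in this patch. The helper name `contractNetwork` and its argument list are illustrative, not part of the sample: it assumes the handle, network descriptor, optimizer config/info, device buffers `rawDataIn_d`/`D_d`, a preallocated device workspace `work` of size `worksize`, the slice count, and the CUDA stream are set up as in `tensornet_example.cu`, and it omits the `HANDLE_ERROR` status checks for brevity.

```
#include <cuda_runtime.h>
#include <cutensornet.h>

// Illustrative helper: optimize -> attach workspace -> plan -> contract -> free.
void contractNetwork(cutensornetHandle_t handle,
                     cutensornetNetworkDescriptor_t descNet,
                     cutensornetContractionOptimizerConfig_t optimizerConfig,
                     cutensornetContractionOptimizerInfo_t optimizerInfo,
                     const void* const rawDataIn_d[], void* D_d,
                     void* work, uint64_t worksize,
                     int64_t numSlices, cudaStream_t stream)
{
    // Find a contraction path; worksize bounds the memory the path may need.
    cutensornetContractionOptimize(handle, descNet, optimizerConfig,
                                   worksize, optimizerInfo);

    // Attach the device workspace through a workspace descriptor.
    cutensornetWorkspaceDescriptor_t workDesc;
    cutensornetCreateWorkspaceDescriptor(handle, &workDesc);
    cutensornetWorkspaceComputeSizes(handle, descNet, optimizerInfo, workDesc);
    cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
                            work, worksize);

    // Create a plan for the pairwise contractions on the chosen path.
    // (Optionally autotune the plan here via cutensornetContractionAutotune.)
    cutensornetContractionPlan_t plan;
    cutensornetCreateContractionPlan(handle, descNet, optimizerInfo,
                                     workDesc, &plan);

    // Contract all slices; every slice reuses the same plan.
    for (int64_t sliceId = 0; sliceId < numSlices; ++sliceId)
        cutensornetContraction(handle, plan, rawDataIn_d, D_d,
                               workDesc, sliceId, stream);

    // Free the cuTensorNet resources created by this helper.
    cutensornetDestroyContractionPlan(plan);
    cutensornetDestroyWorkspaceDescriptor(workDesc);
}
```

The same sequence, with full error handling and the workspace-size check, appears in the `tensornet_example.cu` diff below.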
diff --git a/samples/cutensornet/tensornet_example.cu b/samples/cutensornet/tensornet_example.cu
index 4f57081..d40d339 100644
--- a/samples/cutensornet/tensornet_example.cu
+++ b/samples/cutensornet/tensornet_example.cu
@@ -233,7 +233,7 @@ int main()
     const int32_t nmodeD = modesD.size();
 
     /*******************************
-    * Create Contraction Descriptor
+    * Create Network Descriptor
     *******************************/
 
     const int32_t* modesIn[] = {modesA.data(), modesB.data(), modesC.data()};
@@ -277,9 +277,9 @@ int main()
     cutensornetContractionOptimizerConfig_t optimizerConfig;
     HANDLE_ERROR(cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));
 
-    // Set the value of the partitioner imbalance factor, if desired
-    int imbalance_factor = 30;
-    HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(
+    // Set the value of the partitioner imbalance factor, if desired
+    int imbalance_factor = 30;
+    HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(
         handle,
         optimizerConfig,
         CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR,
@@ -311,95 +311,123 @@ int main()
     * Initialize all pair-wise contraction plans (for cuTENSOR)
     *******************************/
     cutensornetContractionPlan_t plan;
-    HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
-                                                   descNet,
-                                                   optimizerInfo,
-                                                   worksize,
-                                                   &plan) );
-
-    /*******************************
-    * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
-    *******************************/
-    cutensornetContractionAutotunePreference_t autotunePref;
-    HANDLE_ERROR(cutensornetCreateContractionAutotunePreference(handle,
-                                                                &autotunePref));
-
-    const int numAutotuningIterations = 5; // may be 0
-    HANDLE_ERROR(cutensornetContractionAutotunePreferenceSetAttribute(
-        handle,
-        autotunePref,
-        CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
-        &numAutotuningIterations,
-        sizeof(numAutotuningIterations)));
-
-    // modify the plan again to find the best pair-wise contractions
-    HANDLE_ERROR(cutensornetContractionAutotune(handle,
-                                                plan,
-                                                rawDataIn_d,
-                                                D_d,
-                                                work, worksize,
-                                                autotunePref,
-                                                stream));
-
-    HANDLE_ERROR(cutensornetDestroyContractionAutotunePreference(autotunePref));
-
-    printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");
-
-    /**********************
-    * Run
-    **********************/
-    GPUTimer timer;
-    double minTimeCUTENSOR = 1e100;
-    const int numRuns = 3; // to get stable perf results
-    for (int i=0; i < numRuns; ++i)
+    cutensornetWorkspaceDescriptor_t workDesc;
+    HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
+
+    uint64_t requiredWorkspaceSize = 0;
+    HANDLE_ERROR(cutensornetWorkspaceComputeSizes(handle,
+                                                  descNet,
+                                                  optimizerInfo,
+                                                  workDesc));
+
+    HANDLE_ERROR(cutensornetWorkspaceGetSize(handle,
+                                             workDesc,
+                                             CUTENSORNET_WORKSIZE_PREF_MIN,
+                                             CUTENSORNET_MEMSPACE_DEVICE,
+                                             &requiredWorkspaceSize));
+    if (worksize < requiredWorkspaceSize)
+    {
+        printf("Not enough workspace memory is available.\n");
+    }
+    else
     {
-        cudaMemcpy(D_d, D, sizeD, cudaMemcpyHostToDevice); // restore output
-        cudaDeviceSynchronize();
-
-        /*
-        * Contract over all slices.
-        *
-        * A user may choose to parallelize this loop across multiple devices.
-        */
-        for(int64_t sliceId=0; sliceId < numSlices; ++sliceId)
+        HANDLE_ERROR(cutensornetWorkspaceSet(handle,
+                                             workDesc,
+                                             CUTENSORNET_MEMSPACE_DEVICE,
+                                             work,
+                                             worksize));
+
+        HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
+                                                       descNet,
+                                                       optimizerInfo,
+                                                       workDesc,
+                                                       &plan) );
+
+        /*******************************
+        * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
+        *******************************/
+        cutensornetContractionAutotunePreference_t autotunePref;
+        HANDLE_ERROR(cutensornetCreateContractionAutotunePreference(handle,
+                                                                    &autotunePref));
+
+        const int numAutotuningIterations = 5; // may be 0
+        HANDLE_ERROR(cutensornetContractionAutotunePreferenceSetAttribute(
+            handle,
+            autotunePref,
+            CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
+            &numAutotuningIterations,
+            sizeof(numAutotuningIterations)));
+
+        // modify the plan again to find the best pair-wise contractions
+        HANDLE_ERROR(cutensornetContractionAutotune(handle,
+                                                    plan,
+                                                    rawDataIn_d,
+                                                    D_d,
+                                                    workDesc,
+                                                    autotunePref,
+                                                    stream));
+
+        HANDLE_ERROR(cutensornetDestroyContractionAutotunePreference(autotunePref));
+
+        printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");
+
+        /**********************
+        * Run
+        **********************/
+        GPUTimer timer;
+        double minTimeCUTENSOR = 1e100;
+        const int numRuns = 3; // to get stable perf results
+        for (int i=0; i < numRuns; ++i)
         {
-            timer.start();
-
-            HANDLE_ERROR(cutensornetContraction(handle,
-                                                plan,
-                                                rawDataIn_d,
-                                                D_d,
-                                                work, worksize, sliceId, stream));
-
-            // Synchronize and measure timing
-            auto time = timer.seconds();
-            minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
+            cudaMemcpy(D_d, D, sizeD, cudaMemcpyHostToDevice); // restore output
+            cudaDeviceSynchronize();
+
+            /*
+            * Contract over all slices.
+            *
+            * A user may choose to parallelize this loop across multiple devices.
+            */
+            for(int64_t sliceId=0; sliceId < numSlices; ++sliceId)
+            {
+                timer.start();
+
+                HANDLE_ERROR(cutensornetContraction(handle,
+                                                    plan,
+                                                    rawDataIn_d,
+                                                    D_d,
+                                                    workDesc, sliceId, stream));
+
+                // Synchronize and measure timing
+                auto time = timer.seconds();
+                minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
+            }
         }
-    }
-    printf("Contract the network, each slice uses the same contraction plan.\n");
+        printf("Contract the network, each slice uses the same contraction plan.\n");
 
-    /*************************/
+        /*************************/
 
-    double flops = -1;
+        double flops = -1;
 
-    HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
-                  handle,
-                  optimizerInfo,
-                  CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
-                  &flops,
-                  sizeof(flops)));
+        HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
+                      handle,
+                      optimizerInfo,
+                      CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
+                      &flops,
+                      sizeof(flops)));
 
-    printf("numSlices: %ld\n", numSlices);
-    printf("%.2f ms / slice\n", minTimeCUTENSOR * 1000.f);
-    printf("%.2f GFLOPS/s\n", flops/1e9/minTimeCUTENSOR );
+        printf("numSlices: %ld\n", numSlices);
+        printf("%.2f ms / slice\n", minTimeCUTENSOR * 1000.f);
+        printf("%.2f GFLOPS/s\n", flops/1e9/minTimeCUTENSOR);
+    }
 
     HANDLE_ERROR(cutensornetDestroy(handle));
     HANDLE_ERROR(cutensornetDestroyNetworkDescriptor(descNet));
     HANDLE_ERROR(cutensornetDestroyContractionPlan(plan));
     HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
     HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
+    HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
 
     if (A) free(A);
     if (B) free(B);
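The workspace-descriptor handshake in the diff above (compute the sizes, query the minimum, compare it against the preallocated buffer, then attach the buffer) is the part of this patch that replaces the old raw `work`/`worksize` arguments. A minimal sketch of that handshake, extracted into a hypothetical `attachWorkspace` helper (the name is illustrative; status checks are omitted for brevity):

```
#include <cstdio>
#include <cstdint>
#include <cutensornet.h>

// Sketch only: mirrors the sample's workspace check. Returns false when the
// preallocated device buffer is smaller than the minimal required workspace.
bool attachWorkspace(cutensornetHandle_t handle,
                     cutensornetNetworkDescriptor_t descNet,
                     cutensornetContractionOptimizerInfo_t optimizerInfo,
                     cutensornetWorkspaceDescriptor_t workDesc,
                     void* work, uint64_t worksize)
{
    // Let cuTensorNet compute the workspace requirements for this network/path.
    cutensornetWorkspaceComputeSizes(handle, descNet, optimizerInfo, workDesc);

    // Query the minimal device workspace size the contraction needs.
    uint64_t required = 0;
    cutensornetWorkspaceGetSize(handle, workDesc,
                                CUTENSORNET_WORKSIZE_PREF_MIN,
                                CUTENSORNET_MEMSPACE_DEVICE, &required);
    if (worksize < required) {
        printf("Not enough workspace memory is available.\n");
        return false;
    }

    // Hand the user-allocated device buffer to cuTensorNet via the descriptor.
    cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
                            work, worksize);
    return true;
}
```

Once attached, the same `workDesc` is passed to plan creation, autotuning, and every `cutensornetContraction` call, as the diff above shows.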